BasilTh committed on
Commit 77b14f6 · 1 Parent(s): bbe7b0d

Deploy updated SLM customer-support chatbot

Files changed (3):
  1. SLM_CService.py  +144 -71
  2. app.py           +12 -7
  3. requirements.txt +3 -2
SLM_CService.py CHANGED
@@ -1,60 +1,121 @@
  # ─── SLM_CService.py ─────────────────────────────────────────────────────────
- import os
- # Fix for libgomp warning in Spaces
- os.environ["OMP_NUM_THREADS"] = "1"

- # 1) Unsloth must come first
- import unsloth
  import torch

  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
  from peft import PeftModel

- # 2) Simple in-memory convo buffer
- #    we keep alternating (user, assistant) tuples
- conversation_history = []
-
- # 3) Model + adapter path in your repo (copied into the Space repo root)
- MODEL_DIR = "ThomasBasil/bitext-qlora-tinyllama"
-
- # 4) Load tokenizer from local dir
- tokenizer = AutoTokenizer.from_pretrained(
-     "ThomasBasil/bitext-qlora-tinyllama", use_fast=False
- )
- tokenizer.pad_token_id = tokenizer.eos_token_id
- tokenizer.padding_side = "left"
  tokenizer.truncation_side = "right"

- # 5) QLoRA + Unsloth load in 4-bit
  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
-     bnb_4bit_compute_dtype=torch.bfloat16
  )
- # 5a) Base model
  model = unsloth.FastLanguageModel.from_pretrained(
-     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
      load_in_4bit=True,
-     quantization_config=bnb_cfg,
      device_map="auto",
-     trust_remote_code=True
  )
- # 5b) Attach your LoRA adapter
- model = PeftModel.from_pretrained(model, "ThomasBasil/bitext-qlora-tinyllama")

- # 6) HF text-gen pipeline
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
-     generate_kwargs={"max_new_tokens":128, "do_sample":True, "top_p":0.9, "temperature":0.7}
  )

- # 7) FSM helpers (your existing code unmodified)
- import re
  order_re = re.compile(r"#(\d{1,10})")
  def extract_order(text: str):
      m = order_re.search(text)
@@ -65,70 +126,82 @@ def handle_eta(o): return f"Delivery for order #{o} typically takes 3–5 day
  def handle_track(o): return f"Track order #{o} here: https://track.example.com/{o}"
  def handle_link(o): return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"
  def handle_return_policy(_=None):
-     return ("Our return policy allows returns of unused items in their original packaging "
-             "within 30 days of receipt. Would you like me to connect you with a human agent?")
  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
      return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

- # 8) Core chat fn
  stored_order = None
  pending_intent = None
  def chat_with_memory(user_input: str) -> str:
      global stored_order, pending_intent

-     # A) Save into history
-     conversation_history.append(("User", user_input))

-     # B) New order?
-     new_o = extract_order(user_input)
      if new_o:
          stored_order = new_o
-         if pending_intent in ("status","eta","track","link"):
-             fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[pending_intent]
              reply = fn(stored_order)
              pending_intent = None
-             conversation_history.append(("Assistant", reply))
              return reply

-     ui = user_input.lower().strip()
-
-     # C) Gratitude
-     if any(tok in ui for tok in ["thank you","thanks","thx"]):
-         reply = handle_gratitude()
-         conversation_history.append(("Assistant", reply))
-         return reply
-
-     # D) Return policy
-     if "return" in ui:
-         reply = handle_return_policy()
-         conversation_history.append(("Assistant", reply))
-         return reply
-
-     # E) Classify intent
-     if any(k in ui for k in ["status","where is my order","check status"]):
-         intent="status"
-     elif any(k in ui for k in ["how long","eta","delivery time"]):
-         intent="eta"
-     elif any(k in ui for k in ["how can i track","track my order","where is my package"]):
-         intent="track"
-     elif "tracking link" in ui or "resend" in ui:
-         intent="link"
      else:
-         intent="fallback"

-     # F) Fulfill or ask order #
-     if intent in ("status","eta","track","link"):
          if not stored_order:
              pending_intent = intent
              reply = "Sure—what’s your order number (e.g., #12345)?"
          else:
-             fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[intent]
              reply = fn(stored_order)
-     else:
-         reply = handle_escalation()

-     # G) Save & done
-     conversation_history.append(("Assistant", reply))
      return reply
 
  # ─── SLM_CService.py ─────────────────────────────────────────────────────────
+ # Launch-time model setup + FSM + conversational memory for the chatbot.

+ import os, shutil, zipfile
+ os.environ["OMP_NUM_THREADS"] = "1"     # quiet libgomp noise
+ os.environ.pop("HF_HUB_OFFLINE", None)  # avoid accidental offline mode
+
+ # 1) Unsloth must be imported before transformers
+ import unsloth
  import torch

  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
  from peft import PeftModel
+ from langchain.memory import ConversationBufferMemory
+ import gdown
+ import re

+ # ── Persistent storage (HF Spaces -> Settings -> Persistent storage) ─────────
+ # Docs: /data persists across Space restarts (see the HF persistent-storage docs).
+ PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data/slm_assets")
+ ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapter")
+ TOKENIZER_DIR = os.path.join(PERSIST_DIR, "tokenizer")
+ ZIP_PATH = os.path.join(PERSIST_DIR, "assets.zip")
+
+ # ── Provide Google Drive IDs as Secrets (HF Space -> Settings -> Variables) ──
+ # Either one zip with both folders...
+ GDRIVE_ZIP_ID = os.environ.get("GDRIVE_ZIP_ID")
+ # ...or separate zips/files for each:
+ GDRIVE_ADAPTER_ID = os.environ.get("GDRIVE_ADAPTER_ID")
+ GDRIVE_TOKENIZER_ID = os.environ.get("GDRIVE_TOKENIZER_ID")
+
+ def _ensure_dirs():
+     os.makedirs(PERSIST_DIR, exist_ok=True)
+     os.makedirs(ADAPTER_DIR, exist_ok=True)
+     os.makedirs(TOKENIZER_DIR, exist_ok=True)
+
+ def _have_local_assets():
+     # minimal sanity checks for typical PEFT/tokenizer files
+     tok_ok = any(os.path.exists(os.path.join(TOKENIZER_DIR, f))
+                  for f in ("tokenizer.json", "tokenizer.model", "tokenizer_config.json"))
+     lora_ok = any(os.path.exists(os.path.join(ADAPTER_DIR, f))
+                   for f in ("adapter_config.json", "adapter_model.bin", "adapter_model.safetensors"))
+     return tok_ok and lora_ok
+
+ def _download_from_drive():
+     """Download adapter/tokenizer from Google Drive into /data using gdown."""
+     _ensure_dirs()
+     if GDRIVE_ZIP_ID:
+         gdown.download(id=GDRIVE_ZIP_ID, output=ZIP_PATH, quiet=False)  # gdown is built for Drive
+         with zipfile.ZipFile(ZIP_PATH, "r") as zf:
+             zf.extractall(PERSIST_DIR)
+         return
+
+     if GDRIVE_ADAPTER_ID:
+         ad_zip = os.path.join(PERSIST_DIR, "adapter.zip")
+         gdown.download(id=GDRIVE_ADAPTER_ID, output=ad_zip, quiet=False)
+         try:
+             with zipfile.ZipFile(ad_zip, "r") as zf:
+                 zf.extractall(ADAPTER_DIR)
+         except zipfile.BadZipFile:
+             # not a zip – assume single file
+             shutil.move(ad_zip, os.path.join(ADAPTER_DIR, "adapter_model.bin"))
+
+     if GDRIVE_TOKENIZER_ID:
+         tk_zip = os.path.join(PERSIST_DIR, "tokenizer.zip")
+         gdown.download(id=GDRIVE_TOKENIZER_ID, output=tk_zip, quiet=False)
+         try:
+             with zipfile.ZipFile(tk_zip, "r") as zf:
+                 zf.extractall(TOKENIZER_DIR)
+         except zipfile.BadZipFile:
+             shutil.move(tk_zip, os.path.join(TOKENIZER_DIR, "tokenizer.json"))
+
+ # ── Ensure local assets from Drive (first launch will download) ──────────────
+ if not _have_local_assets():
+     _download_from_drive()  # persists in /data if persistent storage is enabled
+
+ # ── Tokenizer (from your Drive-backed folder) ────────────────────────────────
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, use_fast=False)
+ tokenizer.pad_token_id = tokenizer.eos_token_id
+ tokenizer.padding_side = "left"
  tokenizer.truncation_side = "right"

+ # ── Base model (4-bit) via Unsloth + your PEFT adapter ──────────────────────
+ BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=torch.bfloat16,
  )
+
  model = unsloth.FastLanguageModel.from_pretrained(
+     BASE,
      load_in_4bit=True,
+     quantization_config=bnb_cfg,  # prefer quantization_config over quant_type
      device_map="auto",
+     trust_remote_code=True,
  )

+ model = PeftModel.from_pretrained(model, ADAPTER_DIR)
+
+ # ── Text-generation pipeline (use generate_kwargs, not generation_kwargs) ────
+ # Transformers pipelines accept `generate_kwargs` to forward to .generate().
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
+     generate_kwargs={"max_new_tokens": 128, "do_sample": True, "top_p": 0.9, "temperature": 0.7},
  )

+ # ── Conversational memory (LangChain) ────────────────────────────────────────
+ # ConversationBufferMemory stores full turn-by-turn chat history.
+ memory = ConversationBufferMemory(return_messages=True)
+
+ # ── FSM helpers (your original logic, kept intact) ───────────────────────────
  order_re = re.compile(r"#(\d{1,10})")
  def extract_order(text: str):
      m = order_re.search(text)

  def handle_track(o): return f"Track order #{o} here: https://track.example.com/{o}"
  def handle_link(o): return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"
  def handle_return_policy(_=None):
+     return ("Our return policy allows returns of unused items in their original packaging within 30 days of receipt. "
+             "Would you like me to connect you with a human agent?")
  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
      return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

  stored_order = None
  pending_intent = None
+
+ def _history_to_prompt(user_input: str) -> str:
+     """Build a prompt from LangChain memory turns for fallback generation."""
+     hist = memory.load_memory_variables({}).get("history", [])  # default ConversationBufferMemory key is "history"
+     prompt = "You are a helpful support assistant.\n"
+     for msg in hist:
+         # LangChain messages expose a .type like 'human'/'ai' in many versions
+         mtype = getattr(msg, "type", "")
+         role = "User" if mtype == "human" else "Assistant"
+         content = getattr(msg, "content", "")
+         prompt += f"{role}: {content}\n"
+     prompt += f"User: {user_input}\nAssistant: "
+     return prompt
+
  def chat_with_memory(user_input: str) -> str:
      global stored_order, pending_intent

+     ui = user_input.strip()
+     low = ui.lower()

+     # A) quick intent short-circuits
+     if any(tok in low for tok in ["thank you", "thanks", "thx"]):
+         reply = handle_gratitude()
+         memory.save_context({"input": ui}, {"output": reply})
+         return reply
+     if "return" in low:
+         reply = handle_return_policy()
+         memory.save_context({"input": ui}, {"output": reply})
+         return reply
+
+     # B) order number?
+     new_o = extract_order(ui)
      if new_o:
          stored_order = new_o
+         if pending_intent in ("status", "eta", "track", "link"):
+             fn = {"status": handle_status, "eta": handle_eta, "track": handle_track, "link": handle_link}[pending_intent]
              reply = fn(stored_order)
              pending_intent = None
+             memory.save_context({"input": ui}, {"output": reply})
              return reply

+     # C) intent classification
+     if any(k in low for k in ["status", "where is my order", "check status"]):
+         intent = "status"
+     elif any(k in low for k in ["how long", "eta", "delivery time"]):
+         intent = "eta"
+     elif any(k in low for k in ["how can i track", "track my order", "where is my package"]):
+         intent = "track"
+     elif "tracking link" in low or "resend" in low:
+         intent = "link"
      else:
+         intent = "fallback"

+     # D) handle core intents (ask for order first if needed)
+     if intent in ("status", "eta", "track", "link"):
          if not stored_order:
              pending_intent = intent
              reply = "Sure—what’s your order number (e.g., #12345)?"
          else:
+             fn = {"status": handle_status, "eta": handle_eta, "track": handle_track, "link": handle_link}[intent]
              reply = fn(stored_order)
+         memory.save_context({"input": ui}, {"output": reply})
+         return reply

+     # E) fallback → generate with chat history context
+     prompt = _history_to_prompt(ui)
+     out = chat_pipe(prompt)[0]["generated_text"]
+     reply = out.split("Assistant:")[-1].strip()
+     memory.save_context({"input": ui}, {"output": reply})
      return reply
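
Note: a minimal way to exercise the FSM path above locally is sketched below. It assumes the model assets are reachable, since importing SLM_CService loads the tokenizer and model at import time; the file name and prompts are hypothetical.

# smoke_test.py - hypothetical local check, not part of this commit
from SLM_CService import chat_with_memory

print(chat_with_memory("Where is my order?"))  # no order stored yet -> bot asks for an order number
print(chat_with_memory("It's #12345"))         # fulfills the pending "status" intent via handle_status
print(chat_with_memory("thanks"))              # gratitude short-circuit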
app.py CHANGED
@@ -1,23 +1,28 @@
  import os
- os.environ["OMP_NUM_THREADS"] = "1"  # Silence Gradio startup warning

  import gradio as gr
  from SLM_CService import chat_with_memory

  def respond(user_message, history):
      bot_reply = chat_with_memory(user_message)
-     history = history + [(user_message, bot_reply)]
      return history, history

  with gr.Blocks() as demo:
      gr.Markdown("# 🛎 Customer Support Chatbot")
-     chatbot = gr.Chatbot()  # Replaces ChatInterface/FileMessage/TextMessage
      with gr.Row():
-         user_in = gr.Textbox(placeholder="Type your message here...")
-         submit = gr.Button("Send")
-         reset = gr.Button("🔄 Reset Chat")
-     submit.click(respond, [user_in, chatbot], [chatbot, chatbot])
      reset.click(lambda: ([], []), None, [chatbot, chatbot])

  if __name__ == "__main__":
      demo.launch(server_name="0.0.0.0", server_port=7860)
 
  import os
+ os.environ["OMP_NUM_THREADS"] = "1"  # silence OpenMP spam

  import gradio as gr
  from SLM_CService import chat_with_memory

  def respond(user_message, history):
+     if not user_message:
+         return history, history
      bot_reply = chat_with_memory(user_message)
+     history = (history or []) + [(user_message, bot_reply)]
      return history, history

  with gr.Blocks() as demo:
      gr.Markdown("# 🛎 Customer Support Chatbot")
+     chatbot = gr.Chatbot()
      with gr.Row():
+         user_in = gr.Textbox(placeholder="Type your message here...", scale=5)
+         send = gr.Button("Send", variant="primary")
+         reset = gr.Button("🔄 Reset Chat")
+
+     send.click(respond, [user_in, chatbot], [chatbot, chatbot])
      reset.click(lambda: ([], []), None, [chatbot, chatbot])
+     # Optional: submit on enter
+     user_in.submit(respond, [user_in, chatbot], [chatbot, chatbot])

  if __name__ == "__main__":
      demo.launch(server_name="0.0.0.0", server_port=7860)
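
Note: respond() keeps Gradio's tuple-style chat history, which gr.Chatbot() still accepts in the pinned 5.x release; newer Gradio prefers the "messages" format. If the Space is later switched over, the handler might look roughly like this sketch (hypothetical variant, not part of this commit):

def respond_messages(user_message, history):
    if not user_message:
        return history, history
    bot_reply = chat_with_memory(user_message)
    history = (history or []) + [
        {"role": "user", "content": user_message},       # messages format: dicts with role/content
        {"role": "assistant", "content": bot_reply},
    ]
    return history, history

# wired the same way, but with: chatbot = gr.Chatbot(type="messages")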
requirements.txt CHANGED
@@ -1,11 +1,12 @@
- gradio==5.41.1  # Matches Spaces SDK version
  transformers
  torch
  sentencepiece
- langchain  # Required for ConversationBufferMemory
  bitsandbytes
  peft
  xformers
  unsloth
  unsloth_zoo
  huggingface_hub
 
 
+ gradio==5.41.1
  transformers
  torch
  sentencepiece
+ langchain
  bitsandbytes
  peft
  xformers
  unsloth
  unsloth_zoo
  huggingface_hub
+ gdown
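
Note: before wiring GDRIVE_ZIP_ID into the Space's Secrets, the Drive zip layout can be sanity-checked with a small standalone script that mirrors the download logic in SLM_CService.py. The script name is hypothetical; it reuses the same PERSIST_DIR / GDRIVE_ZIP_ID environment variables and the gdown/zipfile calls shown above.

# check_drive_assets.py - hypothetical helper, not part of this commit
import os, zipfile
import gdown

persist = os.environ.get("PERSIST_DIR", "./slm_assets")  # local stand-in for /data/slm_assets
os.makedirs(persist, exist_ok=True)

zip_path = os.path.join(persist, "assets.zip")
gdown.download(id=os.environ["GDRIVE_ZIP_ID"], output=zip_path, quiet=False)
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall(persist)

print(sorted(os.listdir(persist)))  # expect the adapter/ and tokenizer/ folders the module checks for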