BasilTh committed
Commit f3b040f · 1 Parent(s): 4365d85

Deploy updated SLM customer-support chatbot

Files changed (4):
  1. README.md +4 -7
  2. SLM_CService.py +97 -76
  3. app.py +11 -76
  4. requirements.txt +1 -1
README.md CHANGED
@@ -1,11 +1,8 @@
  ---
- license: mit
- title: Customer Support Chatbot
- sdk: gradio
- sdk_version: "5.41.1"
- emoji: 🚀
+ title: "Customer Support Chatbot"
+ emoji: "🛎"
  colorFrom: blue
  colorTo: purple
- pinned: false
+ sdk: gradio
+ sdk_version: "5.41.1"
  app_file: app.py
- ---
SLM_CService.py CHANGED
@@ -1,82 +1,90 @@
+ # ─── SLM_CService.py ─────────────────────────────────────────────────────────
  import os
- os.environ["OMP_NUM_THREADS"] = "1"
- import unsloth # patch before transformers
- import torch, triton
- from transformers import pipeline, AutoTokenizer
- from langchain.memory import ConversationBufferMemory
- import re
- from peft import PeftModel # if you attach adapters
-
- # ——— Model & Tokenizer Loading (example) ——————————————————
- # adjust FINETUNED_DIR to your drive path or HF path
- FINETUNED_DIR = "/content/drive/MyDrive/bitext-qlora-tinyllama"
- bnb_cfg = {
-     "load_in_4bit": True,
-     "bnb_4bit_quant_type": "nf4",
-     "bnb_4bit_use_double_quant": True,
-     "bnb_4bit_compute_dtype": torch.bfloat16
- }
-
- # 1) Load tokenizer & model
- tokenizer = AutoTokenizer.from_pretrained(FINETUNED_DIR, use_fast=False)
+ # Fix for libgomp warning in Spaces
+ os.environ.pop("OMP_NUM_THREADS", None)
+
+ # 1) Unsloth must come first
+ import unsloth
+ import triton
+ import torch
+
+ from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
+ from peft import PeftModel
+
+ # 2) Simple in-memory convo buffer
+ #    we keep alternating (user, assistant) tuples
+ conversation_history = []
+
+ # 3) Model + adapter path in your repo (copied into the Space repo root)
+ MODEL_DIR = "bitext-qlora-tinyllama"
+
+ # 4) Load tokenizer from local dir
+ tokenizer = AutoTokenizer.from_pretrained(
+     MODEL_DIR,
+     use_fast=False,
+     local_files_only=True
+ )
  tokenizer.pad_token_id = tokenizer.eos_token_id
  tokenizer.padding_side = "left"
  tokenizer.truncation_side = "right"

- model = PeftModel.from_pretrained(
-     # base model loaded via unsloth
-     unsloth.FastLanguageModel.from_pretrained(
-         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-         quant_type="nf4",
-         load_in_4bit=True,
-         device_map="auto",
-         trust_remote_code=True
-     ),
-     FINETUNED_DIR,
-     local_files_only=True
+ # 5) QLoRA + Unsloth load in 4-bit
+ bnb_cfg = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
  )
+ # 5a) Base model
+ model = unsloth.FastLanguageModel.from_pretrained(
+     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+     load_in_4bit=True,
+     quant_type="nf4",
+     device_map="auto",
+     trust_remote_code=True
+ )
+ # 5b) Attach your LoRA adapter
+ model = PeftModel.from_pretrained(model, MODEL_DIR, local_files_only=True)

- # 2) Build a Gradio-style pipeline
+ # 6) HF text-gen pipeline
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
-     generation_kwargs={"max_new_tokens": 256, "do_sample": True, "top_p": 0.9, "temperature": 0.7}
- )
-
- # ——— Conversation Memory Setup ————————————————————————
- memory = ConversationBufferMemory(
-     memory_key="chat_history",
-     human_prefix="User",
-     ai_prefix="Assistant",
-     return_messages=False
+     generation_kwargs={"max_new_tokens":128, "do_sample":True, "top_p":0.9, "temperature":0.7}
  )

- # ——— Intent Handlers ————————————————————————————————
+ # 7) FSM helpers (your existing code unmodified)
+ import re
  order_re = re.compile(r"#(\d{1,10})")
- stored_order = None
- pending_intent = None
-
  def extract_order(text: str):
      m = order_re.search(text)
      return m.group(1) if m else None

- def handle_status(o): return f"Order #{o} is in transit and should arrive in 3–5 days."
- def handle_eta(o): return f"Delivery for order #{o} typically takes 3–5 days; track at https://track.example.com/{o}"
+ def handle_status(o): return f"Order #{o} is in transit and should arrive in 3–5 business days."
+ def handle_eta(o): return f"Delivery for order #{o} typically takes 3–5 days; you can track it at https://track.example.com/{o}"
  def handle_track(o): return f"Track order #{o} here: https://track.example.com/{o}"
- def handle_link(o): return f"Here’s the latest link for order #{o}: https://track.example.com/{o}"
- def handle_return(_): return "Our return policy: unused items in original packaging within 30 days. Need human agent?"
- def handle_thanks(_): return "You’re welcome! Anything else I can do?"
- def handle_escalate(_): return "I’m sorry, I don’t have that info. Connect you with a human agent?"
+ def handle_link(o): return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"
+ def handle_return_policy(_=None):
+     return ("Our return policy allows returns of unused items in their original packaging "
+             "within 30 days of receipt. Would you like me to connect you with a human agent?")
+ def handle_gratitude(_=None):
+     return "You’re welcome! Is there anything else I can help with?"
+ def handle_escalation(_=None):
+     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

- # ——— Main Chat Function ————————————————————————————
+ # 8) Core chat fn
+ stored_order = None
+ pending_intent = None
  def chat_with_memory(user_input: str) -> str:
      global stored_order, pending_intent
-     memory.save_context({"input": user_input}, {"output": ""})

-     # capture fresh order # and fulfill pending
+     # A) Save into history
+     conversation_history.append(("User", user_input))
+
+     # B) New order?
      new_o = extract_order(user_input)
      if new_o:
          stored_order = new_o
@@ -84,33 +92,46 @@ def chat_with_memory(user_input: str) -> str:
              fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[pending_intent]
              reply = fn(stored_order)
              pending_intent = None
-             memory.save_context({"input": user_input}, {"output": reply})
+             conversation_history.append(("Assistant", reply))
              return reply

      ui = user_input.lower().strip()
-     # gratitude
+
+     # C) Gratitude
      if any(tok in ui for tok in ["thank you","thanks","thx"]):
-         reply = handle_thanks(None)
-     # return policy
-     elif "return" in ui:
-         reply = handle_return(None)
+         reply = handle_gratitude()
+         conversation_history.append(("Assistant", reply))
+         return reply
+
+     # D) Return policy
+     if "return" in ui:
+         reply = handle_return_policy()
+         conversation_history.append(("Assistant", reply))
+         return reply
+
+     # E) Classify intent
+     if any(k in ui for k in ["status","where is my order","check status"]):
+         intent="status"
+     elif any(k in ui for k in ["how long","eta","delivery time"]):
+         intent="eta"
+     elif any(k in ui for k in ["how can i track","track my order","where is my package"]):
+         intent="track"
+     elif "tracking link" in ui or "resend" in ui:
+         intent="link"
      else:
-         # intents
-         if any(k in ui for k in ["status","where is my order"]): intent="status"
-         elif any(k in ui for k in ["how long","eta"]): intent="eta"
-         elif any(k in ui for k in ["track","where is my package"]): intent="track"
-         elif any(k in ui for k in ["tracking link","resend"]): intent="link"
-         else: intent="escalate"
-
-         if intent in ("status","eta","track","link"):
-             if not stored_order:
-                 pending_intent = intent
-                 reply = "Sure—what’s your order number (e.g. #12345)?"
-             else:
-                 fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[intent]
-                 reply = fn(stored_order)
+         intent="fallback"
+
+     # F) Fulfill or ask order #
+     if intent in ("status","eta","track","link"):
+         if not stored_order:
+             pending_intent = intent
+             reply = "Sure—what’s your order number (e.g., #12345)?"
          else:
-             reply = handle_escalate(None)
+             fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[intent]
+             reply = fn(stored_order)
+     else:
+         reply = handle_escalation()

-     memory.save_context({"input": user_input}, {"output": reply})
+     # G) Save & done
+     conversation_history.append(("Assistant", reply))
      return reply
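
For context, a minimal sketch of how the new FSM-style chat_with_memory is meant to behave. This is illustrative only, not part of the commit; importing SLM_CService loads the 4-bit model, so it assumes the bitext-qlora-tinyllama adapter and its GPU dependencies are available locally.

    from SLM_CService import chat_with_memory, conversation_history

    print(chat_with_memory("Where is my order?"))  # no order number stored yet -> bot asks for one
    print(chat_with_memory("#12345"))              # fulfills the pending "status" intent
    print(chat_with_memory("thanks"))              # gratitude branch
    print(conversation_history)                    # alternating ("User"/"Assistant", text) tuples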
app.py CHANGED
@@ -1,81 +1,16 @@
- # app.py
+ # ─── app.py ─────────────────────────────────────────────────────────────────
  import os
- # ── suppress libgomp warnings ─────────────────────────────────────────────
- os.environ["OMP_NUM_THREADS"] = "1"
-
- # ── patch Unsloth before transformers ────────────────────────────────────
- import unsloth
- import torch
- import triton
-
- # ── Gradio & model deps ──────────────────────────────────────────────────
- import gradio as gr
- from transformers import pipeline, AutoTokenizer, BitsAndBytesConfig
- from peft import PeftModel
- from langchain.memory import ConversationBufferMemory
-
- # ── Load your fine-tuned QLoRA model ────────────────────────────────────
- BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
- ADAPTER_DIR = "tinyllama-qlora-adapters" # or wherever you stored your adapters
-
- bnb_cfg = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type="bnb_dynamic",
-     bnb_4bit_use_double_quant=True
- )
-
- tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False, local_files_only=True)
- tokenizer.pad_token_id = tokenizer.eos_token_id
-
- model = unsloth.FastLanguageModel.from_pretrained(
-     BASE,
-     load_in_4bit=True,
-     quant_type="bnb_dynamic",
-     device_map="auto",
-     trust_remote_code=True
- )
- model = PeftModel.from_pretrained(model, ADAPTER_DIR, local_files_only=True)
-
- chat_pipe = pipeline(
-     "conversational",
-     model=model,
-     tokenizer=tokenizer,
-     trust_remote_code=True,
-     return_full_text=False,
-     device_map="auto",
-     generation_kwargs={"max_new_tokens":128, "do_sample":True, "top_p":0.9, "temperature":0.7}
- )
-
- # ── Conversational memory ────────────────────────────────────────────────
- memory = ConversationBufferMemory(
-     memory_key="chat_history",
-     human_prefix="User",
-     ai_prefix="Assistant",
-     return_messages=True
- )
+ from gradio import ChatInterface, FileMessage, TextMessage
+ from SLM_CService import chat_with_memory, conversation_history

+ # Gradio chat UI
  def respond(user_message, history):
-     # reconstruct history for LangChain
-     for msg in history:
-         if msg["role"] == "user":
-             chat_pipe.add_user_input(msg["content"])
-         else:
-             chat_pipe.append_response(msg["content"])
-     # get new reply
-     reply = chat_pipe(user_message).generated_responses[-1]
-     # save into memory
-     memory.save_context({"input": user_message}, {"output": reply})
-     return reply, memory.load_memory_variables({})["chat_history"]
+     reply = chat_with_memory(user_message)
+     # Gradio expects list of lists
+     return history + [(user_message, reply)]

- # ── Gradio UI ────────────────────────────────────────────────────────────
- with gr.Blocks() as demo:
-     gr.Markdown("# 🛎 Customer Support Chatbot")
-     chatbot = gr.Chatbot()
-     with gr.Row():
-         user_in = gr.Textbox(placeholder="Type your message here…")
-         send = gr.Button("Send")
-         reset = gr.Button("🔄 Reset Chat")
-     send.click(respond, [user_in, chatbot], [chatbot, chatbot])
-     reset.click(lambda: ([], []), None, [chatbot, chatbot])
+ iface = ChatInterface(fn=respond,
+                       title="SLM Customer Support Bot",
+                       description="A quantized TinyLlama+QLoRA chatbot with simple memory.")
+ iface.launch()

- demo.launch(server_name="0.0.0.0", server_port=7860)
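
A note on the Gradio wiring: gr.ChatInterface conventionally calls fn(message, history) and renders whatever fn returns as the assistant reply, and the FileMessage/TextMessage imports are not used anywhere in this flow. A leaner hook-up under that assumption, offered as a sketch rather than the committed code, could look like:

    import gradio as gr
    from SLM_CService import chat_with_memory

    def respond(user_message, history):
        # conversation state already lives in SLM_CService.conversation_history,
        # so only the new reply string is returned to the UI
        return chat_with_memory(user_message)

    gr.ChatInterface(fn=respond,
                     title="SLM Customer Support Bot",
                     description="A quantized TinyLlama+QLoRA chatbot with simple memory.").launch()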
requirements.txt CHANGED
@@ -8,4 +8,4 @@ unsloth_zoo
  huggingface_hub
  sentencepiece
  torch
- langchain>=0.0.250
+ langchain