BasilTh committed
Commit f3b040f · 1 Parent(s): 4365d85

Deploy updated SLM customer-support chatbot

Files changed (4):
  1. README.md +4 -7
  2. SLM_CService.py +97 -76
  3. app.py +11 -76
  4. requirements.txt +1 -1
README.md CHANGED
@@ -1,11 +1,8 @@
  ---
- license: mit
- title: Customer Support Chatbot
- sdk: gradio
- sdk_version: "5.41.1"
- emoji: 🚀
+ title: "Customer Support Chatbot"
+ emoji: "🛎"
  colorFrom: blue
  colorTo: purple
- pinned: false
+ sdk: gradio
+ sdk_version: "5.41.1"
  app_file: app.py
- ---
SLM_CService.py CHANGED
@@ -1,82 +1,90 @@
+ # ─── SLM_CService.py ─────────────────────────────────────────────────────────
  import os
- os.environ["OMP_NUM_THREADS"] = "1"
- import unsloth # patch before transformers
- import torch, triton
- from transformers import pipeline, AutoTokenizer
- from langchain.memory import ConversationBufferMemory
- import re
- from peft import PeftModel # if you attach adapters
-
- # ——— Model & Tokenizer Loading (example) ——————————————————
- # adjust FINETUNED_DIR to your drive path or HF path
- FINETUNED_DIR = "/content/drive/MyDrive/bitext-qlora-tinyllama"
- bnb_cfg = {
-     "load_in_4bit": True,
-     "bnb_4bit_quant_type": "nf4",
-     "bnb_4bit_use_double_quant": True,
-     "bnb_4bit_compute_dtype": torch.bfloat16
- }
-
- # 1) Load tokenizer & model
- tokenizer = AutoTokenizer.from_pretrained(FINETUNED_DIR, use_fast=False)
+ # Fix for libgomp warning in Spaces
+ os.environ.pop("OMP_NUM_THREADS", None)
+
+ # 1) Unsloth must come first
+ import unsloth
+ import triton
+ import torch
+
+ from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
+ from peft import PeftModel
+
+ # 2) Simple in-memory convo buffer
+ #    we keep alternating (user, assistant) tuples
+ conversation_history = []
+
+ # 3) Model + adapter path in your repo (copied into the Space repo root)
+ MODEL_DIR = "bitext-qlora-tinyllama"
+
+ # 4) Load tokenizer from local dir
+ tokenizer = AutoTokenizer.from_pretrained(
+     MODEL_DIR,
+     use_fast=False,
+     local_files_only=True
+ )
  tokenizer.pad_token_id = tokenizer.eos_token_id
  tokenizer.padding_side = "left"
  tokenizer.truncation_side = "right"

- model = PeftModel.from_pretrained(
-     # base model loaded via unsloth
-     unsloth.FastLanguageModel.from_pretrained(
-         "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-         quant_type="nf4",
-         load_in_4bit=True,
-         device_map="auto",
-         trust_remote_code=True
-     ),
-     FINETUNED_DIR,
-     local_files_only=True
+ # 5) QLoRA + Unsloth load in 4-bit
+ bnb_cfg = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
  )
+ # 5a) Base model
+ model = unsloth.FastLanguageModel.from_pretrained(
+     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+     load_in_4bit=True,
+     quant_type="nf4",
+     device_map="auto",
+     trust_remote_code=True
+ )
+ # 5b) Attach your LoRA adapter
+ model = PeftModel.from_pretrained(model, MODEL_DIR, local_files_only=True)

- # 2) Build a Gradio-style pipeline
+ # 6) HF text-gen pipeline
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
-     generation_kwargs={"max_new_tokens": 256, "do_sample": True, "top_p": 0.9, "temperature": 0.7}
- )
-
- # ——— Conversation Memory Setup ————————————————————————
- memory = ConversationBufferMemory(
-     memory_key="chat_history",
-     human_prefix="User",
-     ai_prefix="Assistant",
-     return_messages=False
+     generation_kwargs={"max_new_tokens":128, "do_sample":True, "top_p":0.9, "temperature":0.7}
  )

- # ——— Intent Handlers ————————————————————————————————
+ # 7) FSM helpers (your existing code unmodified)
+ import re
  order_re = re.compile(r"#(\d{1,10})")
- stored_order = None
- pending_intent = None
-
  def extract_order(text: str):
      m = order_re.search(text)
      return m.group(1) if m else None

- def handle_status(o): return f"Order #{o} is in transit and should arrive in 3–5 days."
- def handle_eta(o): return f"Delivery for order #{o} typically takes 3–5 days; track at https://track.example.com/{o}"
+ def handle_status(o): return f"Order #{o} is in transit and should arrive in 3–5 business days."
+ def handle_eta(o): return f"Delivery for order #{o} typically takes 3–5 days; you can track it at https://track.example.com/{o}"
  def handle_track(o): return f"Track order #{o} here: https://track.example.com/{o}"
- def handle_link(o): return f"Here’s the latest link for order #{o}: https://track.example.com/{o}"
- def handle_return(_): return "Our return policy: unused items in original packaging within 30 days. Need human agent?"
- def handle_thanks(_): return "You’re welcome! Anything else I can do?"
- def handle_escalate(_): return "I’m sorry, I don’t have that info. Connect you with a human agent?"
+ def handle_link(o): return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"
+ def handle_return_policy(_=None):
+     return ("Our return policy allows returns of unused items in their original packaging "
+             "within 30 days of receipt. Would you like me to connect you with a human agent?")
+ def handle_gratitude(_=None):
+     return "You’re welcome! Is there anything else I can help with?"
+ def handle_escalation(_=None):
+     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

- # ——— Main Chat Function ————————————————————————————
+ # 8) Core chat fn
+ stored_order = None
+ pending_intent = None
  def chat_with_memory(user_input: str) -> str:
      global stored_order, pending_intent
-     memory.save_context({"input": user_input}, {"output": ""})

-     # capture fresh order # and fulfill pending
+     # A) Save into history
+     conversation_history.append(("User", user_input))
+
+     # B) New order?
      new_o = extract_order(user_input)
      if new_o:
          stored_order = new_o
@@ -84,33 +92,46 @@ def chat_with_memory(user_input: str) -> str:
              fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[pending_intent]
              reply = fn(stored_order)
              pending_intent = None
-             memory.save_context({"input": user_input}, {"output": reply})
+             conversation_history.append(("Assistant", reply))
              return reply

      ui = user_input.lower().strip()
-     # gratitude
+
+     # C) Gratitude
      if any(tok in ui for tok in ["thank you","thanks","thx"]):
-         reply = handle_thanks(None)
-     # return policy
-     elif "return" in ui:
-         reply = handle_return(None)
+         reply = handle_gratitude()
+         conversation_history.append(("Assistant", reply))
+         return reply
+
+     # D) Return policy
+     if "return" in ui:
+         reply = handle_return_policy()
+         conversation_history.append(("Assistant", reply))
+         return reply
+
+     # E) Classify intent
+     if any(k in ui for k in ["status","where is my order","check status"]):
+         intent="status"
+     elif any(k in ui for k in ["how long","eta","delivery time"]):
+         intent="eta"
+     elif any(k in ui for k in ["how can i track","track my order","where is my package"]):
+         intent="track"
+     elif "tracking link" in ui or "resend" in ui:
+         intent="link"
      else:
-         # intents
-         if any(k in ui for k in ["status","where is my order"]): intent="status"
-         elif any(k in ui for k in ["how long","eta"]): intent="eta"
-         elif any(k in ui for k in ["track","where is my package"]): intent="track"
-         elif any(k in ui for k in ["tracking link","resend"]): intent="link"
-         else: intent="escalate"
-
-         if intent in ("status","eta","track","link"):
-             if not stored_order:
-                 pending_intent = intent
-                 reply = "Sure—what’s your order number (e.g. #12345)?"
-             else:
-                 fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[intent]
-                 reply = fn(stored_order)
+         intent="fallback"
+
+     # F) Fulfill or ask order #
+     if intent in ("status","eta","track","link"):
+         if not stored_order:
+             pending_intent = intent
+             reply = "Sure—what’s your order number (e.g., #12345)?"
          else:
-             reply = handle_escalate(None)
+             fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[intent]
+             reply = fn(stored_order)
+     else:
+         reply = handle_escalation()

-     memory.save_context({"input": user_input}, {"output": reply})
+     # G) Save & done
+     conversation_history.append(("Assistant", reply))
      return reply
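
For context, a minimal sketch of how the new FSM-style chat_with_memory is meant to behave. This is illustrative only, not part of the commit; importing SLM_CService loads the 4-bit model, so it assumes the bitext-qlora-tinyllama adapter and its GPU dependencies are available locally.

    from SLM_CService import chat_with_memory, conversation_history

    print(chat_with_memory("Where is my order?"))  # no order number stored yet -> bot asks for one
    print(chat_with_memory("#12345"))              # fulfills the pending "status" intent
    print(chat_with_memory("thanks"))              # gratitude branch
    print(conversation_history)                    # alternating ("User"/"Assistant", text) tuples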
app.py CHANGED
@@ -1,81 +1,16 @@
- # app.py
+ # ─── app.py ─────────────────────────────────────────────────────────────────
  import os
- # ── suppress libgomp warnings ─────────────────────────────────────────────
- os.environ["OMP_NUM_THREADS"] = "1"
-
- # ── patch Unsloth before transformers ────────────────────────────────────
- import unsloth
- import torch
- import triton
-
- # ── Gradio & model deps ──────────────────────────────────────────────────
- import gradio as gr
- from transformers import pipeline, AutoTokenizer, BitsAndBytesConfig
- from peft import PeftModel
- from langchain.memory import ConversationBufferMemory
-
- # ── Load your fine-tuned QLoRA model ────────────────────────────────────
- BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
- ADAPTER_DIR = "tinyllama-qlora-adapters" # or wherever you stored your adapters
-
- bnb_cfg = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type="bnb_dynamic",
-     bnb_4bit_use_double_quant=True
- )
-
- tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=False, local_files_only=True)
- tokenizer.pad_token_id = tokenizer.eos_token_id
-
- model = unsloth.FastLanguageModel.from_pretrained(
-     BASE,
-     load_in_4bit=True,
-     quant_type="bnb_dynamic",
-     device_map="auto",
-     trust_remote_code=True
- )
- model = PeftModel.from_pretrained(model, ADAPTER_DIR, local_files_only=True)
-
- chat_pipe = pipeline(
-     "conversational",
-     model=model,
-     tokenizer=tokenizer,
-     trust_remote_code=True,
-     return_full_text=False,
-     device_map="auto",
-     generation_kwargs={"max_new_tokens":128, "do_sample":True, "top_p":0.9, "temperature":0.7}
- )
-
- # ── Conversational memory ────────────────────────────────────────────────
- memory = ConversationBufferMemory(
-     memory_key="chat_history",
-     human_prefix="User",
-     ai_prefix="Assistant",
-     return_messages=True
- )
+ from gradio import ChatInterface, FileMessage, TextMessage
+ from SLM_CService import chat_with_memory, conversation_history

+ # Gradio chat UI
  def respond(user_message, history):
-     # reconstruct history for LangChain
-     for msg in history:
-         if msg["role"] == "user":
-             chat_pipe.add_user_input(msg["content"])
-         else:
-             chat_pipe.append_response(msg["content"])
-     # get new reply
-     reply = chat_pipe(user_message).generated_responses[-1]
-     # save into memory
-     memory.save_context({"input": user_message}, {"output": reply})
-     return reply, memory.load_memory_variables({})["chat_history"]
+     reply = chat_with_memory(user_message)
+     # Gradio expects list of lists
+     return history + [(user_message, reply)]

- # ── Gradio UI ────────────────────────────────────────────────────────────
- with gr.Blocks() as demo:
-     gr.Markdown("# 🛎 Customer Support Chatbot")
-     chatbot = gr.Chatbot()
-     with gr.Row():
-         user_in = gr.Textbox(placeholder="Type your message here…")
-         send = gr.Button("Send")
-         reset = gr.Button("🔄 Reset Chat")
-     send.click(respond, [user_in, chatbot], [chatbot, chatbot])
-     reset.click(lambda: ([], []), None, [chatbot, chatbot])
+ iface = ChatInterface(fn=respond,
+                       title="SLM Customer Support Bot",
+                       description="A quantized TinyLlama+QLoRA chatbot with simple memory.")
+ iface.launch()

- demo.launch(server_name="0.0.0.0", server_port=7860)
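
A note on the Gradio wiring: gr.ChatInterface conventionally calls fn(message, history) and renders whatever fn returns as the assistant reply, and the FileMessage/TextMessage imports are not used anywhere in this flow. A leaner hook-up under that assumption, offered as a sketch rather than the committed code, could look like:

    import gradio as gr
    from SLM_CService import chat_with_memory

    def respond(user_message, history):
        # conversation state already lives in SLM_CService.conversation_history,
        # so only the new reply string is returned to the UI
        return chat_with_memory(user_message)

    gr.ChatInterface(fn=respond,
                     title="SLM Customer Support Bot",
                     description="A quantized TinyLlama+QLoRA chatbot with simple memory.").launch()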
requirements.txt CHANGED
@@ -8,4 +8,4 @@ unsloth_zoo
  huggingface_hub
  sentencepiece
  torch
- langchain>=0.0.250
+ langchain