BasilTh committed
Commit · 93d3bfa
Parent(s): 77b14f6

Deploy updated SLM customer-support chatbot

Files changed:
- SLM_CService.py  +58 -87
- requirements.txt  +1 -4
SLM_CService.py
CHANGED
@@ -1,88 +1,26 @@
-#
 # Launch-time model setup + FSM + conversational memory for the chatbot.

-import os
-
-os.environ

-# 1) Unsloth must be imported
-
-import

 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
 from peft import PeftModel
 from langchain.memory import ConversationBufferMemory
-import gdown
-import re

-#
-#
-
-ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapter")
-TOKENIZER_DIR = os.path.join(PERSIST_DIR, "tokenizer")
-ZIP_PATH = os.path.join(PERSIST_DIR, "assets.zip")
-
-# ── Provide Google Drive IDs as Secrets (HF Space -> Settings -> Variables) ──
-# Either one zip with both folders...
-GDRIVE_ZIP_ID = os.environ.get("GDRIVE_ZIP_ID")
-# ...or separate zips/files for each:
-GDRIVE_ADAPTER_ID = os.environ.get("GDRIVE_ADAPTER_ID")
-GDRIVE_TOKENIZER_ID = os.environ.get("GDRIVE_TOKENIZER_ID")
-
-def _ensure_dirs():
-    os.makedirs(PERSIST_DIR, exist_ok=True)
-    os.makedirs(ADAPTER_DIR, exist_ok=True)
-    os.makedirs(TOKENIZER_DIR, exist_ok=True)
-
-def _have_local_assets():
-    # minimal sanity checks for typical PEFT/tokenizer files
-    tok_ok = any(os.path.exists(os.path.join(TOKENIZER_DIR, f))
-                 for f in ("tokenizer.json", "tokenizer.model", "tokenizer_config.json"))
-    lora_ok = any(os.path.exists(os.path.join(ADAPTER_DIR, f))
-                  for f in ("adapter_config.json", "adapter_model.bin", "adapter_model.safetensors"))
-    return tok_ok and lora_ok
-
-def _download_from_drive():
-    """Download adapter/tokenizer from Google Drive into /data using gdown."""
-    _ensure_dirs()
-    if GDRIVE_ZIP_ID:
-        gdown.download(id=GDRIVE_ZIP_ID, output=ZIP_PATH, quiet=False)  # gdown is built for Drive.
-        with zipfile.ZipFile(ZIP_PATH, "r") as zf:
-            zf.extractall(PERSIST_DIR)
-        return
-
-    if GDRIVE_ADAPTER_ID:
-        ad_zip = os.path.join(PERSIST_DIR, "adapter.zip")
-        gdown.download(id=GDRIVE_ADAPTER_ID, output=ad_zip, quiet=False)
-        try:
-            with zipfile.ZipFile(ad_zip, "r") as zf:
-                zf.extractall(ADAPTER_DIR)
-        except zipfile.BadZipFile:
-            # not a zip → assume single file
-            shutil.move(ad_zip, os.path.join(ADAPTER_DIR, "adapter_model.bin"))
-
-    if GDRIVE_TOKENIZER_ID:
-        tk_zip = os.path.join(PERSIST_DIR, "tokenizer.zip")
-        gdown.download(id=GDRIVE_TOKENIZER_ID, output=tk_zip, quiet=False)
-        try:
-            with zipfile.ZipFile(tk_zip, "r") as zf:
-                zf.extractall(TOKENIZER_DIR)
-        except zipfile.BadZipFile:
-            shutil.move(tk_zip, os.path.join(TOKENIZER_DIR, "tokenizer.json"))
-
-# ── Ensure local assets from Drive (first launch will download) ──────────────
-if not _have_local_assets():
-    _download_from_drive()  # persists in /data if you enabled it.
-
-# ── Tokenizer (from your Drive-backed folder) ────────────────────────────────
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, use_fast=False)
-tokenizer.pad_token_id = tokenizer.eos_token_id
-tokenizer.padding_side = "left"
-tokenizer.truncation_side = "right"
-
-# ── Base model (4-bit) via Unsloth + your PEFT adapter ──────────────────────
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -90,33 +28,65 @@ bnb_cfg = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16,
 )

 model = unsloth.FastLanguageModel.from_pretrained(
     BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,
     device_map="auto",
     trust_remote_code=True,
 )

-

-#
-# Transformers pipelines accept `generate_kwargs` to forward to .generate().
 chat_pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
     trust_remote_code=True,
     return_full_text=False,
-    generate_kwargs={
 )

-#
-#
 memory = ConversationBufferMemory(return_messages=True)

-#
 order_re = re.compile(r"#(\d{1,10})")
 def extract_order(text: str):
     m = order_re.search(text)
     return m.group(1) if m else None
@@ -131,17 +101,17 @@ def handle_return_policy(_=None):
 def handle_gratitude(_=None):
     return "You're welcome! Is there anything else I can help with?"
 def handle_escalation(_=None):
-    return "I'm sorry, I don't have that information. Would you like me to connect you with a human agent?"

 stored_order = None
 pending_intent = None

 def _history_to_prompt(user_input: str) -> str:
-    """Build a prompt
     hist = memory.load_memory_variables({}).get("chat_history", [])
     prompt = "You are a helpful support assistant.\n"
     for msg in hist:
-        # LangChain messages
         mtype = getattr(msg, "type", "")
         role = "User" if mtype == "human" else "Assistant"
         content = getattr(msg, "content", "")
@@ -150,9 +120,10 @@ def _history_to_prompt(user_input: str) -> str:
     return prompt

 def chat_with_memory(user_input: str) -> str:
     global stored_order, pending_intent

-    ui = user_input.strip()
     low = ui.lower()

     # A) quick intent short-circuits

+# ── SLM_CService.py ──────────────────────────────────────────────────────────
 # Launch-time model setup + FSM + conversational memory for the chatbot.

+import os
+import re
+os.environ["OMP_NUM_THREADS"] = "1"   # quiet libgomp noise in Spaces
+os.environ.pop("HF_HUB_OFFLINE", None)  # ensure online Hub access if set

+# 1) Unsloth must be imported BEFORE transformers/peft for proper patching.
+#    (If not, you may see perf/memory warnings from Unsloth.)
+import unsloth  # keep this import at the top

+import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
 from peft import PeftModel
 from langchain.memory import ConversationBufferMemory

+# ──────────────────────────────────────────────────────────────────────────────
+# Hub repo that contains your tokenizer + adapter files
+REPO = "ThomasBasil/bitext-qlora-tinyllama"
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

+# 4-bit NF4 quantization config (QLoRA-style)
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.bfloat16,
 )

+# ---- Tokenizer ----
+def _load_tokenizer(repo_id: str):
+    # Try at repo root first; fall back to subfolder("tokenizer") if needed.
+    try:
+        tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
+    except Exception:
+        tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
+    # sensible defaults for causal LM
+    if tok.pad_token_id is None and tok.eos_token_id is not None:
+        tok.pad_token_id = tok.eos_token_id
+    tok.padding_side = "left"
+    tok.truncation_side = "right"
+    return tok
+
+tokenizer = _load_tokenizer(REPO)
+
+# ---- Base model (Unsloth) ----
 model = unsloth.FastLanguageModel.from_pretrained(
     BASE,
     load_in_4bit=True,
+    quantization_config=bnb_cfg,  # prefer quantization_config over legacy args
     device_map="auto",
     trust_remote_code=True,
 )

+# ---- Apply your LoRA adapter from the same repo ----
+def _attach_adapter(base_model, repo_id: str):
+    # Try repo root; if the adapter lives under adapter/, use subfolder.
+    try:
+        return PeftModel.from_pretrained(base_model, repo_id)
+    except Exception:
+        return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
+
+model = _attach_adapter(model, REPO)
+model.eval()

+# ---- Text-generation pipeline (note: generate_kwargs, not generation_kwargs) ----
 chat_pipe = pipeline(
     "text-generation",
     model=model,
     tokenizer=tokenizer,
     trust_remote_code=True,
     return_full_text=False,
+    generate_kwargs={
+        "max_new_tokens": 128,
+        "do_sample": True,
+        "top_p": 0.9,
+        "temperature": 0.7,
+    },
 )

+# ──────────────────────────────────────────────────────────────────────────────
+# Conversational Memory (LangChain)
 memory = ConversationBufferMemory(return_messages=True)

+# ──────────────────────────────────────────────────────────────────────────────
+# Simple FSM helpers
 order_re = re.compile(r"#(\d{1,10})")
+
 def extract_order(text: str):
     m = order_re.search(text)
     return m.group(1) if m else None
 def handle_gratitude(_=None):
     return "You're welcome! Is there anything else I can help with?"
 def handle_escalation(_=None):
+    return "I'm sorry, I don't have that information. Would you like me to connect you with a human agent?"

 stored_order = None
 pending_intent = None

 def _history_to_prompt(user_input: str) -> str:
+    """Build a plain-text prompt that includes chat history for fallback generation."""
     hist = memory.load_memory_variables({}).get("chat_history", [])
     prompt = "You are a helpful support assistant.\n"
     for msg in hist:
+        # LangChain messages often have .type ('human'/'ai') and .content fields.
         mtype = getattr(msg, "type", "")
         role = "User" if mtype == "human" else "Assistant"
         content = getattr(msg, "content", "")
     return prompt

 def chat_with_memory(user_input: str) -> str:
+    """Main entrypoint called by app.py."""
     global stored_order, pending_intent

+    ui = (user_input or "").strip()
     low = ui.lower()

     # A) quick intent short-circuits
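
Not part of the commit: a minimal smoke-test sketch of how the refactored module could be exercised once the Space has booted, assuming SLM_CService is importable from app.py's working directory and the model load at import time succeeds. The file name, order number, and prompts below are made up for illustration.

# smoke_test.py — hypothetical, for illustration only (not in this commit)
from SLM_CService import extract_order, chat_with_memory

# FSM helper: pulls the digits out of an "#<order>" mention
assert extract_order("Where is order #12345?") == "12345"

# Main entrypoint used by app.py: FSM short-circuits handle known intents,
# everything else falls through to the TinyLlama + LoRA pipeline.
print(chat_with_memory("Hi, I'd like to check on order #12345"))
print(chat_with_memory("thanks, that's all!"))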
requirements.txt
CHANGED
@@ -3,10 +3,7 @@ transformers
 torch
 sentencepiece
 langchain
-bitsandbytes
 peft
-
 unsloth
-unsloth_zoo
 huggingface_hub
-gdown

 torch
 sentencepiece
 langchain
 peft
+bitsandbytes
 unsloth
 huggingface_hub
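
Since the trimmed requirements drop gdown and unsloth_zoo, a quick sanity check (hypothetical, not part of this commit) is to confirm that every package the updated script still relies on resolves in the Space's environment:

# dependency_check.py — hypothetical helper, not in this commit
import importlib

# bitsandbytes is not imported directly by SLM_CService.py, but the 4-bit
# BitsAndBytesConfig path needs it at runtime, so it is checked here too.
for mod in ("torch", "transformers", "peft", "langchain", "bitsandbytes", "unsloth"):
    importlib.import_module(mod)  # raises ImportError if the trimmed requirements miss one
print("all runtime imports resolve")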