BasilTh committed
Commit 93d3bfa · 1 Parent(s): 77b14f6

Deploy updated SLM customer-support chatbot

Files changed (2):
  1. SLM_CService.py +58 -87
  2. requirements.txt +1 -4
SLM_CService.py CHANGED
@@ -1,88 +1,26 @@
- # ─── SLM_CService.py ─────────────────────────────────────────────────────────
  # Launch-time model setup + FSM + conversational memory for the chatbot.

- import os, shutil, zipfile
- os.environ["OMP_NUM_THREADS"] = "1"      # quiet libgomp noise
- os.environ.pop("HF_HUB_OFFLINE", None)   # avoid accidental offline mode

- # 1) Unsloth must be imported before transformers
- import unsloth
- import torch

  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
  from peft import PeftModel
  from langchain.memory import ConversationBufferMemory
- import gdown
- import re

- # ── Persistent storage (HF Spaces -> Settings -> Persistent storage) ─────────
- # Docs: /data persists across Space restarts (see HF persistent-storage docs).
- PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data/slm_assets")
- ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapter")
- TOKENIZER_DIR = os.path.join(PERSIST_DIR, "tokenizer")
- ZIP_PATH = os.path.join(PERSIST_DIR, "assets.zip")
-
- # ── Provide Google Drive IDs as Secrets (HF Space -> Settings -> Variables) ──
- # Either one zip with both folders...
- GDRIVE_ZIP_ID = os.environ.get("GDRIVE_ZIP_ID")
- # ...or separate zips/files for each:
- GDRIVE_ADAPTER_ID = os.environ.get("GDRIVE_ADAPTER_ID")
- GDRIVE_TOKENIZER_ID = os.environ.get("GDRIVE_TOKENIZER_ID")
-
- def _ensure_dirs():
-     os.makedirs(PERSIST_DIR, exist_ok=True)
-     os.makedirs(ADAPTER_DIR, exist_ok=True)
-     os.makedirs(TOKENIZER_DIR, exist_ok=True)
-
- def _have_local_assets():
-     # minimal sanity checks for typical PEFT/tokenizer files
-     tok_ok = any(os.path.exists(os.path.join(TOKENIZER_DIR, f))
-                  for f in ("tokenizer.json", "tokenizer.model", "tokenizer_config.json"))
-     lora_ok = any(os.path.exists(os.path.join(ADAPTER_DIR, f))
-                   for f in ("adapter_config.json", "adapter_model.bin", "adapter_model.safetensors"))
-     return tok_ok and lora_ok
-
- def _download_from_drive():
-     """Download adapter/tokenizer from Google Drive into /data using gdown."""
-     _ensure_dirs()
-     if GDRIVE_ZIP_ID:
-         gdown.download(id=GDRIVE_ZIP_ID, output=ZIP_PATH, quiet=False)  # gdown is built for Drive
-         with zipfile.ZipFile(ZIP_PATH, "r") as zf:
-             zf.extractall(PERSIST_DIR)
-         return
-
-     if GDRIVE_ADAPTER_ID:
-         ad_zip = os.path.join(PERSIST_DIR, "adapter.zip")
-         gdown.download(id=GDRIVE_ADAPTER_ID, output=ad_zip, quiet=False)
-         try:
-             with zipfile.ZipFile(ad_zip, "r") as zf:
-                 zf.extractall(ADAPTER_DIR)
-         except zipfile.BadZipFile:
-             # not a zip – assume single file
-             shutil.move(ad_zip, os.path.join(ADAPTER_DIR, "adapter_model.bin"))
-
-     if GDRIVE_TOKENIZER_ID:
-         tk_zip = os.path.join(PERSIST_DIR, "tokenizer.zip")
-         gdown.download(id=GDRIVE_TOKENIZER_ID, output=tk_zip, quiet=False)
-         try:
-             with zipfile.ZipFile(tk_zip, "r") as zf:
-                 zf.extractall(TOKENIZER_DIR)
-         except zipfile.BadZipFile:
-             shutil.move(tk_zip, os.path.join(TOKENIZER_DIR, "tokenizer.json"))
-
- # ── Ensure local assets from Drive (first launch will download) ──────────────
- if not _have_local_assets():
-     _download_from_drive()  # persists in /data if you enabled it
-
- # ── Tokenizer (from your Drive-backed folder) ────────────────────────────────
- tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, use_fast=False)
- tokenizer.pad_token_id = tokenizer.eos_token_id
- tokenizer.padding_side = "left"
- tokenizer.truncation_side = "right"
-
- # ── Base model (4-bit) via Unsloth + your PEFT adapter ──────────────────────
  BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
@@ -90,33 +28,65 @@ bnb_cfg = BitsAndBytesConfig(
      bnb_4bit_compute_dtype=torch.bfloat16,
  )

  model = unsloth.FastLanguageModel.from_pretrained(
      BASE,
      load_in_4bit=True,
-     quantization_config=bnb_cfg,   # prefer quantization_config over quant_type
      device_map="auto",
      trust_remote_code=True,
  )

- model = PeftModel.from_pretrained(model, ADAPTER_DIR)

- # ── Text-generation pipeline (use generate_kwargs, not generation_kwargs) ────
- # Transformers pipelines accept `generate_kwargs` to forward to .generate().
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
-     generate_kwargs={"max_new_tokens": 128, "do_sample": True, "top_p": 0.9, "temperature": 0.7},
  )

- # ── Conversational memory (LangChain) ────────────────────────────────────────
- # ConversationBufferMemory stores full turn-by-turn chat history.
  memory = ConversationBufferMemory(return_messages=True)

- # ── FSM helpers (your original logic, kept intact) ───────────────────────────
  order_re = re.compile(r"#(\d{1,10})")
  def extract_order(text: str):
      m = order_re.search(text)
      return m.group(1) if m else None
@@ -131,17 +101,17 @@ def handle_return_policy(_=None):
  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
-     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

  stored_order = None
  pending_intent = None

  def _history_to_prompt(user_input: str) -> str:
-     """Build a prompt from LangChain memory turns for fallback generation."""
      hist = memory.load_memory_variables({}).get("chat_history", [])
      prompt = "You are a helpful support assistant.\n"
      for msg in hist:
-         # LangChain messages expose a .type like 'human'/'ai' in many versions
          mtype = getattr(msg, "type", "")
          role = "User" if mtype == "human" else "Assistant"
          content = getattr(msg, "content", "")
@@ -150,9 +120,10 @@ def _history_to_prompt(user_input: str) -> str:
      return prompt

  def chat_with_memory(user_input: str) -> str:
      global stored_order, pending_intent

-     ui = user_input.strip()
      low = ui.lower()

      # A) quick intent short-circuits
 
+ # ── SLM_CService.py ───────────────────────────────────────────────────────────
  # Launch-time model setup + FSM + conversational memory for the chatbot.

+ import os
+ import re
+ os.environ["OMP_NUM_THREADS"] = "1"      # quiet libgomp noise in Spaces
+ os.environ.pop("HF_HUB_OFFLINE", None)   # ensure online Hub access if set

+ # 1) Unsloth must be imported BEFORE transformers/peft for proper patching.
+ # (If not, you may see perf/memory warnings from Unsloth.)
+ import unsloth   # keep this import at the top

+ import torch
  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
  from peft import PeftModel
  from langchain.memory import ConversationBufferMemory

+ # ──────────────────────────────────────────────────────────────────────────────
+ # Hub repo that contains your tokenizer + adapter files
+ REPO = "ThomasBasil/bitext-qlora-tinyllama"
  BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

+ # 4-bit NF4 quantization config (QLoRA-style)
  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16,
  )

+ # ---- Tokenizer ----
+ def _load_tokenizer(repo_id: str):
+     # Try at repo root first; fall back to subfolder("tokenizer") if needed.
+     try:
+         tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
+     except Exception:
+         tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
+     # sensible defaults for causal LM
+     if tok.pad_token_id is None and tok.eos_token_id is not None:
+         tok.pad_token_id = tok.eos_token_id
+     tok.padding_side = "left"
+     tok.truncation_side = "right"
+     return tok
+
+ tokenizer = _load_tokenizer(REPO)
+
+ # ---- Base model (Unsloth) ----
  model = unsloth.FastLanguageModel.from_pretrained(
      BASE,
      load_in_4bit=True,
+     quantization_config=bnb_cfg,   # prefer quantization_config over legacy args
      device_map="auto",
      trust_remote_code=True,
  )

+ # ---- Apply your LoRA adapter from the same repo ----
+ def _attach_adapter(base_model, repo_id: str):
+     # Try repo root; if the adapter lives under adapter/, use subfolder.
+     try:
+         return PeftModel.from_pretrained(base_model, repo_id)
+     except Exception:
+         return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
+
+ model = _attach_adapter(model, REPO)
+ model.eval()

+ # ---- Text-generation pipeline (note: generate_kwargs, not generation_kwargs) ----
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
+     generate_kwargs={
+         "max_new_tokens": 128,
+         "do_sample": True,
+         "top_p": 0.9,
+         "temperature": 0.7,
+     },
  )

+ # ──────────────────────────────────────────────────────────────────────────────
+ # Conversational Memory (LangChain)
  memory = ConversationBufferMemory(return_messages=True)

+ # ──────────────────────────────────────────────────────────────────────────────
+ # Simple FSM helpers
  order_re = re.compile(r"#(\d{1,10})")
+
  def extract_order(text: str):
      m = order_re.search(text)
      return m.group(1) if m else None

  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
+     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

  stored_order = None
  pending_intent = None

  def _history_to_prompt(user_input: str) -> str:
+     """Build a plain-text prompt that includes chat history for fallback generation."""
      hist = memory.load_memory_variables({}).get("chat_history", [])
      prompt = "You are a helpful support assistant.\n"
      for msg in hist:
+         # LangChain messages often have .type ('human'/'ai') and .content fields.
          mtype = getattr(msg, "type", "")
          role = "User" if mtype == "human" else "Assistant"
          content = getattr(msg, "content", "")

      return prompt

  def chat_with_memory(user_input: str) -> str:
+     """Main entrypoint called by app.py."""
      global stored_order, pending_intent

+     ui = (user_input or "").strip()
      low = ui.lower()

      # A) quick intent short-circuits
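
chat_with_memory is documented above as the entrypoint called by app.py, which is not part of this commit. A minimal sketch of how it might be wired into the Space's UI, assuming a Gradio SDK Space (gradio, the respond helper, and the interface title below are illustrative assumptions, not code from this repo):

# app.py (hypothetical sketch, not part of this commit)
import gradio as gr                      # assumed available via the Space's Gradio SDK
from SLM_CService import chat_with_memory

def respond(message, history):
    # Conversation state lives in SLM_CService's ConversationBufferMemory,
    # so the Gradio-provided history is not re-fed into the model here.
    return chat_with_memory(message)

demo = gr.ChatInterface(fn=respond, title="SLM Customer-Support Chatbot")

if __name__ == "__main__":
    demo.launch()

With this wiring, each turn goes through the FSM short-circuits first, with chat_pipe plus the buffered history serving as the fallback generation path.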
requirements.txt CHANGED
@@ -3,10 +3,7 @@ transformers
  torch
  sentencepiece
  langchain
- bitsandbytes
  peft
- xformers
  unsloth
- unsloth_zoo
  huggingface_hub
- gdown

  torch
  sentencepiece
  langchain
  peft
+ bitsandbytes
  unsloth
  huggingface_hub
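
The trimmed requirements keep bitsandbytes for the 4-bit NF4 config and drop gdown (the Google Drive download path is gone) along with xformers and unsloth_zoo. A quick local sanity check, assuming the packages above are installed (an illustrative snippet, not part of this commit; it only verifies imports, import order, and CUDA availability, and does not load the model):

# sanity_check.py (hypothetical, not part of this commit)
import unsloth               # must come before transformers, as in SLM_CService.py
import transformers
import peft
import bitsandbytes as bnb
import langchain
import torch

print("transformers:", transformers.__version__)
print("peft:", peft.__version__)
print("bitsandbytes:", bnb.__version__)
print("langchain:", langchain.__version__)
print("CUDA available:", torch.cuda.is_available())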