BasilTh committed on
Commit 7d9bb79 · 1 Parent(s): 93d3bfa

Deploy updated SLM customer-support chatbot

Files changed (1)
  1. SLM_CService.py +60 -23
SLM_CService.py CHANGED
@@ -1,14 +1,18 @@
  # ── SLM_CService.py ───────────────────────────────────────────────────────────
- # Launch-time model setup + FSM + conversational memory for the chatbot.
+ # Model load + FSM + conversational memory for your Gradio Space.
  
  import os
  import re
- os.environ["OMP_NUM_THREADS"] = "1"     # quiet libgomp noise in Spaces
- os.environ.pop("HF_HUB_OFFLINE", None)  # ensure online Hub access if set
  
- # 1) Unsloth must be imported BEFORE transformers/peft for proper patching.
- #    (If not, you may see perf/memory warnings from Unsloth.)
- import unsloth  # keep this import at the top
+ # Keep OpenMP quiet in Spaces logs
+ os.environ["OMP_NUM_THREADS"] = "1"
+ # Ensure we don't accidentally run offline
+ os.environ.pop("HF_HUB_OFFLINE", None)
+
+ # 1) Unsloth must be imported BEFORE transformers/peft to apply optimizations.
+ #    (Otherwise you may see perf/memory warnings.)
+ # Ref: Unsloth team warning in issues.
+ import unsloth  # noqa: E402  # must be before transformers/peft
  
  import torch
  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
@@ -16,11 +20,18 @@ from peft import PeftModel
  from langchain.memory import ConversationBufferMemory
  
  # ──────────────────────────────────────────────────────────────────────────────
- # Hub repo that contains your tokenizer + adapter files
+ # Your Hub repo that contains the tokenizer + PEFT adapter files
  REPO = "ThomasBasil/bitext-qlora-tinyllama"
  BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
  
+ # If your files are nested, set this to the exact subfolder path (or use
+ # the HF_SUBFOLDER env var from Space → Settings → Variables).
+ # Example from your screenshot:
+ DEFAULT_SUBFOLDER = "bitext-qlora-tinyllama-20250807T224217Z-1-001/bitext-qlora-tinyllama"
+ SUBFOLDER = os.environ.get("HF_SUBFOLDER", DEFAULT_SUBFOLDER)
+
  # 4-bit NF4 quantization config (QLoRA-style)
+ # Ref: Transformers bitsandbytes quantization docs.
  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
@@ -28,13 +39,24 @@ bnb_cfg = BitsAndBytesConfig(
      bnb_4bit_compute_dtype=torch.bfloat16,
  )
  
- # ---- Tokenizer ----
+ # ---- Robust helpers to load from root or subfolder ---------------------------
+
  def _load_tokenizer(repo_id: str):
-     # Try at repo root first; fall back to subfolder("tokenizer") if needed.
+     """
+     Try to load tokenizer from repo root; if missing, try configured subfolder.
+     Transformers supports `subfolder` in from_pretrained for tokenizers.
+     """
+     # Try at repo root first
      try:
          tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
      except Exception:
-         tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
+         # Try "tokenizer" subdir at root
+         try:
+             tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
+         except Exception:
+             # Try the provided nested path
+             tok = AutoTokenizer.from_pretrained(repo_id, subfolder=SUBFOLDER, use_fast=False)
+
      # sensible defaults for causal LM
      if tok.pad_token_id is None and tok.eos_token_id is not None:
          tok.pad_token_id = tok.eos_token_id
@@ -42,29 +64,42 @@ def _load_tokenizer(repo_id: str):
      tok.truncation_side = "right"
      return tok
  
+
+ def _attach_adapter(base_model, repo_id: str):
+     """
+     Attach PEFT adapter from root; if not found, try subfolder variants.
+     (PEFT supports kwargs like `subfolder`, though older versions had quirks;
+     if you ever hit issues, place adapter files at repo root.)
+     """
+     # Try repo root
+     try:
+         return PeftModel.from_pretrained(base_model, repo_id)
+     except Exception:
+         # Try 'adapter' subdir at root
+         try:
+             return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
+         except Exception:
+             # Try the provided nested path
+             return PeftModel.from_pretrained(base_model, repo_id, subfolder=SUBFOLDER)
+
+
+ # ---- Load tokenizer, base model (4-bit), and attach adapter ------------------
+
  tokenizer = _load_tokenizer(REPO)
  
- # ---- Base model (Unsloth) ----
  model = unsloth.FastLanguageModel.from_pretrained(
      BASE,
      load_in_4bit=True,
-     quantization_config=bnb_cfg,  # prefer quantization_config over legacy args
+     quantization_config=bnb_cfg,  # prefer quantization_config over legacy args
      device_map="auto",
      trust_remote_code=True,
  )
  
- # ---- Apply your LoRA adapter from the same repo ----
- def _attach_adapter(base_model, repo_id: str):
-     # Try repo root; if the adapter lives under adapter/, use subfolder.
-     try:
-         return PeftModel.from_pretrained(base_model, repo_id)
-     except Exception:
-         return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
-
  model = _attach_adapter(model, REPO)
  model.eval()
  
- # ---- Text-generation pipeline (note: generate_kwargs, not generation_kwargs) ----
+ # Transformers pipeline accepts `generate_kwargs` to pass through to .generate().
+ # Ref: Pipelines docs mention `generate_kwargs`.
  chat_pipe = pipeline(
      "text-generation",
      model=model,
@@ -101,17 +136,18 @@ def handle_return_policy(_=None):
  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
-     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?")
+     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"
  
  stored_order = None
  pending_intent = None
  
+
  def _history_to_prompt(user_input: str) -> str:
      """Build a plain-text prompt that includes chat history for fallback generation."""
      hist = memory.load_memory_variables({}).get("chat_history", [])
      prompt = "You are a helpful support assistant.\n"
      for msg in hist:
-         # LangChain messages often have .type ('human'/'ai') and .content fields.
+         # LangChain messages often have .type ('human'/'ai') and .content
          mtype = getattr(msg, "type", "")
          role = "User" if mtype == "human" else "Assistant"
          content = getattr(msg, "content", "")
@@ -119,6 +155,7 @@ def _history_to_prompt(user_input: str) -> str:
      prompt += f"User: {user_input}\nAssistant: "
      return prompt
  
+
  def chat_with_memory(user_input: str) -> str:
      """Main entrypoint called by app.py."""
      global stored_order, pending_intent
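Note on the subfolder fallbacks added above: _load_tokenizer and _attach_adapter try the repo root, then tokenizer/ or adapter/, then the nested SUBFOLDER path. A quick way to confirm which layout the Hub repo actually uses is to list its files up front; the sketch below is illustrative and not part of this commit (the helper name _find_subfolder and the marker filename are assumptions).

# Hypothetical diagnostic (not in this commit): inspect the repo layout to see
# where the adapter/tokenizer files actually live before relying on fallbacks.
from huggingface_hub import list_repo_files

def _find_subfolder(repo_id: str, marker: str = "adapter_config.json") -> str:
    """Return the directory containing `marker`, or '' if it sits at the repo root."""
    for path in list_repo_files(repo_id):
        if path.endswith(marker):
            return path.rsplit("/", 1)[0] if "/" in path else ""
    raise FileNotFoundError(f"{marker} not found in {repo_id}")

# The result could be exported as HF_SUBFOLDER instead of hard-coding
# DEFAULT_SUBFOLDER in the module.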
 
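The last full hunk cuts off inside the chat_pipe = pipeline(...) call, so the call site that uses it is not shown. As a rough illustration of how the fallback path could combine _history_to_prompt with the pipeline, here is a hedged sketch; the _generate_fallback helper and the generation parameters are assumptions, not code from this commit.

# Hypothetical sketch (not in this commit): the LLM fallback calling the pipeline
# with the history-aware prompt. Parameter values are illustrative only.
def _generate_fallback(user_input: str) -> str:
    prompt = _history_to_prompt(user_input)
    outputs = chat_pipe(
        prompt,
        max_new_tokens=128,      # forwarded to model.generate()
        do_sample=True,
        temperature=0.7,
        return_full_text=False,  # return only the new completion, not the prompt
    )
    return outputs[0]["generated_text"].strip()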
 
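chat_with_memory is documented as the entrypoint called by app.py, which is not included in this commit. Below is a minimal sketch of how a Gradio Space might wire it up, assuming a standard gr.ChatInterface; the respond function and the title string are placeholders.

# Hypothetical app.py (not in this commit): minimal Gradio wiring for the Space.
import gradio as gr
from SLM_CService import chat_with_memory

def respond(message, history):
    # Conversation state lives in SLM_CService's ConversationBufferMemory,
    # so only the latest user message is forwarded here.
    return chat_with_memory(message)

demo = gr.ChatInterface(fn=respond, title="SLM Customer Support")

if __name__ == "__main__":
    demo.launch()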