Spaces:

ThomasBasil
/

slm-customer-support-chatbot

Paused

App Files Files Community

BasilTh committed on Aug 7

Commit

7d9bb79

1 Parent(s): 93d3bfa

Deploy updated SLM customer-support chatbot

Browse files

Files changed (1) hide show

SLM_CService.py +60 -23

SLM_CService.py CHANGED Viewed

@@ -1,14 +1,18 @@
 # ── SLM_CService.py ───────────────────────────────────────────────────────────
-# Launch-time model setup + FSM + conversational memory for the chatbot.
 import os
 import re
-os.environ["OMP_NUM_THREADS"] = "1"            # quiet libgomp noise in Spaces
-os.environ.pop("HF_HUB_OFFLINE", None)         # ensure online Hub access if set
-# 1) Unsloth must be imported BEFORE transformers/peft for proper patching.
-#    (If not, you may see perf/memory warnings from Unsloth.)
-import unsloth  # keep this import at the top
 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
@@ -16,11 +20,18 @@ from peft import PeftModel
 from langchain.memory import ConversationBufferMemory
 # ──────────────────────────────────────────────────────────────────────────────
-# Hub repo that contains your tokenizer + adapter files
 REPO = "ThomasBasil/bitext-qlora-tinyllama"
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 # 4-bit NF4 quantization config (QLoRA-style)
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -28,13 +39,24 @@ bnb_cfg = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
-# ---- Tokenizer ----
 def _load_tokenizer(repo_id: str):
-    # Try at repo root first; fall back to subfolder("tokenizer") if needed.
     try:
         tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
     except Exception:
-        tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
     # sensible defaults for causal LM
     if tok.pad_token_id is None and tok.eos_token_id is not None:
         tok.pad_token_id = tok.eos_token_id
@@ -42,29 +64,42 @@ def _load_tokenizer(repo_id: str):
     tok.truncation_side = "right"
     return tok
 tokenizer = _load_tokenizer(REPO)
-# ---- Base model (Unsloth) ----
 model = unsloth.FastLanguageModel.from_pretrained(
     BASE,
     load_in_4bit=True,
-    quantization_config=bnb_cfg,      # prefer quantization_config over legacy args
     device_map="auto",
     trust_remote_code=True,
 )
-# ---- Apply your LoRA adapter from the same repo ----
-def _attach_adapter(base_model, repo_id: str):
-    # Try repo root; if the adapter lives under adapter/, use subfolder.
-    try:
-        return PeftModel.from_pretrained(base_model, repo_id)
-    except Exception:
-        return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
 model = _attach_adapter(model, REPO)
 model.eval()
-# ---- Text-generation pipeline (note: generate_kwargs, not generation_kwargs) ----
 chat_pipe = pipeline(
     "text-generation",
     model=model,
@@ -101,17 +136,18 @@ def handle_return_policy(_=None):
 def handle_gratitude(_=None):
     return "You’re welcome! Is there anything else I can help with?"
 def handle_escalation(_=None):
-    return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?")
 stored_order   = None
 pending_intent = None
 def _history_to_prompt(user_input: str) -> str:
     """Build a plain-text prompt that includes chat history for fallback generation."""
     hist = memory.load_memory_variables({}).get("chat_history", [])
     prompt = "You are a helpful support assistant.\n"
     for msg in hist:
-        # LangChain messages often have .type ('human'/'ai') and .content fields.
         mtype = getattr(msg, "type", "")
         role  = "User" if mtype == "human" else "Assistant"
         content = getattr(msg, "content", "")
@@ -119,6 +155,7 @@ def _history_to_prompt(user_input: str) -> str:
     prompt += f"User: {user_input}\nAssistant: "
     return prompt
 def chat_with_memory(user_input: str) -> str:
     """Main entrypoint called by app.py."""
     global stored_order, pending_intent

 # ── SLM_CService.py ───────────────────────────────────────────────────────────
+# Model load + FSM + conversational memory for your Gradio Space.
 import os
 import re
+# Keep OpenMP quiet in Spaces logs
+os.environ["OMP_NUM_THREADS"] = "1"
+# Ensure we don't accidentally run offline
+os.environ.pop("HF_HUB_OFFLINE", None)
+# 1) Unsloth must be imported BEFORE transformers/peft to apply optimizations.
+#    (Otherwise you may see perf/memory warnings.)
+#    Ref: Unsloth team warning in issues.
+import unsloth  # noqa: E402  # must be before transformers/peft
 import torch
 from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
 from langchain.memory import ConversationBufferMemory
 # ──────────────────────────────────────────────────────────────────────────────
+# Your Hub repo that contains the tokenizer + PEFT adapter files
 REPO = "ThomasBasil/bitext-qlora-tinyllama"
 BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# If the tokenizer/adapter files are nested inside the repo, set this to the
+# exact subfolder path, or override it with the HF_SUBFOLDER environment
+# variable (Space → Settings → Variables).
+# Default nested path, used when HF_SUBFOLDER is not set:
+DEFAULT_SUBFOLDER = "bitext-qlora-tinyllama-20250807T224217Z-1-001/bitext-qlora-tinyllama"
+SUBFOLDER = os.environ.get("HF_SUBFOLDER", DEFAULT_SUBFOLDER)
 # 4-bit NF4 quantization config (QLoRA-style)
+# Ref: Transformers bitsandbytes quantization docs.
 bnb_cfg = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.bfloat16,
 )
+# ---- Robust helpers to load from root or subfolder ---------------------------
 def _load_tokenizer(repo_id: str):
+    """
+    Try to load tokenizer from repo root; if missing, try configured subfolder.
+    Transformers supports `subfolder` in from_pretrained for tokenizers.
+    """
+    # Try at repo root first
     try:
         tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
     except Exception:
+        # Try "tokenizer" subdir at root
+        try:
+            tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
+        except Exception:
+            # Try the provided nested path
+            tok = AutoTokenizer.from_pretrained(repo_id, subfolder=SUBFOLDER, use_fast=False)
     # sensible defaults for causal LM
     if tok.pad_token_id is None and tok.eos_token_id is not None:
         tok.pad_token_id = tok.eos_token_id
     tok.truncation_side = "right"
     return tok
+def _attach_adapter(base_model, repo_id: str):
+    """
+    Attach PEFT adapter from root; if not found, try subfolder variants.
+    (PEFT supports kwargs like `subfolder`, though older versions had quirks;
+     if you ever hit issues, place adapter files at repo root.)
+    """
+    # Try repo root
+    try:
+        return PeftModel.from_pretrained(base_model, repo_id)
+    except Exception:
+        # Try 'adapter' subdir at root
+        try:
+            return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
+        except Exception:
+            # Try the provided nested path
+            return PeftModel.from_pretrained(base_model, repo_id, subfolder=SUBFOLDER)
+# ---- Load tokenizer, base model (4-bit), and attach adapter ------------------
 tokenizer = _load_tokenizer(REPO)
 model = unsloth.FastLanguageModel.from_pretrained(
     BASE,
     load_in_4bit=True,
+    quantization_config=bnb_cfg,   # prefer quantization_config over legacy args
     device_map="auto",
     trust_remote_code=True,
 )
 model = _attach_adapter(model, REPO)
 model.eval()
+# Transformers pipeline accepts `generate_kwargs` to pass through to .generate().
+# Ref: Pipelines docs mention `generate_kwargs`.
 chat_pipe = pipeline(
     "text-generation",
     model=model,
 def handle_gratitude(_=None):
     return "You’re welcome! Is there anything else I can help with?"
 def handle_escalation(_=None):
+    return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"
 stored_order   = None
 pending_intent = None
 def _history_to_prompt(user_input: str) -> str:
     """Build a plain-text prompt that includes chat history for fallback generation."""
     hist = memory.load_memory_variables({}).get("chat_history", [])
     prompt = "You are a helpful support assistant.\n"
     for msg in hist:
+        # LangChain messages often have .type ('human'/'ai') and .content
         mtype = getattr(msg, "type", "")
         role  = "User" if mtype == "human" else "Assistant"
         content = getattr(msg, "content", "")
     prompt += f"User: {user_input}\nAssistant: "
     return prompt
 def chat_with_memory(user_input: str) -> str:
     """Main entrypoint called by app.py."""
     global stored_order, pending_intent