BasilTh committed
Commit 93d3bfa · 1 Parent(s): 77b14f6

Deploy updated SLM customer-support chatbot

Files changed (2):
  1. SLM_CService.py +58 -87
  2. requirements.txt +1 -4
SLM_CService.py CHANGED
@@ -1,88 +1,26 @@
- # ─── SLM_CService.py ─────────────────────────────────────────────────────────
  # Launch-time model setup + FSM + conversational memory for the chatbot.

- import os, shutil, zipfile
- os.environ["OMP_NUM_THREADS"] = "1"      # quiet libgomp noise
- os.environ.pop("HF_HUB_OFFLINE", None)   # avoid accidental offline mode

- # 1) Unsloth must be imported before transformers
- import unsloth
- import torch

  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
  from peft import PeftModel
  from langchain.memory import ConversationBufferMemory
- import gdown
- import re

- # ── Persistent storage (HF Spaces -> Settings -> Persistent storage) ─────────
- # Docs: /data persists across Space restarts (see HF persistent-storage docs).
- PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data/slm_assets")
- ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapter")
- TOKENIZER_DIR = os.path.join(PERSIST_DIR, "tokenizer")
- ZIP_PATH = os.path.join(PERSIST_DIR, "assets.zip")
-
- # ── Provide Google Drive IDs as Secrets (HF Space -> Settings -> Variables) ──
- # Either one zip with both folders...
- GDRIVE_ZIP_ID = os.environ.get("GDRIVE_ZIP_ID")
- # ...or separate zips/files for each:
- GDRIVE_ADAPTER_ID = os.environ.get("GDRIVE_ADAPTER_ID")
- GDRIVE_TOKENIZER_ID = os.environ.get("GDRIVE_TOKENIZER_ID")
-
- def _ensure_dirs():
-     os.makedirs(PERSIST_DIR, exist_ok=True)
-     os.makedirs(ADAPTER_DIR, exist_ok=True)
-     os.makedirs(TOKENIZER_DIR, exist_ok=True)
-
- def _have_local_assets():
-     # minimal sanity checks for typical PEFT/tokenizer files
-     tok_ok = any(os.path.exists(os.path.join(TOKENIZER_DIR, f))
-                  for f in ("tokenizer.json", "tokenizer.model", "tokenizer_config.json"))
-     lora_ok = any(os.path.exists(os.path.join(ADAPTER_DIR, f))
-                   for f in ("adapter_config.json", "adapter_model.bin", "adapter_model.safetensors"))
-     return tok_ok and lora_ok
-
- def _download_from_drive():
-     """Download adapter/tokenizer from Google Drive into /data using gdown."""
-     _ensure_dirs()
-     if GDRIVE_ZIP_ID:
-         gdown.download(id=GDRIVE_ZIP_ID, output=ZIP_PATH, quiet=False)  # gdown is built for Drive
-         with zipfile.ZipFile(ZIP_PATH, "r") as zf:
-             zf.extractall(PERSIST_DIR)
-         return
-
-     if GDRIVE_ADAPTER_ID:
-         ad_zip = os.path.join(PERSIST_DIR, "adapter.zip")
-         gdown.download(id=GDRIVE_ADAPTER_ID, output=ad_zip, quiet=False)
-         try:
-             with zipfile.ZipFile(ad_zip, "r") as zf:
-                 zf.extractall(ADAPTER_DIR)
-         except zipfile.BadZipFile:
-             # not a zip – assume single file
-             shutil.move(ad_zip, os.path.join(ADAPTER_DIR, "adapter_model.bin"))
-
-     if GDRIVE_TOKENIZER_ID:
-         tk_zip = os.path.join(PERSIST_DIR, "tokenizer.zip")
-         gdown.download(id=GDRIVE_TOKENIZER_ID, output=tk_zip, quiet=False)
-         try:
-             with zipfile.ZipFile(tk_zip, "r") as zf:
-                 zf.extractall(TOKENIZER_DIR)
-         except zipfile.BadZipFile:
-             shutil.move(tk_zip, os.path.join(TOKENIZER_DIR, "tokenizer.json"))
-
- # ── Ensure local assets from Drive (first launch will download) ──────────────
- if not _have_local_assets():
-     _download_from_drive()  # persists in /data if you enabled it
-
- # ── Tokenizer (from your Drive-backed folder) ────────────────────────────────
- tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, use_fast=False)
- tokenizer.pad_token_id = tokenizer.eos_token_id
- tokenizer.padding_side = "left"
- tokenizer.truncation_side = "right"
-
- # ── Base model (4-bit) via Unsloth + your PEFT adapter ──────────────────────
  BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
@@ -90,33 +28,65 @@ bnb_cfg = BitsAndBytesConfig(
      bnb_4bit_compute_dtype=torch.bfloat16,
  )

  model = unsloth.FastLanguageModel.from_pretrained(
      BASE,
      load_in_4bit=True,
-     quantization_config=bnb_cfg,   # prefer quantization_config over quant_type
      device_map="auto",
      trust_remote_code=True,
  )

- model = PeftModel.from_pretrained(model, ADAPTER_DIR)

- # ── Text-generation pipeline (use generate_kwargs, not generation_kwargs) ────
- # Transformers pipelines accept `generate_kwargs` to forward to .generate().
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
-     generate_kwargs={"max_new_tokens": 128, "do_sample": True, "top_p": 0.9, "temperature": 0.7},
  )

- # ── Conversational memory (LangChain) ────────────────────────────────────────
- # ConversationBufferMemory stores full turn-by-turn chat history.
  memory = ConversationBufferMemory(return_messages=True)

- # ── FSM helpers (your original logic, kept intact) ───────────────────────────
  order_re = re.compile(r"#(\d{1,10})")
  def extract_order(text: str):
      m = order_re.search(text)
      return m.group(1) if m else None
@@ -131,17 +101,17 @@ def handle_return_policy(_=None):
  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
-     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

  stored_order = None
  pending_intent = None

  def _history_to_prompt(user_input: str) -> str:
-     """Build a prompt from LangChain memory turns for fallback generation."""
      hist = memory.load_memory_variables({}).get("chat_history", [])
      prompt = "You are a helpful support assistant.\n"
      for msg in hist:
-         # LangChain messages expose a .type like 'human'/'ai' in many versions
          mtype = getattr(msg, "type", "")
          role = "User" if mtype == "human" else "Assistant"
          content = getattr(msg, "content", "")
@@ -150,9 +120,10 @@ def _history_to_prompt(user_input: str) -> str:
      return prompt

  def chat_with_memory(user_input: str) -> str:
      global stored_order, pending_intent

-     ui = user_input.strip()
      low = ui.lower()

      # A) quick intent short-circuits
 
+ # ── SLM_CService.py ───────────────────────────────────────────────────────────
  # Launch-time model setup + FSM + conversational memory for the chatbot.

+ import os
+ import re
+ os.environ["OMP_NUM_THREADS"] = "1"      # quiet libgomp noise in Spaces
+ os.environ.pop("HF_HUB_OFFLINE", None)   # ensure online Hub access if set

+ # 1) Unsloth must be imported BEFORE transformers/peft for proper patching.
+ # (If not, you may see perf/memory warnings from Unsloth.)
+ import unsloth   # keep this import at the top

+ import torch
  from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
  from peft import PeftModel
  from langchain.memory import ConversationBufferMemory

+ # ──────────────────────────────────────────────────────────────────────────────
+ # Hub repo that contains your tokenizer + adapter files
+ REPO = "ThomasBasil/bitext-qlora-tinyllama"
  BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

+ # 4-bit NF4 quantization config (QLoRA-style)
  bnb_cfg = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16,
  )

+ # ---- Tokenizer ----
+ def _load_tokenizer(repo_id: str):
+     # Try at repo root first; fall back to subfolder("tokenizer") if needed.
+     try:
+         tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
+     except Exception:
+         tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
+     # sensible defaults for causal LM
+     if tok.pad_token_id is None and tok.eos_token_id is not None:
+         tok.pad_token_id = tok.eos_token_id
+     tok.padding_side = "left"
+     tok.truncation_side = "right"
+     return tok
+
+ tokenizer = _load_tokenizer(REPO)
+
+ # ---- Base model (Unsloth) ----
  model = unsloth.FastLanguageModel.from_pretrained(
      BASE,
      load_in_4bit=True,
+     quantization_config=bnb_cfg,   # prefer quantization_config over legacy args
      device_map="auto",
      trust_remote_code=True,
  )

+ # ---- Apply your LoRA adapter from the same repo ----
+ def _attach_adapter(base_model, repo_id: str):
+     # Try repo root; if the adapter lives under adapter/, use subfolder.
+     try:
+         return PeftModel.from_pretrained(base_model, repo_id)
+     except Exception:
+         return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
+
+ model = _attach_adapter(model, REPO)
+ model.eval()

+ # ---- Text-generation pipeline (note: generate_kwargs, not generation_kwargs) ----
  chat_pipe = pipeline(
      "text-generation",
      model=model,
      tokenizer=tokenizer,
      trust_remote_code=True,
      return_full_text=False,
+     generate_kwargs={
+         "max_new_tokens": 128,
+         "do_sample": True,
+         "top_p": 0.9,
+         "temperature": 0.7,
+     },
  )

+ # ──────────────────────────────────────────────────────────────────────────────
+ # Conversational Memory (LangChain)
  memory = ConversationBufferMemory(return_messages=True)

+ # ──────────────────────────────────────────────────────────────────────────────
+ # Simple FSM helpers
  order_re = re.compile(r"#(\d{1,10})")
+
  def extract_order(text: str):
      m = order_re.search(text)
      return m.group(1) if m else None

  def handle_gratitude(_=None):
      return "You’re welcome! Is there anything else I can help with?"
  def handle_escalation(_=None):
+     return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

  stored_order = None
  pending_intent = None

  def _history_to_prompt(user_input: str) -> str:
+     """Build a plain-text prompt that includes chat history for fallback generation."""
      hist = memory.load_memory_variables({}).get("chat_history", [])
      prompt = "You are a helpful support assistant.\n"
      for msg in hist:
+         # LangChain messages often have .type ('human'/'ai') and .content fields.
          mtype = getattr(msg, "type", "")
          role = "User" if mtype == "human" else "Assistant"
          content = getattr(msg, "content", "")

      return prompt

  def chat_with_memory(user_input: str) -> str:
+     """Main entrypoint called by app.py."""
      global stored_order, pending_intent

+     ui = (user_input or "").strip()
      low = ui.lower()

      # A) quick intent short-circuits
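
chat_with_memory is documented above as the entrypoint called by app.py, which is not part of this commit. A minimal sketch of how it might be wired into the Space's UI, assuming a Gradio SDK Space (gradio, the respond helper, and the interface title below are illustrative assumptions, not code from this repo):

# app.py (hypothetical sketch, not part of this commit)
import gradio as gr                      # assumed available via the Space's Gradio SDK
from SLM_CService import chat_with_memory

def respond(message, history):
    # Conversation state lives in SLM_CService's ConversationBufferMemory,
    # so the Gradio-provided history is not re-fed into the model here.
    return chat_with_memory(message)

demo = gr.ChatInterface(fn=respond, title="SLM Customer-Support Chatbot")

if __name__ == "__main__":
    demo.launch()

With this wiring, each turn goes through the FSM short-circuits first, with chat_pipe plus the buffered history serving as the fallback generation path.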
requirements.txt CHANGED
@@ -3,10 +3,7 @@ transformers
  torch
  sentencepiece
  langchain
- bitsandbytes
  peft
- xformers
  unsloth
- unsloth_zoo
  huggingface_hub
- gdown

  torch
  sentencepiece
  langchain
  peft
+ bitsandbytes
  unsloth
  huggingface_hub
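
The trimmed requirements keep bitsandbytes for the 4-bit NF4 config and drop gdown (the Google Drive download path is gone) along with xformers and unsloth_zoo. A quick local sanity check, assuming the packages above are installed (an illustrative snippet, not part of this commit; it only verifies imports, import order, and CUDA availability, and does not load the model):

# sanity_check.py (hypothetical, not part of this commit)
import unsloth               # must come before transformers, as in SLM_CService.py
import transformers
import peft
import bitsandbytes as bnb
import langchain
import torch

print("transformers:", transformers.__version__)
print("peft:", peft.__version__)
print("bitsandbytes:", bnb.__version__)
print("langchain:", langchain.__version__)
print("CUDA available:", torch.cuda.is_available())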