# ── SLM_CService.py ───────────────────────────────────────────────────────────
# Model load + FSM + conversational memory for your Gradio Space.

import os
import re

# Keep OpenMP quiet in Spaces logs
os.environ["OMP_NUM_THREADS"] = "1"
# Ensure we don't accidentally run offline
os.environ.pop("HF_HUB_OFFLINE", None)

# 1) Unsloth must be imported BEFORE transformers/peft to apply its optimizations.
#    (Otherwise you may see perf/memory warnings.)
#    Ref: Unsloth team warning in issues.
import unsloth  # noqa: E402  # must be imported before transformers/peft

import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from langchain.memory import ConversationBufferMemory

# ──────────────────────────────────────────────────────────────────────────────
# Your Hub repo that contains the tokenizer + PEFT adapter files
REPO = "ThomasBasil/bitext-qlora-tinyllama"
BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# If your files are nested, set this to the exact subfolder path (or use the
# HF_SUBFOLDER env var from Space → Settings → Variables).
# Example from your screenshot:
DEFAULT_SUBFOLDER = "bitext-qlora-tinyllama-20250807T224217Z-1-001/bitext-qlora-tinyllama"
SUBFOLDER = os.environ.get("HF_SUBFOLDER", DEFAULT_SUBFOLDER)

# 4-bit NF4 quantization config (QLoRA-style)
# Ref: Transformers bitsandbytes quantization docs.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


# ---- Robust helpers to load from root or subfolder ---------------------------
def _load_tokenizer(repo_id: str):
    """
    Try to load the tokenizer from the repo root; if it is missing, fall back to
    a "tokenizer" subdir and then to the configured nested subfolder.
    Transformers supports `subfolder` in from_pretrained for tokenizers.
    """
    # Try the repo root first
    try:
        tok = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
    except Exception:
        # Try a "tokenizer" subdir at the root
        try:
            tok = AutoTokenizer.from_pretrained(repo_id, subfolder="tokenizer", use_fast=False)
        except Exception:
            # Try the provided nested path
            tok = AutoTokenizer.from_pretrained(repo_id, subfolder=SUBFOLDER, use_fast=False)

    # Sensible defaults for a causal LM
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token_id = tok.eos_token_id
    tok.padding_side = "left"
    tok.truncation_side = "right"
    return tok


def _attach_adapter(base_model, repo_id: str):
    """
    Attach the PEFT adapter from the repo root; if not found, try subfolder variants.
    (PEFT supports kwargs like `subfolder`, though older versions had quirks;
    if you ever hit issues, place the adapter files at the repo root.)
    """
    # Try the repo root
    try:
        return PeftModel.from_pretrained(base_model, repo_id)
    except Exception:
        # Try an "adapter" subdir at the root
        try:
            return PeftModel.from_pretrained(base_model, repo_id, subfolder="adapter")
        except Exception:
            # Try the provided nested path
            return PeftModel.from_pretrained(base_model, repo_id, subfolder=SUBFOLDER)


# ---- Load tokenizer, base model (4-bit), and attach adapter ------------------
tokenizer = _load_tokenizer(REPO)

# FastLanguageModel.from_pretrained returns (model, tokenizer); we keep the
# tokenizer loaded above, so only the model is used here.
model, _ = unsloth.FastLanguageModel.from_pretrained(
    BASE,
    load_in_4bit=True,
    quantization_config=bnb_cfg,  # prefer quantization_config over legacy args
    device_map="auto",
    trust_remote_code=True,
)
model = _attach_adapter(model, REPO)
model.eval()
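# Optional startup log (a small sketch, safe to remove): print which adapter
# configuration actually attached, which helps confirm the subfolder fallback
# in _attach_adapter picked up the right files.
try:
    print("Loaded PEFT adapters:", list(model.peft_config.keys()))
except Exception:
    pass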
# Generation settings are passed directly at pipeline construction; the
# text-generation pipeline forwards them to model.generate().
# Ref: Transformers pipelines docs on generation kwargs.
chat_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    return_full_text=False,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
)

# ──────────────────────────────────────────────────────────────────────────────
# Conversational Memory (LangChain)
# memory_key must match the key read in _history_to_prompt below
# (ConversationBufferMemory defaults to "history").
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# ──────────────────────────────────────────────────────────────────────────────
# Simple FSM helpers
order_re = re.compile(r"#(\d{1,10})")


def extract_order(text: str):
    m = order_re.search(text)
    return m.group(1) if m else None


def handle_status(o):
    return f"Order #{o} is in transit and should arrive in 3–5 business days."


def handle_eta(o):
    return f"Delivery for order #{o} typically takes 3–5 days; you can track it at https://track.example.com/{o}"


def handle_track(o):
    return f"Track order #{o} here: https://track.example.com/{o}"


def handle_link(o):
    return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"


def handle_return_policy(_=None):
    return ("Our return policy allows returns of unused items in their original packaging within 30 days of receipt. "
            "Would you like me to connect you with a human agent?")


def handle_gratitude(_=None):
    return "You’re welcome! Is there anything else I can help with?"


def handle_escalation(_=None):
    return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"


stored_order = None
pending_intent = None


def _history_to_prompt(user_input: str) -> str:
    """Build a plain-text prompt that includes chat history for fallback generation."""
    hist = memory.load_memory_variables({}).get("chat_history", [])
    prompt = "You are a helpful support assistant.\n"
    for msg in hist:
        # LangChain messages have .type ('human'/'ai') and .content
        mtype = getattr(msg, "type", "")
        role = "User" if mtype == "human" else "Assistant"
        content = getattr(msg, "content", "")
        prompt += f"{role}: {content}\n"
    prompt += f"User: {user_input}\nAssistant: "
    return prompt


def chat_with_memory(user_input: str) -> str:
    """Main entrypoint called by app.py."""
    global stored_order, pending_intent

    ui = (user_input or "").strip()
    low = ui.lower()

    # A) quick intent short-circuits
    if any(tok in low for tok in ["thank you", "thanks", "thx"]):
        reply = handle_gratitude()
        memory.save_context({"input": ui}, {"output": reply})
        return reply
    if "return" in low:
        reply = handle_return_policy()
        memory.save_context({"input": ui}, {"output": reply})
        return reply

    # B) order number?
    new_o = extract_order(ui)
    if new_o:
        stored_order = new_o
        if pending_intent in ("status", "eta", "track", "link"):
            fn = {"status": handle_status, "eta": handle_eta,
                  "track": handle_track, "link": handle_link}[pending_intent]
            reply = fn(stored_order)
            pending_intent = None
            memory.save_context({"input": ui}, {"output": reply})
            return reply

    # C) intent classification
    if any(k in low for k in ["status", "where is my order", "check status"]):
        intent = "status"
    elif any(k in low for k in ["how long", "eta", "delivery time"]):
        intent = "eta"
    elif any(k in low for k in ["how can i track", "track my order", "where is my package"]):
        intent = "track"
    elif "tracking link" in low or "resend" in low:
        intent = "link"
    else:
        intent = "fallback"

    # D) handle core intents (ask for the order number first if needed)
    if intent in ("status", "eta", "track", "link"):
        if not stored_order:
            pending_intent = intent
            reply = "Sure—what’s your order number (e.g., #12345)?"
        else:
            fn = {"status": handle_status, "eta": handle_eta,
                  "track": handle_track, "link": handle_link}[intent]
            reply = fn(stored_order)
        memory.save_context({"input": ui}, {"output": reply})
        return reply

    # E) fallback → generate with chat-history context
    prompt = _history_to_prompt(ui)
    out = chat_pipe(prompt)[0]["generated_text"]
    reply = out.split("Assistant:")[-1].strip()
    memory.save_context({"input": ui}, {"output": reply})
    return reply
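

# ──────────────────────────────────────────────────────────────────────────────
# Minimal local smoke test (a sketch): the Space calls chat_with_memory() from
# app.py, so this only runs when the module is executed directly. The order
# number below is hypothetical and exists purely to exercise the FSM.
if __name__ == "__main__":
    for turn in [
        "Where is my order?",   # asks for the order number (pending intent)
        "It's #48213",          # hypothetical order number; resolves the intent
        "Thanks!",              # gratitude short-circuit
    ]:
        print(f"User: {turn}")
        print(f"Bot:  {chat_with_memory(turn)}")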