# ─── SLM_CService.py ─────────────────────────────────────────────────────────
import os
import re

# Fix for libgomp warning in Spaces
os.environ.pop("OMP_NUM_THREADS", None)

# 1) Unsloth must come first
import unsloth
import triton
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# 2) Simple in-memory convo buffer:
# we keep alternating ("User", text) / ("Assistant", text) tuples
conversation_history = []

# 3) Model + adapter path in your repo (copied into the Space repo root)
MODEL_DIR = "bitext-qlora-tinyllama"

# 4) Load tokenizer from local dir
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_DIR,
    use_fast=False,
    local_files_only=True
)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.truncation_side = "right"

# 5) QLoRA 4-bit settings. Unsloth's load_in_4bit=True applies an equivalent
# NF4 double-quantization config internally, so bnb_cfg documents the setup
# rather than being passed to from_pretrained below.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# 5a) Base model. FastLanguageModel.from_pretrained returns a (model, tokenizer)
# tuple; we keep the tokenizer from step 4 and discard the second element.
# (quant_type is not a valid argument here; load_in_4bit=True is sufficient.)
model, _ = unsloth.FastLanguageModel.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True
)

# 5b) Attach your LoRA adapter
model = PeftModel.from_pretrained(model, MODEL_DIR, local_files_only=True)

# 6) HF text-gen pipeline. Generation settings are passed as direct keyword
# arguments; the pipeline constructor has no generation_kwargs parameter.
chat_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    return_full_text=False,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
)

# 7) FSM helpers (your existing rule-based code)
order_re = re.compile(r"#(\d{1,10})")

def extract_order(text: str):
    m = order_re.search(text)
    return m.group(1) if m else None

def handle_status(o):
    return f"Order #{o} is in transit and should arrive in 3–5 business days."

def handle_eta(o):
    return (f"Delivery for order #{o} typically takes 3–5 days; "
            f"you can track it at https://track.example.com/{o}")

def handle_track(o):
    return f"Track order #{o} here: https://track.example.com/{o}"

def handle_link(o):
    return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"

def handle_return_policy(_=None):
    return ("Our return policy allows returns of unused items in their original packaging "
            "within 30 days of receipt. Would you like me to connect you with a human agent?")

def handle_gratitude(_=None):
    return "You’re welcome! Is there anything else I can help with?"

def handle_escalation(_=None):
    return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"
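# 7b) Optional model-backed fallback: a minimal sketch, not wired in by default.
# Step F below routes unknown intents to handle_escalation(); if you would
# rather let the fine-tuned model answer them, something like this could work.
# Assumptions (not part of the original design): the name llm_fallback, the
# 6-turn history window, and that the tokenizer in MODEL_DIR carries a chat
# template (TinyLlama-Chat ships a Zephyr-style one, if saved with the adapter).
def llm_fallback() -> str:
    # chat_with_memory appends the current user message to conversation_history
    # before falling back, so the last few turns already include it.
    messages = [
        {"role": "user" if speaker == "User" else "assistant", "content": text}
        for speaker, text in conversation_history[-6:]
    ]
    # Render the turns with the model's chat template and generate a reply;
    # return_full_text=False means the pipeline returns only the new text.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return chat_pipe(prompt)[0]["generated_text"].strip()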
# 8) Core chat fn
stored_order = None
pending_intent = None

# Map each trackable intent to its handler once, instead of repeating the
# dict literal at every call site.
INTENT_HANDLERS = {
    "status": handle_status,
    "eta": handle_eta,
    "track": handle_track,
    "link": handle_link,
}

def chat_with_memory(user_input: str) -> str:
    global stored_order, pending_intent

    # A) Save into history
    conversation_history.append(("User", user_input))

    # B) New order number? If an intent is already pending, fulfill it now.
    new_o = extract_order(user_input)
    if new_o:
        stored_order = new_o
        if pending_intent in INTENT_HANDLERS:
            reply = INTENT_HANDLERS[pending_intent](stored_order)
            pending_intent = None
            conversation_history.append(("Assistant", reply))
            return reply

    ui = user_input.lower().strip()

    # C) Gratitude
    if any(tok in ui for tok in ["thank you", "thanks", "thx"]):
        reply = handle_gratitude()
        conversation_history.append(("Assistant", reply))
        return reply

    # D) Return policy
    if "return" in ui:
        reply = handle_return_policy()
        conversation_history.append(("Assistant", reply))
        return reply

    # E) Classify intent by keyword
    if any(k in ui for k in ["status", "where is my order", "check status"]):
        intent = "status"
    elif any(k in ui for k in ["how long", "eta", "delivery time"]):
        intent = "eta"
    elif any(k in ui for k in ["how can i track", "track my order", "where is my package"]):
        intent = "track"
    elif "tracking link" in ui or "resend" in ui:
        intent = "link"
    else:
        intent = "fallback"

    # F) Fulfill the intent, or ask for the order number first
    if intent in INTENT_HANDLERS:
        if not stored_order:
            pending_intent = intent
            reply = "Sure—what’s your order number (e.g., #12345)?"
        else:
            reply = INTENT_HANDLERS[intent](stored_order)
    else:
        reply = handle_escalation()  # or llm_fallback(); see the sketch in 7b

    # G) Save & done
    conversation_history.append(("Assistant", reply))
    return reply
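# 9) Quick smoke test when the file is run directly (illustrative only: the
# Space UI calls chat_with_memory itself, and these example turns are made up).
if __name__ == "__main__":
    for turn in [
        "Where is my order?",
        "It's #48211",
        "Can you resend the tracking link?",
        "Thanks!",
    ]:
        print(f"User: {turn}")
        print(f"Bot:  {chat_with_memory(turn)}")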