# ─── SLM_CService.py ─────────────────────────────────────────────────────────
# Launch-time model setup + FSM + conversational memory for the chatbot.
import os, shutil, zipfile
os.environ["OMP_NUM_THREADS"] = "1" # quiet libgomp noise
os.environ.pop("HF_HUB_OFFLINE", None) # avoid accidental offline mode
# 1) Unsloth must be imported before transformers
import unsloth
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
from langchain.memory import ConversationBufferMemory
import gdown
import re
# ── Persistent storage (HF Spaces -> Settings -> Persistent storage) ─────────
# /data persists across Space restarts when persistent storage is enabled.
PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data/slm_assets")
ADAPTER_DIR = os.path.join(PERSIST_DIR, "adapter")
TOKENIZER_DIR = os.path.join(PERSIST_DIR, "tokenizer")
ZIP_PATH = os.path.join(PERSIST_DIR, "assets.zip")
# ── Provide Google Drive IDs as Secrets (HF Space -> Settings -> Variables) ──
# Either one zip with both folders...
GDRIVE_ZIP_ID = os.environ.get("GDRIVE_ZIP_ID")
# ...or separate zips/files for each:
GDRIVE_ADAPTER_ID = os.environ.get("GDRIVE_ADAPTER_ID")
GDRIVE_TOKENIZER_ID = os.environ.get("GDRIVE_TOKENIZER_ID")
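# Hypothetical example of the Space secrets (placeholder IDs, not real ones):
#   GDRIVE_ZIP_ID       = "1AbCdEfGhIjKlMnOpQrStUvWx"      # file ID from the Drive share link
#   GDRIVE_ADAPTER_ID   = "<drive-file-id-of-adapter-zip>"
#   GDRIVE_TOKENIZER_ID = "<drive-file-id-of-tokenizer-zip>"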
def _ensure_dirs():
    os.makedirs(PERSIST_DIR, exist_ok=True)
    os.makedirs(ADAPTER_DIR, exist_ok=True)
    os.makedirs(TOKENIZER_DIR, exist_ok=True)
def _have_local_assets():
    # minimal sanity checks for typical PEFT/tokenizer files
    tok_ok = any(os.path.exists(os.path.join(TOKENIZER_DIR, f))
                 for f in ("tokenizer.json", "tokenizer.model", "tokenizer_config.json"))
    lora_ok = any(os.path.exists(os.path.join(ADAPTER_DIR, f))
                  for f in ("adapter_config.json", "adapter_model.bin", "adapter_model.safetensors"))
    return tok_ok and lora_ok
def _download_from_drive():
    """Download adapter/tokenizer from Google Drive into /data using gdown."""
    _ensure_dirs()
    if GDRIVE_ZIP_ID:
        gdown.download(id=GDRIVE_ZIP_ID, output=ZIP_PATH, quiet=False)  # gdown handles Drive file IDs
        with zipfile.ZipFile(ZIP_PATH, "r") as zf:
            zf.extractall(PERSIST_DIR)
        return
    if GDRIVE_ADAPTER_ID:
        ad_zip = os.path.join(PERSIST_DIR, "adapter.zip")
        gdown.download(id=GDRIVE_ADAPTER_ID, output=ad_zip, quiet=False)
        try:
            with zipfile.ZipFile(ad_zip, "r") as zf:
                zf.extractall(ADAPTER_DIR)
        except zipfile.BadZipFile:
            # not a zip - assume a single adapter weights file
            shutil.move(ad_zip, os.path.join(ADAPTER_DIR, "adapter_model.bin"))
    if GDRIVE_TOKENIZER_ID:
        tk_zip = os.path.join(PERSIST_DIR, "tokenizer.zip")
        gdown.download(id=GDRIVE_TOKENIZER_ID, output=tk_zip, quiet=False)
        try:
            with zipfile.ZipFile(tk_zip, "r") as zf:
                zf.extractall(TOKENIZER_DIR)
        except zipfile.BadZipFile:
            # not a zip - assume a single tokenizer.json file
            shutil.move(tk_zip, os.path.join(TOKENIZER_DIR, "tokenizer.json"))
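# Assumed on-disk layout after a successful download (the zip is expected to
# contain top-level "adapter/" and "tokenizer/" folders so the checks in
# _have_local_assets() pass):
#   /data/slm_assets/adapter/adapter_config.json, adapter_model.safetensors, ...
#   /data/slm_assets/tokenizer/tokenizer_config.json, tokenizer.model, ...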
# ── Ensure local assets from Drive (first launch will download) ──────────────
if not _have_local_assets():
    _download_from_drive()  # cached in /data across restarts if persistent storage is enabled
# ── Tokenizer (from your Drive-backed folder) ────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, use_fast=False)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.truncation_side = "right"
# ── Base model (4-bit) via Unsloth + your PEFT adapter ──────────────────────
BASE = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# For reference only: these are the intended 4-bit settings. Unsloth builds an
# equivalent NF4 double-quant config itself when load_in_4bit=True, so this
# config is not passed explicitly (a second quantization_config can conflict
# with the one Unsloth supplies internally).
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# FastLanguageModel.from_pretrained returns (model, tokenizer); keep only the
# model here and use the fine-tuned tokenizer loaded from TOKENIZER_DIR above.
model, _ = unsloth.FastLanguageModel.from_pretrained(
    BASE,
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, ADAPTER_DIR)
# ── Text-generation pipeline ─────────────────────────────────────────────────
# Generation settings passed as plain keyword arguments at construction become
# the pipeline's defaults and are forwarded to model.generate() on each call.
chat_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    return_full_text=False,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
)
# ── Conversational memory (LangChain) ────────────────────────────────────────
# ConversationBufferMemory stores the full turn-by-turn chat history; memory_key
# is set to "chat_history" so it matches the key read in _history_to_prompt below.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# ── FSM helpers (your original logic, kept intact) ───────────────────────────
order_re = re.compile(r"#(\d{1,10})")
def extract_order(text: str):
    m = order_re.search(text)
    return m.group(1) if m else None
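# e.g. extract_order("Where is order #12345?") -> "12345"; extract_order("hello") -> None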
def handle_status(o): return f"Order #{o} is in transit and should arrive in 3–5 business days."
def handle_eta(o): return f"Delivery for order #{o} typically takes 3–5 days; you can track it at https://track.example.com/{o}"
def handle_track(o): return f"Track order #{o} here: https://track.example.com/{o}"
def handle_link(o): return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"
def handle_return_policy(_=None):
    return ("Our return policy allows returns of unused items in their original packaging within 30 days of receipt. "
            "Would you like me to connect you with a human agent?")
def handle_gratitude(_=None):
    return "You’re welcome! Is there anything else I can help with?"
def handle_escalation(_=None):
    return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"
stored_order = None
pending_intent = None
def _history_to_prompt(user_input: str) -> str:
    """Build a prompt from LangChain memory turns for fallback generation."""
    hist = memory.load_memory_variables({}).get("chat_history", [])
    prompt = "You are a helpful support assistant.\n"
    for msg in hist:
        # LangChain messages expose a .type like 'human'/'ai' in many versions
        mtype = getattr(msg, "type", "")
        role = "User" if mtype == "human" else "Assistant"
        content = getattr(msg, "content", "")
        prompt += f"{role}: {content}\n"
    prompt += f"User: {user_input}\nAssistant: "
    return prompt
def chat_with_memory(user_input: str) -> str:
    global stored_order, pending_intent
    ui = user_input.strip()
    low = ui.lower()
    # A) quick intent short-circuits
    if any(tok in low for tok in ["thank you", "thanks", "thx"]):
        reply = handle_gratitude()
        memory.save_context({"input": ui}, {"output": reply})
        return reply
    if "return" in low:
        reply = handle_return_policy()
        memory.save_context({"input": ui}, {"output": reply})
        return reply
    # B) order number?
    new_o = extract_order(ui)
    if new_o:
        stored_order = new_o
        if pending_intent in ("status", "eta", "track", "link"):
            fn = {"status": handle_status, "eta": handle_eta, "track": handle_track, "link": handle_link}[pending_intent]
            reply = fn(stored_order)
            pending_intent = None
            memory.save_context({"input": ui}, {"output": reply})
            return reply
    # C) intent classification
    if any(k in low for k in ["status", "where is my order", "check status"]):
        intent = "status"
    elif any(k in low for k in ["how long", "eta", "delivery time"]):
        intent = "eta"
    elif any(k in low for k in ["how can i track", "track my order", "where is my package"]):
        intent = "track"
    elif "tracking link" in low or "resend" in low:
        intent = "link"
    else:
        intent = "fallback"
    # D) handle core intents (ask for order first if needed)
    if intent in ("status", "eta", "track", "link"):
        if not stored_order:
            pending_intent = intent
            reply = "Sure, what’s your order number (e.g., #12345)?"
        else:
            fn = {"status": handle_status, "eta": handle_eta, "track": handle_track, "link": handle_link}[intent]
            reply = fn(stored_order)
        memory.save_context({"input": ui}, {"output": reply})
        return reply
    # E) fallback → generate with chat history context
    prompt = _history_to_prompt(ui)
    out = chat_pipe(prompt)[0]["generated_text"]
    reply = out.split("Assistant:")[-1].strip()
    memory.save_context({"input": ui}, {"output": reply})
    return reply
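# ── Optional local smoke test ────────────────────────────────────────────────
# A minimal sketch for exercising the FSM + fallback path from a terminal; the
# Space's UI layer (e.g. a Gradio app) is assumed to import chat_with_memory
# rather than run this file directly.
if __name__ == "__main__":
    print("SLM customer-support chatbot (type 'quit' to exit)")
    while True:
        try:
            user_text = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            break
        if not user_text or user_text.lower() in {"quit", "exit"}:
            break
        print("Bot:", chat_with_memory(user_text))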