# ─── SLM_CService.py ─────────────────────────────────────────────────────────
import os
# Fix for libgomp warning in Spaces
os.environ.pop("OMP_NUM_THREADS", None)

# 1) Unsloth must come first
import unsloth
import triton
import torch

from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# 2) Simple in-memory convo buffer
#    we keep alternating (user, assistant) tuples
conversation_history = []

# 3) Model + adapter path in your repo (copied into the Space repo root)
MODEL_DIR = "bitext-qlora-tinyllama"

# 4) Load tokenizer from local dir
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_DIR,
    use_fast=False,
    local_files_only=True
)
# TinyLlama's tokenizer has no dedicated pad token, so reuse EOS; left padding
# is the usual choice for decoder-only generation
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
tokenizer.truncation_side = "right"

# 5) QLoRA + Unsloth load in 4-bit
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
# 5a) Base model. Unsloth's FastLanguageModel.from_pretrained returns a
#     (model, tokenizer) tuple, so unpack it; with load_in_4bit=True Unsloth
#     applies the same NF4 double-quant settings mirrored in bnb_cfg above.
model, _ = unsloth.FastLanguageModel.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    load_in_4bit=True,
    device_map="auto",
    trust_remote_code=True
)
# 5b) Attach your LoRA adapter
model = PeftModel.from_pretrained(model, MODEL_DIR, local_files_only=True)
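# MODEL_DIR is assumed to follow the standard PEFT adapter layout
# (adapter_config.json plus adapter_model.safetensors or adapter_model.bin);
# PeftModel.from_pretrained reads those files from the local directory.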

# 6) HF text-gen pipeline; generation defaults are passed directly as keyword
#    arguments so the pipeline stores them and applies them on every call
chat_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    return_full_text=False,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.7
)
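# Illustrative direct call (the FSM below does not invoke chat_pipe; this only
# shows how a generation would be requested, with the output format of the
# standard transformers text-generation pipeline):
#   out = chat_pipe("Customer: Where is my order?\nAgent:")
#   print(out[0]["generated_text"])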

# 7) FSM helpers (your existing code unmodified)
import re
order_re = re.compile(r"#(\d{1,10})")
def extract_order(text: str):
    m = order_re.search(text)
    return m.group(1) if m else None
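# Examples (illustrative): extract_order("Where is order #12345?") -> "12345"
#                          extract_order("no number mentioned")    -> None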

def handle_status(o): return f"Order #{o} is in transit and should arrive in 3–5 business days."
def handle_eta(o):    return f"Delivery for order #{o} typically takes 3–5 days; you can track it at https://track.example.com/{o}"
def handle_track(o):  return f"Track order #{o} here: https://track.example.com/{o}"
def handle_link(o):   return f"Here’s the latest tracking link for order #{o}: https://track.example.com/{o}"
def handle_return_policy(_=None):
    return ("Our return policy allows returns of unused items in their original packaging "
            "within 30 days of receipt. Would you like me to connect you with a human agent?")
def handle_gratitude(_=None):
    return "You’re welcome! Is there anything else I can help with?"
def handle_escalation(_=None):
    return "I’m sorry, I don’t have that information. Would you like me to connect you with a human agent?"

# 8) Core chat fn
stored_order   = None
pending_intent = None
def chat_with_memory(user_input: str) -> str:
    global stored_order, pending_intent

    # A) Save into history
    conversation_history.append(("User", user_input))

    # B) New order?
    new_o = extract_order(user_input)
    if new_o:
        stored_order = new_o
        if pending_intent in ("status","eta","track","link"):
            fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[pending_intent]
            reply = fn(stored_order)
            pending_intent = None
            conversation_history.append(("Assistant", reply))
            return reply

    ui = user_input.lower().strip()

    # C) Gratitude
    if any(tok in ui for tok in ["thank you","thanks","thx"]):
        reply = handle_gratitude()
        conversation_history.append(("Assistant", reply))
        return reply

    # D) Return policy
    if "return" in ui:
        reply = handle_return_policy()
        conversation_history.append(("Assistant", reply))
        return reply

    # E) Classify intent
    if any(k in ui for k in ["status","where is my order","check status"]):
        intent="status"
    elif any(k in ui for k in ["how long","eta","delivery time"]):
        intent="eta"
    elif any(k in ui for k in ["how can i track","track my order","where is my package"]):
        intent="track"
    elif "tracking link" in ui or "resend" in ui:
        intent="link"
    else:
        intent="fallback"

    # F) Fulfill or ask order #
    if intent in ("status","eta","track","link"):
        if not stored_order:
            pending_intent = intent
            reply = "Sureβ€”what’s your order number (e.g., #12345)?"
        else:
            fn = {"status":handle_status,"eta":handle_eta,"track":handle_track,"link":handle_link}[intent]
            reply = fn(stored_order)
    else:
        reply = handle_escalation()

    # G) Save & done
    conversation_history.append(("Assistant", reply))
    return reply
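
# 9) Minimal smoke test (added for illustration; not part of the original Space
#    flow). It exercises the two-turn memory: the first message sets a pending
#    "status" intent, the second supplies the order number that fulfills it.
if __name__ == "__main__":
    print(chat_with_memory("Where is my order?"))  # bot asks for an order number
    print(chat_with_memory("It's #12345"))         # pending status intent is fulfilled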