2ba committed on
Commit 19ec252 · verified · 1 Parent(s): 3ea3833

Update app.py

Files changed (1)
  1. app.py +18 -12
app.py CHANGED
@@ -18,7 +18,7 @@ def load_llm():
     )
     llm = Llama(
         model_path=model_path,
-        n_ctx=512,
+        n_ctx=512,
         n_threads=max(2, os.cpu_count() or 2),
         n_gpu_layers=0,
         n_batch=32,
@@ -29,30 +29,36 @@ def load_llm():
 SYSTEM_PROMPT = "به فارسی، روشن و کوتاه پاسخ بده (حداکثر ۲ جمله)."
 
 def respond(message, history):
-
-    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    for user, assistant in history:
-        messages.append({"role": "user", "content": user})
-        messages.append({"role": "assistant", "content": assistant})
-    messages.append({"role": "user", "content": message})
+    # history can arrive as tuples or as messages; support both
+    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
+    if history and isinstance(history[0], dict) and "role" in history[0]:
+        msgs.extend(history)  # already messages format
+    else:
+        for user, assistant in (history or []):
+            msgs.append({"role": "user", "content": user})
+            msgs.append({"role": "assistant", "content": assistant})
+    msgs.append({"role": "user", "content": message})
 
     llm = load_llm()
+    print(">> gen start")  # for logging
     stream = llm.create_chat_completion(
-        messages=messages,
-        max_tokens=60,
+        messages=msgs,
+        max_tokens=64,
         temperature=0.4,
         top_p=0.9,
         stream=True,
     )
-
     partial = ""
     for chunk in stream:
         delta = chunk["choices"][0]["delta"].get("content", "")
-        partial += delta
-        yield partial
+        if delta:
+            partial += delta
+            yield partial
+    print(">> gen done")  # for logging
 
 demo = gr.ChatInterface(
     fn=respond,
+    type="messages",  # also clears the Gradio warning
     title="چت‌بات خیلی ساده (CPU رایگان)",
     description="SmolLM2-135M (GGUF) با llama.cpp روی CPU.",
 )
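
For context, here is a minimal, standalone sketch (not part of the commit) of the history normalization the updated respond() performs. It assumes Gradio may hand respond() either (user, assistant) tuple pairs (the old default) or messages-format dicts (what type="messages" produces); the helper name build_messages is hypothetical, since app.py does this inline:

SYSTEM_PROMPT = "به فارسی، روشن و کوتاه پاسخ بده (حداکثر ۲ جمله)."

def build_messages(message, history):
    # Start with the system prompt, then replay the chat history.
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history and isinstance(history[0], dict) and "role" in history[0]:
        msgs.extend(history)  # already in messages format
    else:
        for user, assistant in (history or []):  # legacy tuple pairs
            msgs.append({"role": "user", "content": user})
            msgs.append({"role": "assistant", "content": assistant})
    msgs.append({"role": "user", "content": message})
    return msgs

# Both history shapes produce the same chat-completion payload:
pairs = [("hi", "hello!")]
dicts = [{"role": "user", "content": "hi"},
         {"role": "assistant", "content": "hello!"}]
assert build_messages("how are you?", pairs) == build_messages("how are you?", dicts)

Either result can then be passed as messages= to llm.create_chat_completion(), as the diff above does with msgs.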