2ba committed
Commit 3ea3833 · verified · Parent: be990f4

Update app.py

Files changed (1): app.py (+20, -23)
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 
-# Very fast model (135M).
+# Very small model, for speed
 REPO_ID = os.getenv("GGUF_REPO_ID", "bartowski/SmolLM2-135M-Instruct-GGUF")
 FILENAME = os.getenv("GGUF_FILENAME", "SmolLM2-135M-Instruct-Q4_K_M.gguf")
 
@@ -16,49 +16,46 @@ def load_llm():
         local_dir=".",
         local_dir_use_symlinks=False,
     )
-
     llm = Llama(
         model_path=model_path,
-        n_ctx=256,
+        n_ctx=512,
         n_threads=max(2, os.cpu_count() or 2),
         n_gpu_layers=0,
-        n_batch=16,
-        verbose=True,
+        n_batch=32,
+        verbose=False,
     )
     return llm
 
-SYSTEM_PROMPT = "به فارسی، خیلی کوتاه و روشن جواب بده (حداکثر ۲ جمله)."
+SYSTEM_PROMPT = "به فارسی، روشن و کوتاه پاسخ بده (حداکثر ۲ جمله)."
 
-def build_prompt(message, history):
-    prompt = f"<s>[SYSTEM]\n{SYSTEM_PROMPT}\n[/SYSTEM]\n"
+def respond(message, history):
+
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for user, assistant in history:
-        prompt += f"[USER]\n{user}\n[/USER]\n[ASSISTANT]\n{assistant}\n[/ASSISTANT]\n"
-    prompt += f"[USER]\n{message}\n[/USER]\n[ASSISTANT]\n"
-    return prompt
+        messages.append({"role": "user", "content": user})
+        messages.append({"role": "assistant", "content": assistant})
+    messages.append({"role": "user", "content": message})
 
-def respond(message, history):
     llm = load_llm()
-    prompt = build_prompt(message, history)
-    stream = llm.create_completion(
-        prompt=prompt,
+    stream = llm.create_chat_completion(
+        messages=messages,
         max_tokens=60,
-        temperature=0.5,
+        temperature=0.4,
         top_p=0.9,
-        stop=["[/ASSISTANT]", "[USER]", "\n[USER]"],
         stream=True,
     )
+
     partial = ""
-    for out in stream:
-        token = out["choices"][0]["text"]
-        partial += token
+    for chunk in stream:
+        delta = chunk["choices"][0]["delta"].get("content", "")
+        partial += delta
         yield partial
 
 demo = gr.ChatInterface(
     fn=respond,
     title="چت‌بات خیلی ساده (CPU رایگان)",
-    description="SmolLM2-135M (GGUF) با llama.cpp روی CPU. نسخه‌ی مینیمال برای یادگیری.",
+    description="SmolLM2-135M (GGUF) با llama.cpp روی CPU.",
 )
 
 if __name__ == "__main__":
-
     demo.launch(ssr_mode=False)
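
For anyone reproducing the change outside the Space: the new code path streams OpenAI-style chat chunks from llama-cpp-python. Below is a minimal standalone sketch of that path, assuming the GGUF file has already been downloaded next to the script; the model path, prompt text, and sampling values are illustrative, not part of the commit.

from llama_cpp import Llama

# Illustrative local path: the Space obtains this file via hf_hub_download.
llm = Llama(
    model_path="SmolLM2-135M-Instruct-Q4_K_M.gguf",
    n_ctx=512,
    n_gpu_layers=0,  # CPU only, as in the Space
    verbose=False,
)

stream = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "Answer in one short sentence."},
        {"role": "user", "content": "What is GGUF?"},
    ],
    max_tokens=60,
    temperature=0.4,
    top_p=0.9,
    stream=True,
)

partial = ""
for chunk in stream:
    # Streamed chunks mirror OpenAI deltas: the first one usually carries
    # only {"role": "assistant"}, later ones {"content": "..."}; hence the
    # .get("content", "") in app.py instead of indexing "content" directly.
    partial += chunk["choices"][0]["delta"].get("content", "")
print(partial)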
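The move from create_completion to create_chat_completion also fixes a correctness issue: the old build_prompt invented [SYSTEM]/[USER]/[ASSISTANT] tags that SmolLM2 was never trained on, whereas the chat API formats the messages with the chat template shipped inside the GGUF. One way to inspect that template, assuming recent llama-cpp-python exposes GGUF metadata on the Llama instance (the metadata attribute and key name below are assumptions worth verifying against the installed version):

# Hedged sketch: "llm" is the Llama instance from the block above; the
# metadata dict and the standard GGUF key "tokenizer.chat_template" are
# assumed available in recent llama-cpp-python releases.
template = llm.metadata.get("tokenizer.chat_template")
print(template or "<no chat template embedded in this GGUF>")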