Update app.py
app.py CHANGED
@@ -16,16 +16,19 @@ def load_llm():
         local_dir=".",
         local_dir_use_symlinks=False,
     )
+    # Speed changes: lower n_ctx and added n_batch
     llm = Llama(
         model_path=model_path,
-        n_ctx=2048,
+        n_ctx=1024,  # CHANGED: was 2048
         n_threads=max(2, os.cpu_count() or 2),
         n_gpu_layers=0,
+        n_batch=64,  # CHANGED: faster processing on CPU
         verbose=False,
     )
     return llm
 
-
+# Shorter, faster responses
+SYSTEM_PROMPT = "به فارسی، واضح و خیلی کوتاه جواب بده (حداکثر ۲ جمله)."
 
 def build_prompt(message, history):
     prompt = f"<s>[SYSTEM]\n{SYSTEM_PROMPT}\n[/SYSTEM]\n"
@@ -37,10 +40,11 @@ def build_prompt(message, history):
 def respond(message, history):
     llm = load_llm()
     prompt = build_prompt(message, history)
+    # Speed changes: max_tokens and temperature
     stream = llm.create_completion(
         prompt=prompt,
-        max_tokens=256,
-        temperature=0.
+        max_tokens=120,  # CHANGED: was 256
+        temperature=0.5,  # slightly cooler so answers stay compact
         top_p=0.9,
         stop=["[/ASSISTANT]", "[USER]", "\n[USER]"],
         stream=True,
@@ -49,13 +53,4 @@ def respond(message, history):
     for out in stream:
         token = out["choices"][0]["text"]
         partial += token
-        yield
-
-demo = gr.ChatInterface(
-    fn=respond,
-    title="چت‌بات خیلی ساده (CPU رایگان)",
-    description="TinyLlama 1.1B (GGUF) روی llama.cpp برای یادگیری راه‌اندازی LLM.",
-)
-
-if __name__ == "__main__":
-    demo.launch()
+        yield partial
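As a quick sanity check on the speed-related values above (n_ctx, n_batch, n_threads, max_tokens, temperature), here is a rough standalone timing sketch using llama-cpp-python. The GGUF filename and the test prompt are placeholders, not necessarily what load_llm() actually downloads.

import os
import time

from llama_cpp import Llama

# Minimal timing sketch for the CPU settings used in the diff.
# The model filename below is an assumption; point it at the GGUF file
# that load_llm() really fetches.
llm = Llama(
    model_path="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # placeholder filename
    n_ctx=1024,                              # smaller context window
    n_threads=max(2, os.cpu_count() or 2),
    n_gpu_layers=0,                          # CPU-only Space
    n_batch=64,                              # prompt evaluated in 64-token chunks
    verbose=False,
)

start = time.time()
out = llm.create_completion(
    prompt="<s>[SYSTEM]\nAnswer very briefly.\n[/SYSTEM]\n[USER]\nSalam!\n[/USER]\n[ASSISTANT]\n",
    max_tokens=120,
    temperature=0.5,
    top_p=0.9,
    stop=["[/ASSISTANT]", "[USER]"],
)
elapsed = time.time() - start
n_new = out["usage"]["completion_tokens"]
print(f"{n_new} tokens in {elapsed:.1f}s ({n_new / elapsed:.1f} tok/s)")

Timings vary with the Space's CPU; the point is only to compare runs before and after changing these parameters.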
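The diff shows only the first line of build_prompt. For readers following along, here is a minimal sketch of how the rest of the body plausibly folds the chat history into the same [SYSTEM]/[USER]/[ASSISTANT] tags implied by the stop strings; this is an assumed reconstruction (tuple-style Gradio history), not the file's actual code.

SYSTEM_PROMPT = "به فارسی، واضح و خیلی کوتاه جواب بده (حداکثر ۲ جمله)."  # as added above

def build_prompt(message, history):
    # Assumed body: only the first line below appears in the diff.
    prompt = f"<s>[SYSTEM]\n{SYSTEM_PROMPT}\n[/SYSTEM]\n"
    for user_msg, assistant_msg in history:  # assumes (user, assistant) tuples
        prompt += f"[USER]\n{user_msg}\n[/USER]\n"
        prompt += f"[ASSISTANT]\n{assistant_msg}\n[/ASSISTANT]\n"
    prompt += f"[USER]\n{message}\n[/USER]\n[ASSISTANT]\n"
    return prompt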