import os
from functools import lru_cache

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# GGUF checkpoint on the Hugging Face Hub; both values can be overridden via env vars.
REPO_ID = os.getenv("GGUF_REPO_ID", "bartowski/SmolLM2-135M-Instruct-GGUF")
FILENAME = os.getenv("GGUF_FILENAME", "SmolLM2-135M-Instruct-Q4_K_M.gguf")


@lru_cache()
def load_llm():
    """Download the GGUF file once and cache a single Llama instance."""
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=512,
        n_threads=max(2, os.cpu_count() or 2),
        n_gpu_layers=0,  # CPU-only inference
        n_batch=32,
        verbose=False,
    )
    return llm


# System prompt (Persian): "Answer in Persian, clearly and briefly (at most 2 sentences)."
SYSTEM_PROMPT = "به فارسی، روشن و کوتاه پاسخ بده (حداکثر ۲ جمله)."


def respond(message, history):
    # gr.ChatInterface may pass either a plain string or a message dict.
    user_text = message.get("content", "") if isinstance(message, dict) else str(message or "")

    # Minimal prompt: system instruction plus the latest user turn only (history is not replayed).
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ]

    llm = load_llm()
    print(">> gen start (stream minimal)")
    partial = ""
    try:
        # Stream tokens and yield the growing reply so the UI updates incrementally.
        for chunk in llm.create_chat_completion(
            messages=msgs,
            max_tokens=48,
            temperature=0.3,
            top_p=0.95,
            stop=["<|im_end|>", "</s>"],
            stream=True,
        ):
            delta = chunk["choices"][0]["delta"].get("content", "")
            if delta:
                partial += delta
                yield partial
        print(">> gen done (stream minimal)")
    except Exception as e:
        print(">> exception:", repr(e))
        # Persian fallback message: "An internal error occurred; please try again."
        yield "یه خطای داخلی رخ داد؛ لطفاً دوباره تلاش کن."


demo = gr.ChatInterface(
    fn=respond,
    type="messages",
    # UI strings (Persian): title "A very simple chatbot (free CPU)";
    # description "SmolLM2-135M (GGUF) with llama.cpp on CPU."
    title="چتبات خیلی ساده (CPU رایگان)",
    description="SmolLM2-135M (GGUF) با llama.cpp روی CPU.",
)

if __name__ == "__main__":
    demo.launch(ssr_mode=False)