import os
from functools import lru_cache

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = os.getenv("GGUF_REPO_ID", "bartowski/SmolLM2-135M-Instruct-GGUF")
FILENAME = os.getenv("GGUF_FILENAME", "SmolLM2-135M-Instruct-Q4_K_M.gguf")


@lru_cache()
def load_llm():
    # Download the GGUF file once and cache the loaded model for later calls.
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,  # deprecated/ignored on recent huggingface_hub versions
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=512,
        n_threads=max(2, os.cpu_count() or 2),
        n_gpu_layers=0,  # CPU only
        n_batch=32,
        verbose=False,
    )
    return llm


# Persian system prompt: "Answer in Persian, clearly and briefly (at most 2 sentences)."
SYSTEM_PROMPT = "به فارسی، روشن و کوتاه پاسخ بده (حداکثر ۲ جمله)."


def respond(message, history):
    # message may be a dict (messages format) or a plain str
    user_text = message.get("content", "") if isinstance(message, dict) else str(message or "")

    # history may arrive as (user, assistant) tuples or as role/content messages
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history and isinstance(history[0], dict) and "role" in history[0]:
        msgs.extend(history)
    else:
        for user, assistant in (history or []):
            msgs.append({"role": "user", "content": user})
            msgs.append({"role": "assistant", "content": assistant})
    msgs.append({"role": "user", "content": user_text})

    llm = load_llm()
    print(">> gen start (non-stream)")
    out = llm.create_chat_completion(
        messages=msgs,
        max_tokens=64,
        temperature=0.4,
        top_p=0.9,
        stream=False,
    )
    text = out["choices"][0]["message"]["content"]
    print(">> gen done (non-stream)")
    return text


demo = gr.ChatInterface(
    fn=respond,
    type="messages",  # passes history as role/content messages
    title="چت‌بات خیلی ساده (CPU رایگان)",  # "Very simple chatbot (free CPU)"
    description="SmolLM2-135M (GGUF) با llama.cpp روی CPU.",  # "SmolLM2-135M (GGUF) with llama.cpp on CPU."
)

if __name__ == "__main__":
    demo.launch(ssr_mode=False)
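
# --- Optional streaming variant (a minimal sketch, not part of the app above) ---
# Gradio's ChatInterface also accepts a generator function and re-renders each yielded
# partial string, and llama_cpp yields OpenAI-style chunks when stream=True. To try it,
# define a generator like the one below (shown commented out, since code placed after
# demo.launch() never runs) and pass fn=respond_stream instead of fn=respond; the name
# respond_stream and the simplified (history-free) prompt are assumptions for illustration.
#
# def respond_stream(message, history):
#     user_text = message.get("content", "") if isinstance(message, dict) else str(message or "")
#     msgs = [
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {"role": "user", "content": user_text},  # history handling omitted for brevity
#     ]
#     llm = load_llm()
#     partial = ""
#     for chunk in llm.create_chat_completion(
#         messages=msgs, max_tokens=64, temperature=0.4, top_p=0.9, stream=True
#     ):
#         delta = chunk["choices"][0]["delta"]
#         if "content" in delta:
#             partial += delta["content"]
#             yield partial  # ChatInterface displays the growing reply token by token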