import os
from functools import lru_cache

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = os.getenv("GGUF_REPO_ID", "bartowski/SmolLM2-135M-Instruct-GGUF")
FILENAME = os.getenv("GGUF_FILENAME", "SmolLM2-135M-Instruct-Q4_K_M.gguf")


@lru_cache()
def load_llm():
    # Download the GGUF file once and cache the loaded model for later calls.
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,  # deprecated/ignored on recent huggingface_hub versions
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=512,
        n_threads=max(2, os.cpu_count() or 2),
        n_gpu_layers=0,  # CPU only
        n_batch=32,
        verbose=False,
    )
    return llm


# Persian system prompt: "Answer in Persian, clearly and briefly (at most 2 sentences)."
SYSTEM_PROMPT = "به فارسی، روشن و کوتاه پاسخ بده (حداکثر ۲ جمله)."


def respond(message, history):
    # message may be a dict (messages format) or a plain str
    user_text = message.get("content", "") if isinstance(message, dict) else str(message or "")

    # history may arrive as (user, assistant) tuples or as role/content messages
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history and isinstance(history[0], dict) and "role" in history[0]:
        msgs.extend(history)
    else:
        for user, assistant in (history or []):
            msgs.append({"role": "user", "content": user})
            msgs.append({"role": "assistant", "content": assistant})
    msgs.append({"role": "user", "content": user_text})

    llm = load_llm()
    print(">> gen start (non-stream)")
    out = llm.create_chat_completion(
        messages=msgs,
        max_tokens=64,
        temperature=0.4,
        top_p=0.9,
        stream=False,
    )
    text = out["choices"][0]["message"]["content"]
    print(">> gen done (non-stream)")
    return text


demo = gr.ChatInterface(
    fn=respond,
    type="messages",  # passes history as role/content messages
    title="چت‌بات خیلی ساده (CPU رایگان)",  # "Very simple chatbot (free CPU)"
    description="SmolLM2-135M (GGUF) با llama.cpp روی CPU.",  # "SmolLM2-135M (GGUF) with llama.cpp on CPU."
)

if __name__ == "__main__":
    demo.launch(ssr_mode=False)
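
# --- Optional streaming variant (a minimal sketch, not part of the app above) ---
# Gradio's ChatInterface also accepts a generator function and re-renders each yielded
# partial string, and llama_cpp yields OpenAI-style chunks when stream=True. To try it,
# define a generator like the one below (shown commented out, since code placed after
# demo.launch() never runs) and pass fn=respond_stream instead of fn=respond; the name
# respond_stream and the simplified (history-free) prompt are assumptions for illustration.
#
# def respond_stream(message, history):
#     user_text = message.get("content", "") if isinstance(message, dict) else str(message or "")
#     msgs = [
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {"role": "user", "content": user_text},  # history handling omitted for brevity
#     ]
#     llm = load_llm()
#     partial = ""
#     for chunk in llm.create_chat_completion(
#         messages=msgs, max_tokens=64, temperature=0.4, top_p=0.9, stream=True
#     ):
#         delta = chunk["choices"][0]["delta"]
#         if "content" in delta:
#             partial += delta["content"]
#             yield partial  # ChatInterface displays the growing reply token by token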