# Minimal Persian chatbot for a free CPU tier: a small GGUF model served with
# llama.cpp behind a Gradio chat UI.
import os
from functools import lru_cache

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama


# A 135M-parameter instruct model in 4-bit (Q4_K_M) GGUF quantization: small
# enough to download and run comfortably on CPU-only hardware.
REPO_ID = "bartowski/SmolLM2-135M-Instruct-GGUF"
FILENAME = "SmolLM2-135M-Instruct-Q4_K_M.gguf"


@lru_cache(maxsize=1)
def load_llm():
    """Download the GGUF file once and cache a single Llama instance."""
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        local_dir=".",
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=512,  # small context window keeps memory use low
        n_threads=max(2, os.cpu_count() or 2),
        n_gpu_layers=0,  # CPU-only: no layers offloaded to a GPU
        n_batch=32,
        verbose=False,
    )
    return llm
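

# Optional smoke test (assumption: run locally, outside the Gradio app):
#   llm = load_llm()
#   out = llm.create_completion(prompt="Hello", max_tokens=8)
#   print(out["choices"][0]["text"])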


# "Answer in Persian, clearly and very briefly (at most 2 sentences)."
SYSTEM_PROMPT = "به فارسی، واضح و خیلی کوتاه جواب بده (حداکثر ۲ جمله)."


def build_prompt(message, history):
    """Flatten the chat history (a list of (user, assistant) pairs) into one prompt.

    Note: these [USER]/[ASSISTANT] tags are a hand-rolled format, not the chat
    template SmolLM2 was trained with; they work for a simple demo, but see the
    sketch after this function for a template-aware alternative.
    """
    prompt = f"<s>[SYSTEM]\n{SYSTEM_PROMPT}\n[/SYSTEM]\n"
    for user, assistant in history:
        prompt += f"[USER]\n{user}\n[/USER]\n[ASSISTANT]\n{assistant}\n[/ASSISTANT]\n"
    prompt += f"[USER]\n{message}\n[/USER]\n[ASSISTANT]\n"
    return prompt
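

# A minimal alternative sketch (not wired into the app): llama-cpp-python can
# apply the chat template embedded in the GGUF via create_chat_completion,
# which usually matches the model's training format better than hand-rolled
# tags. The function name is hypothetical; everything else reuses this file.
def respond_with_builtin_template(message, history):
    llm = load_llm()
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    partial = ""
    for out in llm.create_chat_completion(messages=messages, max_tokens=80, stream=True):
        delta = out["choices"][0].get("delta", {})
        if delta.get("content"):
            partial += delta["content"]
            yield partial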


def respond(message, history):
    """Streaming Gradio handler: yields the reply as it grows, token by token."""
    llm = load_llm()
    prompt = build_prompt(message, history)

    stream = llm.create_completion(
        prompt=prompt,
        max_tokens=80,
        temperature=0.5,
        top_p=0.9,
        stop=["[/ASSISTANT]", "[USER]", "\n[USER]"],
        stream=True,
    )
    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial


demo = gr.ChatInterface(
    fn=respond,
    # Title: "A very simple chatbot (free CPU)"
    title="چت‌بات خیلی ساده (CPU رایگان)",
    # Description: "SmolLM2-135M-Instruct (GGUF) on llama.cpp, for learning how to deploy an LLM."
    description="SmolLM2-135M-Instruct (GGUF) روی llama.cpp برای یادگیری راه‌اندازی LLM.",
)


if __name__ == "__main__":
    demo.launch()