# babyLLM / app.py
import os
from functools import lru_cache
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Very small, fast model (135M); repo and filename can be overridden via environment variables.
REPO_ID = os.getenv("GGUF_REPO_ID", "bartowski/SmolLM2-135M-Instruct-GGUF")
FILENAME = os.getenv("GGUF_FILENAME", "SmolLM2-135M-Instruct-Q4_K_M.gguf")
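# Load the model lazily and only once: lru_cache returns the same Llama
# instance on every subsequent call.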
@lru_cache()
def load_llm():
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    llm = Llama(
        model_path=model_path,
        n_ctx=256,
        n_threads=max(2, os.cpu_count() or 2),
        n_gpu_layers=0,
        n_batch=16,
        verbose=True,
    )
    return llm
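# System prompt (Persian): "Answer in Persian, very briefly and clearly (at most 2 sentences)."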
SYSTEM_PROMPT = "به فارسی، خیلی کوتاه و روشن جواب بده (حداکثر ۲ جمله)."
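# Build a plain-text prompt from the chat history. `history` is Gradio's
# tuple-style list of (user, assistant) pairs. The [SYSTEM]/[USER]/[ASSISTANT]
# tags are an ad-hoc convention for this minimal demo, not the model's own
# chat template; llama-cpp-python's create_chat_completion() could apply the
# template embedded in the GGUF instead.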
def build_prompt(message, history):
    prompt = f"<s>[SYSTEM]\n{SYSTEM_PROMPT}\n[/SYSTEM]\n"
    for user, assistant in history:
        prompt += f"[USER]\n{user}\n[/USER]\n[ASSISTANT]\n{assistant}\n[/ASSISTANT]\n"
    prompt += f"[USER]\n{message}\n[/USER]\n[ASSISTANT]\n"
    return prompt
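# Streaming generator for gr.ChatInterface: yield the accumulated reply so the
# UI updates token by token.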
def respond(message, history):
    llm = load_llm()
    prompt = build_prompt(message, history)
    stream = llm.create_completion(
        prompt=prompt,
        max_tokens=60,
        temperature=0.5,
        top_p=0.9,
        stop=["[/ASSISTANT]", "[USER]", "\n[USER]"],
        stream=True,
    )
    partial = ""
    for out in stream:
        token = out["choices"][0]["text"]
        partial += token
        yield partial
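# Chat UI. Title (Persian): "Very simple chatbot (free CPU)"; description:
# "SmolLM2-135M (GGUF) with llama.cpp on CPU. Minimal version for learning."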
demo = gr.ChatInterface(
    fn=respond,
    title="چت‌بات خیلی ساده (CPU رایگان)",
    description="SmolLM2-135M (GGUF) با llama.cpp روی CPU. نسخه‌ی مینیمال برای یادگیری.",
)
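# ssr_mode=False turns off Gradio's server-side rendering.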
if __name__ == "__main__":
    demo.launch(ssr_mode=False)