import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF model from the Hugging Face Hub
MODEL_REPO = "Futuresony/gemma2-2b-gguf-q4_k_m"
MODEL_FILENAME = "unsloth.Q4_K_M.gguf"  # Or check exact filename on the repo

model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)

# Load the model (2048-token context window, 4 CPU threads)
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4, verbose=True)

# Format the user message as an Alpaca-style prompt
def format_prompt(user_message):
    return f"""### Instruction:
{user_message}

### Response:"""

# Chat handler: generate a completion and append the turn to the history
def respond(user_message, chat_history):
    prompt = format_prompt(user_message)
    output = llm(prompt, max_tokens=300, stop=["###"])
    response = output["choices"][0]["text"].strip()
    chat_history.append((user_message, response))
    # Clear the textbox, refresh the chatbot, and persist the updated history
    return "", chat_history, chat_history

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 DStv AI Assistant - Powered by Gemma 2B GGUF")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask your question...")
    state = gr.State([])
    # State is listed in the outputs so the session history is explicitly
    # saved back, rather than relying on in-place mutation of the list
    msg.submit(respond, [msg, state], [msg, chatbot, state])

demo.launch()
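
# Optional: a streaming variant of the chat handler. This is a sketch that
# assumes llama-cpp-python's stream=True interface, which yields partial
# completion chunks as tokens are generated, and Gradio's support for
# generator event handlers (each yield updates the outputs). To try it,
# define this function above the gr.Blocks() section and register it with
# msg.submit(respond_stream, [msg, state], [msg, chatbot, state]).
def respond_stream(user_message, chat_history):
    prompt = format_prompt(user_message)
    # Start the turn with an empty response that fills in token by token
    chat_history.append((user_message, ""))
    for chunk in llm(prompt, max_tokens=300, stop=["###"], stream=True):
        token = chunk["choices"][0]["text"]
        chat_history[-1] = (user_message, chat_history[-1][1] + token)
        # Yielding lets Gradio refresh the chatbot after each token
        yield "", chat_history, chat_history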