import json
import subprocess
from collections.abc import Iterator
from datetime import datetime
from pathlib import Path

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

from themes.research_monochrome import theme

today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002

SYS_PROMPT = f"""Today's Date: {today_date}. You are Gemma, developed by Google. You are a helpful AI assistant"""
TITLE = "Gemma 3 1B instruct IQ4_NL from a local GGUF server using the BPP library"
DESCRIPTION = """
Gemma 3 1B instruct is an open-weight LLM with a 32k-token context window. This demo restricts it to a 2k context.
The BPP library implements matrix multiplication with far fewer multiplications.
""" LLAMA_CPP_SERVER = "http://127.0.0.1:8081" MAX_NEW_TOKENS = 1024 TEMPERATURE = 0.7 TOP_P = 0.85 TOP_K = 50 REPETITION_PENALTY = 1.05 # download GGUF into local directory gguf_path = hf_hub_download( repo_id="bartowski/google_gemma-3-1b-it-GGUF", filename="google_gemma-3-1b-it-IQ4_NL.gguf", local_dir="." ) # start llama-server subprocess.run(["chmod", "+x", "llama-server"]) command = ["./llama-server", "-m", "google_gemma-3-1b-it-IQ4_NL.gguf", "-ngl", "0", "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"] process = subprocess.Popen(command) print(f"Llama-server process started with PID {process.pid}") def generate( message: str, chat_history: List[Dict], temperature: float = TEMPERATURE, repetition_penalty: float = REPETITION_PENALTY, top_p: float = TOP_P, top_k: float = TOP_K, max_new_tokens: int = MAX_NEW_TOKENS, ) -> Iterator[str]: """Generate function for chat demo using Llama.cpp server.""" # Build messages conversation = [] conversation.append({"role": "system", "content": SYS_PROMPT}) conversation += chat_history conversation.append({"role": "user", "content": message}) # Prepare the prompt for the Llama.cpp server prompt = "" for item in conversation: if item["role"] == "system": prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n" elif item["role"] == "user": prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n" elif item["role"] == "assistant": prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n" prompt += "<|model|>\n" # Add the beginning token for the assistant # Construct the request payload payload = { "prompt": prompt, "stream": True, # Enable streaming "max_tokens": max_new_tokens, "temperature": temperature, "repeat_penalty": repetition_penalty, "top_p": top_p, "top_k": top_k, "stop": ["<|file_separator|>"], #stops after it sees this } try: # Make the request to the Llama.cpp server with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response: response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) # Stream the response from the server outputs = [] for line in response.iter_lines(): if line: # Decode the line decoded_line = line.decode('utf-8') # Remove 'data: ' prefix if present if decoded_line.startswith("data: "): decoded_line = decoded_line[6:] # Handle potential JSON decoding errors try: json_data = json.loads(decoded_line) text = json_data.get("content", "") # Extract content field. crucial. if text: outputs.append(text) yield "".join(outputs) except json.JSONDecodeError: print(f"JSONDecodeError: {decoded_line}") # Handle the error, potentially skipping the line or logging it. 
def generate(
    message: str,
    chat_history: list[dict],
    temperature: float = TEMPERATURE,
    repetition_penalty: float = REPETITION_PENALTY,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[str]:
    """Generate function for the chat demo using the llama.cpp server."""
    # Build the message list: system prompt, prior turns, then the new user turn.
    conversation = [{"role": "system", "content": SYS_PROMPT}]
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

    # Render the conversation with Gemma's turn markers. Gemma has no dedicated
    # system role, so the system prompt is emitted as a user turn.
    prompt = ""
    for item in conversation:
        if item["role"] in ("system", "user"):
            prompt += f"<start_of_turn>user\n{item['content']}<end_of_turn>\n"
        elif item["role"] == "assistant":
            prompt += f"<start_of_turn>model\n{item['content']}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"  # open the assistant turn

    # Construct the request payload.
    payload = {
        "prompt": prompt,
        "stream": True,  # enable token streaming
        "max_tokens": max_new_tokens,
        "temperature": temperature,
        "repeat_penalty": repetition_penalty,
        "top_p": top_p,
        "top_k": top_k,
        "stop": ["<end_of_turn>"],  # stop generating at the end of the model turn
    }

    try:
        # Stream the completion from the llama.cpp server.
        with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
            response.raise_for_status()  # raise HTTPError for 4xx/5xx responses
            outputs = []
            for line in response.iter_lines():
                if not line:
                    continue
                decoded_line = line.decode("utf-8")
                # Server-sent events are prefixed with "data: ".
                if decoded_line.startswith("data: "):
                    decoded_line = decoded_line[6:]
                try:
                    json_data = json.loads(decoded_line)
                    text = json_data.get("content", "")  # extract the generated text chunk
                    if text:
                        outputs.append(text)
                        yield "".join(outputs)
                except json.JSONDecodeError:
                    print(f"JSONDecodeError: {decoded_line}")  # skip malformed lines
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        yield f"Error: {e}"  # surface the error to the user
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        yield f"Error: {e}"


css_file_path = Path(__file__).parent / "app.css"

# Advanced settings (displayed in an accordion).
temperature_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature",
    elem_classes=["gr_accordion_element"],
)
top_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P",
    elem_classes=["gr_accordion_element"],
)
top_k_slider = gr.Slider(
    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K",
    elem_classes=["gr_accordion_element"],
)
repetition_penalty_slider = gr.Slider(
    minimum=0, maximum=2.0, value=REPETITION_PENALTY, step=0.05, label="Repetition Penalty",
    elem_classes=["gr_accordion_element"],
)
max_new_tokens_slider = gr.Slider(
    minimum=1, maximum=2000, value=MAX_NEW_TOKENS, step=1, label="Max New Tokens",
    elem_classes=["gr_accordion_element"],
)
chat_interface_accordion = gr.Accordion(label="Advanced Settings", open=False)

with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=theme, title=TITLE) as demo:
    gr.HTML(f"