import json
import subprocess
from collections.abc import Iterator
from datetime import datetime
from pathlib import Path

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

from themes.research_monochrome import theme
today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002

SYS_PROMPT = f"""Today's Date: {today_date}.
You are Gemma, developed by Google. You are a helpful AI assistant."""
TITLE = "Gemma 3 1B Instruct IQ4_NL from a local GGUF server using the BPP library"
DESCRIPTION = """
<p>Gemma 3 1B Instruct is an open-weight LLM with a 32K context window. This demo is limited to a 2K context.
</p>
<p>The BPP library implements matrix multiplication with far fewer multiplications.
</p>
"""
LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.7
TOP_P = 0.85
TOP_K = 50
REPETITION_PENALTY = 1.05
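
# Default sampling settings; each can be overridden per request via the
# "Advanced Settings" accordion defined further down.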
# download GGUF into local directory
gguf_path = hf_hub_download(
repo_id="bartowski/google_gemma-3-1b-it-GGUF",
filename="google_gemma-3-1b-it-IQ4_NL.gguf",
local_dir="."
)
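
# hf_hub_download returns the local path of the downloaded file; it is passed to
# llama-server below so the server loads exactly this quantized checkpoint.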
# start llama-server
subprocess.run(["chmod", "+x", "llama-server"], check=True)
command = [
    "./llama-server",
    "-m", gguf_path,   # model downloaded above
    "-ngl", "0",       # no layers offloaded to GPU (CPU-only inference)
    "--temp", "0.0",   # server-side default; overridden per request below
    "-c", "2048",      # 2K context window
    "-t", "8",         # CPU threads
    "--port", "8081",
]
process = subprocess.Popen(command)
print(f"Llama-server process started with PID {process.pid}")
def generate(
    message: str,
    chat_history: list[dict],
    temperature: float = TEMPERATURE,
    repetition_penalty: float = REPETITION_PENALTY,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[str]:
"""Generate function for chat demo using Llama.cpp server."""
# Build messages
conversation = []
conversation.append({"role": "system", "content": SYS_PROMPT})
conversation += chat_history
conversation.append({"role": "user", "content": message})
# Prepare the prompt for the Llama.cpp server
prompt = ""
for item in conversation:
if item["role"] == "system":
prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
elif item["role"] == "user":
prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
elif item["role"] == "assistant":
prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
prompt += "<|model|>\n" # Add the beginning token for the assistant
# Construct the request payload
payload = {
"prompt": prompt,
"stream": True, # Enable streaming
"max_tokens": max_new_tokens,
"temperature": temperature,
"repeat_penalty": repetition_penalty,
"top_p": top_p,
"top_k": top_k,
"stop": ["<|file_separator|>"], #stops after it sees this
}
try:
# Make the request to the Llama.cpp server
with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
# Stream the response from the server
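            # Each streamed line is a server-sent event of the form
            #   data: {"content": "<next text chunk>", "stop": false, ...}
            # (the exact field set may vary across llama.cpp versions)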
outputs = []
for line in response.iter_lines():
if line:
                    # Decode the raw line and strip the SSE "data: " prefix if present
                    decoded_line = line.decode("utf-8")
                    if decoded_line.startswith("data: "):
                        decoded_line = decoded_line[len("data: "):]
                    try:
                        json_data = json.loads(decoded_line)
                        text = json_data.get("content", "")  # extract the generated text chunk
                        if text:
                            outputs.append(text)
                            yield "".join(outputs)
                    except json.JSONDecodeError:
                        # Skip lines that are not valid JSON and keep streaming
                        print(f"JSONDecodeError: {decoded_line}")
except requests.exceptions.RequestException as e:
print(f"Request failed: {e}")
yield f"Error: {e}" # Yield an error message to the user
except Exception as e:
print(f"An unexpected error occurred: {e}")
yield f"Error: {e}" # Yield error message
css_file_path = Path(__file__).parent / "app.css"
# advanced settings (displayed in Accordion)
temperature_slider = gr.Slider(
minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
)
top_p_slider = gr.Slider(
minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
)
top_k_slider = gr.Slider(
minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
)
repetition_penalty_slider = gr.Slider(
minimum=0,
maximum=2.0,
value=REPETITION_PENALTY,
step=0.05,
label="Repetition Penalty",
elem_classes=["gr_accordion_element"],
)
max_new_tokens_slider = gr.Slider(
minimum=1,
maximum=2000,
value=MAX_NEW_TOKENS,
step=1,
label="Max New Tokens",
elem_classes=["gr_accordion_element"],
)
chat_interface_accordion = gr.Accordion(label="Advanced Settings", open=False)
with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=theme, title=TITLE) as demo:
gr.HTML(f"<h2>{TITLE}</h2>", elem_classes=["gr_title"])
gr.HTML(DESCRIPTION)
chat_interface = gr.ChatInterface(
fn=generate,
examples=[
["Explain the concept of quantum computing to someone with no background in physics or computer science."],
["What is OpenShift?"],
["What's the importance of low latency inference?"],
["Help me boost productivity habits."],
],
example_labels=[
"Explain quantum computing",
"What is OpenShift?",
"Importance of low latency inference",
"Boosting productivity habits",
],
cache_examples=False,
type="messages",
additional_inputs=[
temperature_slider,
repetition_penalty_slider,
top_p_slider,
top_k_slider,
max_new_tokens_slider,
],
additional_inputs_accordion=chat_interface_accordion,
)
if __name__ == "__main__":
demo.queue().launch()