import re

import gradio as gr
from llama_cpp import Llama

model = "ggml-org/gemma-3-1b-it-GGUF"
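
# Download (on first run) and load the GGUF weights from the Hugging Face Hub.
# use_mmap maps the file into memory instead of copying it up front; use_mlock
# pins the pages in RAM so the OS cannot swap them out mid-generation.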
llm = Llama.from_pretrained(
    repo_id=model,
    filename="gemma-3-1b-it-Q8_0.gguf",
    verbose=True,
    use_mmap=True,
    use_mlock=True,
    n_threads=4,
    n_threads_batch=4,
    n_ctx=8000,
)
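

# Streaming handler for gr.ChatInterface: rebuild the full conversation on
# every turn, then yield the reply incrementally as tokens arrive.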
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
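    # Assemble the chat transcript llama.cpp expects: an optional system
    # prompt followed by the (user, assistant) tuples Gradio keeps in `history`.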
    if system_message:
        messages = [{"role": "system", "content": system_message}]
    else:
        messages = []
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
response = ""
completion = llm.create_chat_completion(
messages,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p
)
    for chunk in completion:  # renamed from `message` to avoid shadowing the user input
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            # Display any <think>...</think> reasoning span in italics instead of raw tags.
            formatted_response = re.sub(
                r"<think>\s*(.*?)\s*</think>", r"*\1*", response, flags=re.DOTALL
            )
            yield formatted_response
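

# Chat UI: the additional inputs expose the system prompt and sampling
# parameters alongside the chat box.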
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="",
            label="System message",
        ),
        gr.Slider(minimum=200, maximum=100000, value=4000, step=100, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description=model,
)
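

# When run as a script, serve the app (Gradio defaults to http://127.0.0.1:7860).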
if __name__ == "__main__":
    demo.launch()