import re
import gradio as gr
from llama_cpp import Llama
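
# Minimal Gradio chat UI for a GGUF model served locally via llama.cpp;
# the quantized weights are fetched from the Hugging Face Hub on first run.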

model = "ggml-org/gemma-3-1b-it-GGUF"
llm = Llama.from_pretrained(
    repo_id=model,
    filename="gemma-3-1b-it-Q8_0.gguf",
    verbose=True,
    use_mmap=True,   # memory-map the weights instead of reading them eagerly
    use_mlock=True,  # pin the weights in RAM (may need a raised RLIMIT_MEMLOCK)
    n_threads=4,
    n_threads_batch=4,
    n_ctx=8000,      # context window; bounds prompt + completion tokens
)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Build the chat transcript and stream the model's reply.

    gr.ChatInterface treats this generator as a streaming handler:
    each yielded string replaces the assistant's message in the UI.
    """
    # Seed the transcript with the system prompt, if one was given.
    if system_message:
        messages = [{"role": "system", "content": system_message}]
    else:
        messages = []

    # Replay earlier turns as alternating user/assistant messages.
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": message})

    response = ""
    completion = llm.create_chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    )

    # Accumulate streamed deltas; `chunk` avoids shadowing the `message`
    # argument above.
    for chunk in completion:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            # Render any <think>...</think> block in italics instead of raw tags.
            formatted_response = re.sub(
                r"<think>\s*(.*?)\s*</think>", r"*\1*", response, flags=re.DOTALL
            )
            yield formatted_response
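
# Hypothetical smoke test: the generator can also be consumed directly,
# outside Gradio, e.g.
#   for partial in respond("Hello", [], "", 64, 0.6, 0.95):
#       print(partial, end="\r", flush=True)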


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="",
            label="System message",
        ),
        # Capped at 8000 to match n_ctx above; larger values could not be honored.
        gr.Slider(minimum=200, maximum=8000, value=4000, step=100, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description=model,
)


if __name__ == "__main__":
    # Gradio serves on http://127.0.0.1:7860 by default.
    demo.launch()