import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load your custom model and tokenizer
MODEL_NAME = "Qwen/Qwen2.5-1.5B"  # Replace with your model's Hugging Face repo ID or local path
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Ensure the model is on the CPU and in eval (inference) mode
model.to("cpu")
model.eval()
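# If memory is tight, the weights could be loaded in a lighter dtype instead
# (a sketch; assumes your CPU and torch build handle bfloat16 well):
#   model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)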

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Prepare the chat history
    messages = [{"role": "system", "content": system_message}]

    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the latest user message
    messages.append({"role": "user", "content": message})

    # Format the input for the model
    input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
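    # NOTE: this plain "role: content" join is a simple ad-hoc prompt format.
    # If the tokenizer ships a chat template (an assumption for this
    # checkpoint), the more standard formatting would be:
    #
    #   input_text = tokenizer.apply_chat_template(
    #       messages, tokenize=False, add_generation_prompt=True
    #   )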

    # Generate a response; torch.no_grad() avoids tracking gradients at inference,
    # and passing the attention mask and a pad token id silences generate() warnings
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # Move inputs to CPU
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens; slicing off the prompt is more
    # reliable than splitting the full decoded string, which breaks once the
    # history itself contains "assistant:" lines
    generated_tokens = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # The model may still echo a role prefix or start a new "user:" turn;
    # strip those so only the assistant's reply remains
    assistant_response = response.split("assistant:")[-1].strip()
    if "user:" in assistant_response:
        assistant_response = assistant_response.split("user:")[0].strip()

    yield assistant_response
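
# NOTE: respond() is a generator but currently yields only the finished reply.
# Token-by-token streaming is possible with transformers' TextIteratorStreamer
# (a sketch under that assumption, not wired into respond() above):
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#
#   streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#   thread = Thread(target=model.generate,
#                   kwargs=dict(inputs, streamer=streamer, max_new_tokens=max_tokens))
#   thread.start()
#   partial = ""
#   for chunk in streamer:
#       partial += chunk
#       yield partial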


# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# Launch the app
if __name__ == "__main__":
    demo.launch()
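# Gradio serves the UI on http://localhost:7860 by default; pass
# server_port=... to launch() to change it.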