import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Load your custom model and tokenizer
MODEL_NAME = "Qwen/Qwen2.5-1.5B" # Replace with your model's Hugging Face repo ID or local path
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Ensure the model is on the CPU
model.to("cpu")
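
# Optional safeguard: from_pretrained already returns the model in eval mode,
# but making it explicit guards against accidental training-mode inference.
model.eval()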
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the chat history as a list of role/content messages
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # Add the latest user message
    messages.append({"role": "user", "content": message})
    # Format the input as plain "role: content" lines, ending with an
    # "assistant:" cue so the model continues with its own reply
    input_text = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)
    input_text += "\nassistant:"
    # Generate a response
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")  # Keep inputs on CPU
    outputs = model.generate(
        **inputs,  # passes input_ids and attention_mask together
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
    )
    # Decode the full sequence (prompt + completion)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the final "assistant:" marker
    assistant_response = response.split("assistant:")[-1].strip()
    # Trim anything generated past a fabricated "user:" turn, so the reply
    # doesn't include a hallucinated continuation of the conversation
    if "user:" in assistant_response:
        assistant_response = assistant_response.split("user:")[0].strip()
    yield assistant_response
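
# Quick sanity check (a sketch with made-up inputs, not part of the app);
# uncomment to verify generation works without launching the UI:
# print(next(respond("Hello!", [], "You are a friendly Chatbot.", 64, 0.7, 0.95)))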
# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
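
# Note: respond() is a generator, which ChatInterface streams to the UI.
# Gradio 4.x enables the request queue by default; on older 3.x releases
# you may need demo.queue() before launch for generator responses to work.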
# Launch the app
if __name__ == "__main__":
    demo.launch()