import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
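# Dependencies (assumed): torch, transformers and gradio,
# e.g. `pip install torch transformers gradio`.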
# ---------------- CONFIG ----------------
MODEL_NAME = "google/gemma-3-270m-it"  # instruction-tuned Gemma 3 model
SYSTEM_PROMPT_DEFAULT = (
    "You are a formal and polite AI assistant. "
    "Always respond appropriately depending on the selected explanation style."
)
MAX_NEW_TOKENS_DEFAULT = 256
TEMP_DEFAULT = 0.7
TOP_P_DEFAULT = 0.9
# ---------------- LOAD MODEL ----------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,  # safe for CPU
)
generator = pipeline(
    "text-generation",  # causal LM (not seq2seq)
    model=model,
    tokenizer=tokenizer,
    device=-1,  # force CPU
)
# ---------------- HELPERS ----------------
def format_prompt(chat_history, user_message, system_message, response_style):
    # Start with the system message
    prompt = system_message + "\n\n"
    # Add only user messages (optional: the last assistant reply could also be included)
    for turn in chat_history:
        if turn["role"] == "user":
            prompt += f"{turn['content']}\n"
    # Add the new user message
    prompt += f"{user_message}\n"
    # Append an instruction matching the selected explanation style
    if response_style == "No explanation":
        prompt += " Answer concisely with no explanation."
    elif response_style == "Short explanation":
        prompt += " Answer briefly with a one-sentence explanation."
    elif response_style == "Detailed explanation":
        prompt += " Answer in detail with reasoning and examples."
    return prompt
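# Illustrative example (hypothetical input): with an empty history,
# user_message="What is RAM?" and response_style="Short explanation",
# format_prompt() returns:
#   "<system message>\n\nWhat is RAM?\n Answer briefly with a one-sentence explanation."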
# ---------------- CHAT FUNCTION ----------------
def chat(user_message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
    chat_history = chat_history or []
    prompt = format_prompt(chat_history, user_message, system_message, response_style)
    output = generator(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )[0]["generated_text"]
    # For causal LMs, the generated text includes the prompt, so strip it off
    response = output[len(prompt):].strip()
    # Store the user and assistant turns in the messages format expected by gr.Chatbot
    chat_history.append({"role": "user", "content": user_message})
    chat_history.append({"role": "assistant", "content": response})
    return "", chat_history
# ---------------- UI ----------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
    gr.Markdown("# Gemma-3-270M Chat Assistant (CPU-safe)")
    chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True)
    with gr.Row():
        msg = gr.Textbox(label="Your Message", placeholder="Type here…", scale=6)
        send_btn = gr.Button("Send", variant="primary", scale=1)
        clear_btn = gr.Button("Clear Chat", scale=1)
    with gr.Accordion("Advanced Settings", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT_DEFAULT, lines=3)
        response_style = gr.Dropdown(
            ["No explanation", "Short explanation", "Detailed explanation"],
            value="Detailed explanation",
            label="Response Style",
        )
        temperature = gr.Slider(0.1, 1.5, value=TEMP_DEFAULT, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
        max_tokens = gr.Slider(32, 512, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")
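    # Event wiring: clicking Send or pressing Enter in the textbox calls chat().
    # chat() returns ("", updated_history), which clears the input box and
    # refreshes the chatbot display.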
    send_btn.click(
        chat,
        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
        [msg, chatbot],
    )
    msg.submit(
        chat,
        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
        [msg, chatbot],
    )
    # Reset the chatbot to an empty history
    clear_btn.click(lambda: [], None, chatbot, queue=False)
if __name__ == "__main__":
    demo.launch()