import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
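# Dependencies (assumed): torch, transformers and gradio,
# e.g. `pip install torch transformers gradio`.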
# ---------------- CONFIG ----------------
MODEL_NAME = "google/gemma-3-270m-it"  # instruction-tuned Gemma 3 model
SYSTEM_PROMPT_DEFAULT = (
    "You are a formal and polite AI assistant. "
    "Always respond appropriately depending on the selected explanation style."
)
MAX_NEW_TOKENS_DEFAULT = 256
TEMP_DEFAULT = 0.7
TOP_P_DEFAULT = 0.9
# ---------------- LOAD MODEL ----------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,  # safe for CPU
)
generator = pipeline(
    "text-generation",  # causal LM (not seq2seq)
    model=model,
    tokenizer=tokenizer,
    device=-1,  # force CPU
)
# ---------------- HELPERS ----------------
def format_prompt(chat_history, user_message, system_message, response_style):
    # Start with the system message
    prompt = system_message + "\n\n"
    # Add only user messages (optional: the last assistant reply could also be included)
    for turn in chat_history:
        if turn["role"] == "user":
            prompt += f"{turn['content']}\n"
    # Add the new user message
    prompt += f"{user_message}\n"
    # Append an instruction matching the selected explanation style
    if response_style == "No explanation":
        prompt += " Answer concisely with no explanation."
    elif response_style == "Short explanation":
        prompt += " Answer briefly with a one-sentence explanation."
    elif response_style == "Detailed explanation":
        prompt += " Answer in detail with reasoning and examples."
    return prompt
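# Illustrative example (hypothetical input): with an empty history,
# user_message="What is RAM?" and response_style="Short explanation",
# format_prompt() returns:
#   "<system message>\n\nWhat is RAM?\n Answer briefly with a one-sentence explanation."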
# ---------------- CHAT FUNCTION ----------------
def chat(user_message, chat_history, system_message, max_tokens, temperature, top_p, response_style):
    chat_history = chat_history or []
    prompt = format_prompt(chat_history, user_message, system_message, response_style)
    output = generator(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )[0]["generated_text"]
    # For causal LMs, the generated text includes the prompt, so strip it off
    response = output[len(prompt):].strip()
    # Store the user and assistant turns in the messages format expected by gr.Chatbot
    chat_history.append({"role": "user", "content": user_message})
    chat_history.append({"role": "assistant", "content": response})
    return "", chat_history
# ---------------- UI ----------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="pink")) as demo:
    gr.Markdown("# Gemma-3-270M Chat Assistant (CPU-safe)")
    chatbot = gr.Chatbot(type="messages", height=500, show_copy_button=True)
    with gr.Row():
        msg = gr.Textbox(label="Your Message", placeholder="Type here…", scale=6)
        send_btn = gr.Button("Send", variant="primary", scale=1)
        clear_btn = gr.Button("Clear Chat", scale=1)
    with gr.Accordion("Advanced Settings", open=False):
        system_prompt = gr.Textbox(label="System Prompt", value=SYSTEM_PROMPT_DEFAULT, lines=3)
        response_style = gr.Dropdown(
            ["No explanation", "Short explanation", "Detailed explanation"],
            value="Detailed explanation",
            label="Response Style",
        )
        temperature = gr.Slider(0.1, 1.5, value=TEMP_DEFAULT, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=TOP_P_DEFAULT, step=0.05, label="Top-p")
        max_tokens = gr.Slider(32, 512, value=MAX_NEW_TOKENS_DEFAULT, step=16, label="Max new tokens")
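    # Event wiring: clicking Send or pressing Enter in the textbox calls chat().
    # chat() returns ("", updated_history), which clears the input box and
    # refreshes the chatbot display.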
    send_btn.click(
        chat,
        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
        [msg, chatbot],
    )
    msg.submit(
        chat,
        [msg, chatbot, system_prompt, max_tokens, temperature, top_p, response_style],
        [msg, chatbot],
    )
    # Reset the chatbot to an empty history
    clear_btn.click(lambda: [], None, chatbot, queue=False)
if __name__ == "__main__":
    demo.launch()