gpt-oss-20b-mutlilingual-reasoning

Runtime error

App Files Files Community

gpt-oss-20b-mutlilingual-reasoning / app.py

Tonic

attempts lora adapter and streaming

757241b 4 months ago

raw

history blame

5.35 kB

	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
	import torch
	from threading import Thread
	import gradio as gr
	import spaces
	import re
	from peft import PeftModel

	# Load the base model and tokenizer, then try to attach the LoRA adapter.
	# A failure to load the base model or tokenizer is logged and re-raised
	# (the app cannot serve without them); a failure to load only the adapter
	# is logged and the plain base model is used instead.
	try:
	    base_model = AutoModelForCausalLM.from_pretrained(
	        "openai/gpt-oss-20b",
	        torch_dtype="auto",
	        device_map="auto",
	        # Hub kernel repository providing the attention implementation.
	        # NOTE(review): the previous id
	        # "kernel-community/vllm-flash-attention3" does not match the
	        # repository name documented for gpt-oss; confirm on the Hub.
	        attn_implementation="kernels-community/vllm-flash-attn3",
	    )
	    tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

	    # Load the LoRA adapter on top of the base model.
	    try:
	        model = PeftModel.from_pretrained(base_model, "Tonic/gpt-oss-20b-multilingual-reasoner")
	        print("✅ LoRA model loaded successfully!")
	    except Exception as lora_error:
	        # Best-effort: keep the app usable with the base weights only.
	        print(f"⚠️ LoRA adapter failed to load: {lora_error}")
	        print("🔄 Falling back to base model...")
	        model = base_model

	except Exception as e:
	    print(f"❌ Error loading model: {e}")
	    # Bare raise re-raises the active exception with its traceback intact.
	    raise

	def format_messages(messages):
	    """Render chat messages as a single plain-text prompt.

	    Each message whose role is "system", "user" or "assistant" becomes one
	    "<Role>: <content>" line terminated by a newline; messages with any
	    other role are skipped. The prompt always ends with "Assistant: " so
	    the model continues as the assistant.

	    Args:
	        messages: iterable of dicts with "role" and "content" keys.

	    Returns:
	        The prompt string.
	    """
	    role_labels = {"system": "System", "user": "User", "assistant": "Assistant"}
	    pieces = []
	    for entry in messages:
	        speaker, text = entry["role"], entry["content"]
	        label = role_labels.get(speaker)
	        if label is not None:
	            pieces.append(f"{label}: {text}\n")
	    pieces.append("Assistant: ")
	    return "".join(pieces)

	def format_conversation_history(chat_history):
	    """Reduce chat history items to plain {"role", "content"} dicts.

	    String (or other non-list) content is passed through unchanged. List
	    content is collapsed to the "text" value of its first element when that
	    element is a dict carrying a "text" key; any other list (empty, or
	    whose first element is not such a dict) is replaced by ``str(content)``.
	    Keys other than "role" and "content" are dropped.

	    Args:
	        chat_history: iterable of dicts with "role" and "content" keys.

	    Returns:
	        A new list of {"role": ..., "content": ...} dicts.

	    Raises:
	        KeyError: if an item lacks "role" or "content".
	    """
	    messages = []
	    for item in chat_history:
	        role = item["role"]
	        content = item["content"]
	        if isinstance(content, list):
	            first_part = content[0] if content else None
	            # Only index by "text" when the first part really is a mapping;
	            # a plain string containing "text" would pass a bare ``in``
	            # test and then fail on the subscript.
	            if isinstance(first_part, dict) and "text" in first_part:
	                content = first_part["text"]
	            else:
	                content = str(content)
	        messages.append({"role": role, "content": content})
	    return messages

	def _split_at_final_marker(text):
	    """Split *text* around the first "assistantfinal" marker, ignoring case.

	    Returns a ``(before, after)`` tuple, or ``None`` when the marker is absent.
	    """
	    found = re.search(r"assistantfinal", text, flags=re.IGNORECASE)
	    if found is None:
	        return None
	    return text[:found.start()], text[found.end():]


	@spaces.GPU(duration=60)
	def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
	    """Stream a chat reply, separating the model's reasoning from its answer.

	    Builds a prompt from the optional system prompt, the prior history and
	    the new user message, runs ``model.generate`` on a background thread,
	    and consumes the text streamer as chunks arrive.

	    Args:
	        input_data: the new user message.
	        chat_history: prior messages as dicts with "role" and "content".
	        max_new_tokens: generation length limit.
	        system_prompt: optional system message; omitted when empty.
	        temperature, top_p, top_k, repetition_penalty: sampling settings
	            forwarded to ``model.generate``.

	    Yields:
	        After every streamed chunk, the complete message so far (not a
	        delta): a ``<details>`` block holding the reasoning text followed
	        by the final answer text.
	    """
	    new_message = {"role": "user", "content": input_data}
	    system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
	    processed_history = format_conversation_history(chat_history)
	    messages = system_message + processed_history + [new_message]

	    # Format the prompt
	    prompt = format_messages(messages)

	    # The streamer yields decoded text pieces, excluding the prompt.
	    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

	    generation_kwargs = {
	        "max_new_tokens": max_new_tokens,
	        "do_sample": True,
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": top_k,
	        "repetition_penalty": repetition_penalty,
	        "pad_token_id": tokenizer.eos_token_id,
	        "streamer": streamer,
	        "use_cache": True,
	    }

	    # Tokenize input
	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

	    # Run generation on a worker thread so this generator can consume the
	    # streamer concurrently. The tokenized inputs and the generation
	    # settings are merged into one keyword mapping; a set literal here
	    # would raise TypeError because dicts are unhashable.
	    thread = Thread(target=model.generate, kwargs={**inputs, **generation_kwargs})
	    thread.start()

	    thinking = ""
	    final = ""
	    started_final = False

	    for chunk in streamer:
	        if started_final:
	            final += chunk
	        else:
	            # Search the accumulated text rather than the single chunk so a
	            # marker split across two chunks is still detected.
	            # NOTE(review): "assistantfinal" is presumably what remains of
	            # the model's channel markers once special tokens are
	            # stripped - confirm against real output.
	            thinking += chunk
	            parts = _split_at_final_marker(thinking)
	            if parts is not None:
	                thinking, final = parts
	                started_final = True

	        clean_thinking = re.sub(r'^analysis\s*', '', thinking).strip()
	        clean_final = final.strip()
	        formatted = f"<details open><summary>Click to view Thinking Process</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
	        yield formatted

	    # The streamer is exhausted only once generation has finished, so this
	    # returns promptly and avoids leaving the worker thread unjoined.
	    thread.join()

	# Chat UI wired to generate_response. The additional inputs are passed to
	# the function after (message, history) in the order listed here, which
	# matches its signature: max_new_tokens, system_prompt, temperature,
	# top_p, top_k, repetition_penalty.
	demo = gr.ChatInterface(
	    fn=generate_response,
	    additional_inputs=[
	        gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
	        gr.Textbox(
	            label="System Prompt",
	            value="You are a helpful assistant. Reasoning: medium",
	            lines=4,
	            placeholder="Change system prompt",
	        ),
	        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
	        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
	        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
	        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0),
	    ],
	    # Text-only interface (multimodal=False), so each example message is a
	    # plain string rather than a {"text": ...} multimodal dict.
	    examples=[
	        ["Explain Newton laws clearly and concisely"],
	        ["Write a Python function to calculate the Fibonacci sequence"],
	        ["What are the benefits of open weight AI models"],
	    ],
	    cache_examples=False,
	    type="messages",
	    description="""
	# 🙋🏻‍♂️Welcome to 🌟Tonic's gpt-oss-20b Multilingual Reasoner Demo !
	Wait a couple of seconds initially. You can adjust reasoning level in the system prompt like "Reasoning: high".
	""",
	    fill_height=True,
	    textbox=gr.Textbox(
	        label="Query Input",
	        placeholder="Type your prompt",
	    ),
	    stop_btn="Stop Generation",
	    multimodal=False,
	    theme=gr.themes.Soft(),
	)

	if __name__ == "__main__":
	    # NOTE(review): share=True requests a public share link; confirm this
	    # is wanted in the environment where the app is hosted.
	    demo.launch(share=True)