import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
import spaces
import time

# Load the model and tokenizer
model_name = "sarvamai/sarvam-m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
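# Note: device_map="auto" requires the `accelerate` package, and torch.bfloat16
# assumes a GPU with bf16 support; swap in torch.float16 on older GPUs.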

# Braille spinner frames cycled through while the model is "thinking"
indicators = [
    "Thinking ⠋", "Thinking ⠙", "Thinking ⠹", "Thinking ⠸", "Thinking ⠼",
    "Thinking ⠴", "Thinking ⠦", "Thinking ⠧", "Thinking ⠇", "Thinking ⠏",
]


@spaces.GPU  # request a GPU per call on Hugging Face ZeroGPU Spaces
def generate_response(prompt, chat_history):
    # Show the user's message immediately, then stream the assistant's reply
    chat_history.append({"role": "user", "content": prompt})
    yield chat_history, ""
    # Rebuild the history without previous thought-process messages; only
    # final assistant answers are sent back to the model
    processed_chat_history = []
    for message in chat_history:
        if message["role"] == "assistant":
            metadata = message.get("metadata", {})
            if isinstance(metadata, dict) and metadata.get("title", "").startswith("Thought"):
                continue  # skip collapsed "Thought for ... seconds" blocks
        processed_chat_history.append(message)

    text = tokenizer.apply_chat_template(
        processed_chat_history, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # TextIteratorStreamer yields decoded text chunks as generation progresses
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation on a background thread so tokens can be streamed as they
    # are produced
    generation_kwargs = dict(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=8192,
        streamer=streamer,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
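    # model.generate runs on the background thread and pushes decoded text into
    # the streamer's internal queue; iterating over the streamer below blocks
    # until the next chunk arrives and stops once generation finishes.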

    # Track the reasoning ("thinking") text and the final answer separately
    reasoning_content = ""
    content = ""
    reasoning_done = False
    start_time = time.time()
    chat_history.append(
        {"role": "assistant", "content": reasoning_content, "metadata": {"title": "Thinking..."}}
    )
    indicator_index = 0
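    # gr.Chatbot renders assistant messages whose metadata has a "title" as a
    # collapsible "thought" bubble, so updating the title in place on each
    # yield animates the spinner.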
    for new_text in streamer:
        if not reasoning_done and "</think>" in new_text:
            # Split the chunk at the closing tag: text before it belongs to
            # the reasoning, text after it starts the final answer
            before, _, after = new_text.partition("</think>")
            reasoning_content += before
            chat_history[-1]["content"] = reasoning_content
            thought_duration = time.time() - start_time
            chat_history[-1]["metadata"] = {"title": f"Thought for {thought_duration:.2f} seconds"}
            reasoning_done = True
            content += after
            chat_history.append({"role": "assistant", "content": content})
        elif not reasoning_done:
            # Advance the spinner frame and append the new reasoning text
            indicator_index = (indicator_index + 1) % len(indicators)
            chat_history[-1]["metadata"] = {"title": indicators[indicator_index]}
            reasoning_content += new_text
            chat_history[-1]["content"] = reasoning_content
        else:
            content += new_text
            chat_history[-1]["content"] = content
        yield chat_history, ""

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Sarvam M Demo")
    chatbot = gr.Chatbot(height=500, type="messages")
    msg = gr.Textbox(label="Your Message")
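
    # generate_response is a generator: each yield streams an updated history
    # into the chatbot, and the empty string it returns clears the textbox.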
    msg.submit(generate_response, [msg, chatbot], [chatbot, msg])

if __name__ == "__main__":
    # mcp_server=True also exposes the app's functions as MCP tools
    # (supported in recent Gradio releases)
    demo.launch(mcp_server=True)
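
# To run locally (a sketch, assuming these dependencies; the `spaces` package
# is only active on Hugging Face ZeroGPU hardware, and @spaces.GPU is a no-op
# elsewhere):
#   pip install gradio transformers torch accelerate spaces
#   python app.py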