Spaces:

KingNish
/

Sarvam-M-Demo

Running on Zero

App Files Files Community

KingNish committed on Jun 8

Commit

b34ac00

verified ·

1 Parent(s): 6f2ede7

Update app.py

Browse files

Files changed (1) hide show

app.py +278 -54

app.py CHANGED Viewed

@@ -3,65 +3,289 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
 import torch
 from threading import Thread
 import spaces
-# Load the model and tokenizer
 model_name = "sarvamai/sarvam-m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
 @spaces.GPU
-def generate_response(prompt, chat_history):
-    messages = [{"role": "user", "content": prompt}]
-    text = tokenizer.apply_chat_template(messages, tokenize=False, enable_thinking=True)
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    # Use TextIteratorStreamer for streaming
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    # Conduct text generation with streaming
-    generation_kwargs = dict(
-        input_ids=model_inputs.input_ids,
-        max_new_tokens=8192,
-        do_sample=True,
-        temperature=0.7,
-        streamer=streamer,
-    )
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    # Initialize variables to capture reasoning content and main content
-    reasoning_content = ""
-    content = ""
-    for new_text in streamer:
-        if "</think>" in new_text:
-            parts = new_text.split("</think>")
-            reasoning_content = parts[0].rstrip("\n")
-            content = parts[-1].lstrip("\n").rstrip("</s>")
-            yield reasoning_content, content
-        else:
-            content += new_text
-            yield reasoning_content, content
-# Create the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Sarvam AI Chatbot")
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(label="Your Message")
-    def respond(message, chat_history):
-        chat_history.append((message, ""))
-        reasoning_content = ""
-        content = ""
-        for reasoning_part, content_part in generate_response(message, chat_history):
-            reasoning_content = reasoning_part
-            content = content_part
-            chat_history[-1] = (message, f"{reasoning_content}\n{content}" if reasoning_content else content)
-            yield chat_history, ""
-    msg.submit(respond, [msg, chatbot], [chatbot, msg])
 if __name__ == "__main__":
-    demo.launch()

 import torch
 from threading import Thread
 import spaces
+import time
+# For the advanced UI components
+import modelscope_studio.components.antd as antd
+import modelscope_studio.components.antdx as antdx
+import modelscope_studio.components.base as ms
+import modelscope_studio.components.pro as pro
+from modelscope_studio.components.pro.chatbot import (ChatbotBotConfig,
+                                                      ChatbotPromptsConfig,
+                                                      ChatbotUserConfig,
+                                                      ChatbotWelcomeConfig)
+# --- 1. Load the Hugging Face Model and Tokenizer ---
+# Runs at module import, so the tokenizer and model below are created once and shared by every handler in this file.
 model_name = "sarvamai/sarvam-m"
+print(f"Loading model: {model_name}...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Request float16 weights; the intent is lower memory use and faster inference on GPUs that support it.
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto" # Delegate device placement to the library (GPU when one is available)
+)
+print("Model loaded successfully.")
+# --- 2. Helper and Event Handler Functions ---
+def format_history_for_sarvam(history: list) -> list:
+    """
+    Flatten the pro.Chatbot history into the role/content dictionaries that
+    tokenizer.apply_chat_template consumes.
+    User entries are passed through with their content untouched. Assistant
+    entries contribute the content of their first "text" part (when content is
+    a list) or the content itself (when it is a plain string); assistant
+    entries that end up with no text are left out. Entries with any other role
+    are ignored. A falsy history yields an empty list.
+    """
+    converted = []
+    for entry in history or []:
+        speaker = entry.get("role")
+        payload = entry.get("content")
+        if speaker == "user":
+            converted.append({"role": "user", "content": payload})
+            continue
+        if speaker != "assistant":
+            continue
+        if isinstance(payload, list):
+            # Structured reply (tool/text parts): only the first text part is the answer.
+            reply = next(
+                (chunk.get("content", "") for chunk in payload if chunk.get("type") == "text"),
+                "",
+            )
+        elif isinstance(payload, str):
+            # Plain string reply, e.g. the error message written by the submit handler.
+            reply = payload
+        else:
+            reply = ""
+        if reply:
+            converted.append({"role": "assistant", "content": reply})
+    return converted
 @spaces.GPU
+def submit(sender_value: str, chatbot_value: list):
+    """
+    Handle a user submission: run the model and stream its reply into the UI.
+    This is a generator used as a Gradio event handler. Every yielded dict maps
+    the module-level components (sender, chatbot, clear_btn) to gr.update values.
+    Args:
+        sender_value: Text typed by the user. When falsy (retry passes None) no
+            user message is appended and the existing history is reused as is.
+        chatbot_value: The pro.Chatbot history. It is mutated in place: the user
+            message (if any) and a new assistant message are appended to it.
+    Yields:
+        First the loading state, then one chatbot update per streamed chunk,
+        then the final state. If anything raises, the assistant message is
+        replaced by an error string and the controls are re-enabled.
+    """
+    # Append the new user message to the chat history
+    if sender_value:
+        chatbot_value.append({
+            "role": "user",
+            "content": sender_value,
+        })
+    # Append a placeholder for the assistant's response, with a loading indicator
+    chatbot_value.append({
+        "role": "assistant",
+        "content": [],
+        "loading": True,
+        "status": "pending"
+    })
+    # Initial UI update: clear the input box and show loading state
+    yield {
+        sender: gr.update(value=None, loading=True),
+        clear_btn: gr.update(disabled=True),
+        chatbot: gr.update(value=chatbot_value)
+    }
+    try:
+        # --- Model Inference ---
+        # 1. Format the conversation history for the model (the empty placeholder is skipped by the formatter)
+        history_messages = format_history_for_sarvam(chatbot_value)
+        # 2. Apply the chat template with thinking enabled
+        prompt_text = tokenizer.apply_chat_template(
+            history_messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=True
+        )
+        model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
+        # 3. Set up the streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # 4. Start generation in a separate thread so this generator can consume the streamer
+        generation_kwargs = dict(
+            input_ids=model_inputs.input_ids,
+            max_new_tokens=8192,
+            do_sample=True,
+            temperature=0.7,
+            streamer=streamer,
+        )
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        # --- Stream and Parse the Response ---
+        start_time = time.time()
+        # Assistant message layout: [0] collapsible "tool" block for the reasoning, [1] the answer text
+        message_content = chatbot_value[-1]["content"]
+        message_content.append({
+            "copyable": False, "editable": False, "type": "tool", "content": "",
+            "options": {"title": "Thinking...", "status": "pending"}
+        })
+        message_content.append({"type": "text", "content": ""})
+        chatbot_value[-1]["loading"] = False
+        full_response = ""
+        thinking_done = False
+        for new_text in streamer:
+            full_response += new_text
+            # Re-split the whole text on every chunk so tags that arrive in pieces are still recognised
+            thinking_content, main_content, closed = _split_thinking(full_response)
+            message_content[0]["content"] = thinking_content.strip()
+            if closed and not thinking_done:
+                # First chunk that completes the reasoning: freeze the elapsed time in the block title
+                thinking_done = True
+                thought_cost_time = "{:.2f}".format(time.time() - start_time)
+                message_content[0]["options"]["title"] = f"End of Thought ({thought_cost_time}s)"
+                message_content[0]["options"]["status"] = "done"
+            message_content[1]["content"] = main_content.lstrip("\n")
+            yield {chatbot: gr.update(value=chatbot_value)}
+        if not thinking_done:
+            # The stream ended without a closing </think> tag
+            if "<think>" not in full_response:
+                # No reasoning markup at all: the text is a plain answer, so show it as the reply
+                # and drop the empty reasoning block instead of returning an empty response
+                message_content[1]["content"] = full_response.lstrip("\n")
+                del message_content[0]
+            else:
+                # Reasoning was cut off before it closed: keep it, but stop showing it as pending
+                thought_cost_time = "{:.2f}".format(time.time() - start_time)
+                message_content[0]["options"]["title"] = f"End of Thought ({thought_cost_time}s)"
+                message_content[0]["options"]["status"] = "done"
+        # Finalize the response state
+        chatbot_value[-1]["footer"] = "{:.2f}s".format(time.time() - start_time)
+        chatbot_value[-1]["status"] = "done"
+        yield {
+            clear_btn: gr.update(disabled=False),
+            sender: gr.update(loading=False),
+            chatbot: gr.update(value=chatbot_value),
+        }
+    except Exception as e:
+        # Top-level boundary of the handler: report the failure in the chat and unlock the controls
+        print(f"An error occurred: {e}")
+        chatbot_value[-1]["loading"] = False
+        chatbot_value[-1]["status"] = "done"
+        chatbot_value[-1]["content"] = f"Failed to respond due to an error: {e}"
+        yield {
+            clear_btn: gr.update(disabled=False),
+            sender: gr.update(loading=False),
+            chatbot: gr.update(value=chatbot_value),
+        }
+def _split_thinking(raw_text: str) -> tuple:
+    """
+    Split raw model output into (thinking, answer, closed).
+    Everything before the first "</think>" is reasoning and everything after it
+    is the answer; closed tells whether that closing tag has been seen yet. The
+    opening "<think>" tag is optional: when present, only the text after it is
+    kept as reasoning; when absent, the whole prefix counts as reasoning.
+    """
+    before, closing_tag, after = raw_text.partition("</think>")
+    _, opening_tag, inner = before.partition("<think>")
+    thinking = inner if opening_tag else before
+    return thinking, after, bool(closing_tag)
+def prompt_select(e: gr.EventData):
+    """Return an update whose value is the description of the selected welcome prompt."""
+    selected = e._data["payload"][0]
+    description = selected["value"]["description"]
+    return gr.update(value=description)
+def clear():
+    """Return an update that resets the output component's value to None."""
+    reset = gr.update(value=None)
+    return reset
+def retry(chatbot_value: list, e: gr.EventData):
+    """
+    Regenerate the assistant reply on which the user clicked "retry".
+    Args:
+        chatbot_value: The current pro.Chatbot history.
+        e: Event data; payload[0]["index"] is taken to be the position of the
+            retried assistant message in the history (confirm against the
+            pro.Chatbot retry event documentation).
+    Yields:
+        A loading-state update with the truncated history, then every update
+        produced by submit for the regenerated reply.
+    """
+    index = e._data["payload"][0]["index"]
+    # Drop the retried assistant message and everything after it, but keep the
+    # user message right before it: submit(None, ...) does not add a user turn,
+    # so cutting at index-1 would leave the model without the prompt to answer
+    # (and an empty history when the first reply is retried).
+    chatbot_value = chatbot_value[:index]
+    yield {
+        sender: gr.update(loading=True),
+        chatbot: gr.update(value=chatbot_value),
+        clear_btn: gr.update(disabled=True)
+    }
+    # Re-run submit with the truncated history
+    yield from submit(None, chatbot_value)
+def cancel(chatbot_value: list):
+    """
+    Handler for the sender's cancel event.
+    If the last message in the history is still marked "pending", it is flagged
+    as finished (loading off, status "done") with a footer saying the completion
+    was paused. The returned updates push the history back to the chatbot and
+    re-enable the sender and the clear button.
+    """
+    last_message = chatbot_value[-1] if chatbot_value else None
+    if last_message is not None and last_message.get("status") == "pending":
+        last_message.update(
+            loading=False,
+            status="done",
+            footer="Chat completion paused",
+        )
+    updates = {
+        chatbot: gr.update(value=chatbot_value),
+        sender: gr.update(loading=False),
+        clear_btn: gr.update(disabled=False)
+    }
+    return updates
+# --- 3. Build the Gradio UI ---
+with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="blue")) as demo, ms.Application(), antdx.XProvider():
+    with antd.Flex(vertical=True, gap="middle"):
+        chatbot = pro.Chatbot(  # chat display; its value is the message list the handlers read and return
+            height=650,
+            welcome_config=ChatbotWelcomeConfig(
+                variant="borderless",
+                icon="https://cdn-avatars.huggingface.co/v1/production/uploads/60270a7c32856987162c641a/umd13GCWVijwTDGZzw3q-.png",
+                title=f"Hello, I'm {model_name.split('/')[-1]}",
+                description="I can show you my thinking process. How can I help you today?",
+                prompts=ChatbotPromptsConfig(
+                    items=[  # prompt_select reads the "description" of whichever entry is selected
+                        {"label": "Explain a concept", "children": [{"description": "Explain what a Large Language Model is in simple terms."}]},
+                        {"label": "Help me write", "children": [{"description": "Write a short, futuristic story about AI companions."}]},
+                        {"label": "Creative Ideas", "children": [{"description": "Give me three creative names for a new coffee shop."}]},
+                        {"label": "Code generation", "children": [{"description": "Write a python function to find the factorial of a number."}]}
+                    ]
+                )
+            ),
+            user_config=ChatbotUserConfig(avatar="https://api.dicebear.com/7.x/miniavs/svg?seed=gradio"),
+            bot_config=ChatbotBotConfig(
+                header=model_name,
+                avatar="https://cdn-avatars.huggingface.co/v1/production/uploads/60270a7c32856987162c641a/umd13GCWVijwTDGZzw3q-.png",
+                actions=["copy", "retry"]  # the retry action is served by the chatbot.retry handler wired below
+            ),
+        )
+        with antdx.Sender() as sender:  # message input; its submit and cancel events are wired below
+            with ms.Slot("prefix"):
+                with antd.Button(value=None, color="default", variant="text") as clear_btn:  # clear-chat button in the sender's prefix slot
+                    with ms.Slot("icon"):
+                        antd.Icon("ClearOutlined")
+    # --- Wire up the components and event handlers (the handlers above refer to sender, chatbot and clear_btn by these names) ---
+    clear_btn.click(fn=clear, outputs=[chatbot])  # clear() returns value=None, which is applied to the chatbot
+    submit_event = sender.submit(  # handle kept so the cancel wiring can reference it
+        fn=submit,
+        inputs=[sender, chatbot],
+        outputs=[sender, chatbot, clear_btn]
+    )
+    sender.cancel(
+        fn=cancel,
+        inputs=[chatbot],
+        outputs=[chatbot, sender, clear_btn],
+        cancels=[submit_event],  # cancel the in-flight submit event
+        queue=False # Must be False for cancel to work instantly
+    )
+    chatbot.retry(  # retry truncates the history and then re-runs submit
+        fn=retry,
+        inputs=[chatbot],
+        outputs=[sender, chatbot, clear_btn]
+    )
+    chatbot.welcome_prompt_select(  # puts the selected prompt's description into the sender
+        fn=prompt_select,
+        outputs=[sender]
+    )
 if __name__ == "__main__":
+    demo.queue().launch(debug=True)  # when run as a script: enable the queue, then launch the app with debug=True