Spaces:

prithivMLmods
/

Agent-Dino

Runtime error

App Files Community

prithivMLmods committed 11 days ago

Commit

1be5917

verified ·

1 Parent(s): 6565507

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -48

app.py CHANGED Viewed

@@ -234,9 +234,6 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
          response += token
          yield response
-# ------------------------------------------------------------------------------
-# New Phi-4 Multimodal Feature (Image & Audio)
-# ------------------------------------------------------------------------------
 # Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
@@ -253,15 +250,8 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
     _attn_implementation="eager",
 )
-grpo_model_name = "prithivMLmods/SmolLM2-360M-Grpo-r999"
-grpo_device = "cuda" if torch.cuda.is_available() else "cpu"
-grpo_tokenizer = AutoTokenizer.from_pretrained(grpo_model_name)
-grpo_model = AutoModelForCausalLM.from_pretrained(grpo_model_name).to(grpo_device)
 DESCRIPTION = """
-# Agent Dino 🌠
-"""
 css = '''
 h1 {
@@ -450,7 +440,7 @@ def detect_objects(image: np.ndarray):
     return Image.fromarray(annotated_image)
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, @phi4, and now @grpo commands
 @spaces.GPU
 def generate(
@@ -470,8 +460,7 @@ def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
-      - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
-      - **"@grpo": triggers text generation using the GRPO model with a text streamer.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -630,37 +619,6 @@ def generate(
             yield buffer
         return
-    # --- GRPO Text Generation branch ---
-    if text.strip().lower().startswith("@grpo"):
-        prompt = text[len("@grpo"):].strip()
-        yield "📝 Generating text with @grpo..."
-        messages = [
-            {"role": "system", "content": "Please respond in this specific format ONLY:\n<thinking>\n input your reasoning behind your answer in between these reasoning tags.\n</thinking>\n<answer>\nyour answer in between these answer tags.\n</answer>\n"},
-            {"role": "user", "content": prompt}
-        ]
-        # Use the GRPO tokenizer's chat template if available, otherwise simply join the messages.
-        input_text = grpo_tokenizer.apply_chat_template(messages, tokenize=False) if hasattr(grpo_tokenizer, "apply_chat_template") else "\n".join([msg["content"] for msg in messages])
-        inputs = grpo_tokenizer.encode(input_text, return_tensors="pt").to(grpo_model.device)
-        streamer = TextIteratorStreamer(grpo_tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": inputs,
-            "max_new_tokens": 100,
-            "temperature": 0.2,
-            "top_p": 0.9,
-            "do_sample": True,
-            "use_cache": False,
-            "streamer": streamer,
-        }
-        thread = Thread(target=grpo_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield "🤔 Thinking..."
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)
-            yield buffer
-        return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -756,10 +714,9 @@ demo = gr.ChatInterface(
         ["@tts2 What causes rainbows to form?"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
-        ["@grpo If there are 12 cookies in a dozen and you have 5 dozen, how many cookies do you have?"],
     ],
     cache_examples=False,
     type="messages",
@@ -770,7 +727,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, @grpo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,

          response += token
          yield response
 # Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
     _attn_implementation="eager",
 )
 DESCRIPTION = """
+# Agent Dino 🌠"""
 css = '''
 h1 {
     return Image.fromarray(annotated_image)
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
+      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
             yield buffer
         return
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
         ["@tts2 What causes rainbows to form?"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
+        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
     cache_examples=False,
     type="messages",
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
+        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,