prithivMLmods committed on
Commit f60d610 · verified · 1 Parent(s): c8e4529

Update app.py

Files changed (1):
  1. app.py +15 -15
app.py CHANGED
@@ -259,7 +259,8 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
 # ------------------------------------------------------------------------------
 
 DESCRIPTION = """
-# Agent Dino 🌠 """
+# Agent Dino 🌠
+"""
 
 css = '''
 h1 {
@@ -468,7 +469,7 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
+    - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model with streaming output.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -564,7 +565,7 @@ def generate(
         yield gr.Image(result_img)
         return
 
-    # --- Phi-4 Multimodal branch (Image/Audio) ---
+    # --- Phi-4 Multimodal branch (Image/Audio) with streaming ---
     if text.strip().lower().startswith("@phi4"):
         question = text[len("@phi4"):].strip()
         if not files:
@@ -603,18 +604,17 @@ def generate(
             yield "Invalid file type for @phi4 multimodal processing."
             return
 
-        with torch.no_grad():
-            generate_ids = phi4_model.generate(
-                **inputs,
-                max_new_tokens=200,
-                num_logits_to_keep=0,
-            )
-        input_length = inputs['input_ids'].shape[1]
-        generate_ids = generate_ids[:, input_length:]
-        response = phi4_processor.batch_decode(
-            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-        yield response
+        # Set up a streamer for the phi4 model
+        streamer_phi4 = TextIteratorStreamer(phi4_processor, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs_phi4 = {**inputs, "streamer": streamer_phi4, "max_new_tokens": 200}
+        thread_phi4 = Thread(target=phi4_model.generate, kwargs=generation_kwargs_phi4)
+        thread_phi4.start()
+
+        outputs_phi4 = []
+        yield "🤔 Thinking..."
+        for new_text in streamer_phi4:
+            outputs_phi4.append(new_text)
+            yield "".join(outputs_phi4)
 
         return
 
     # --- Text and TTS branch ---
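The net effect of this diff is to swap the blocking `generate()` + `batch_decode()` call for incremental streaming: `generate()` runs on a background `Thread` while a `TextIteratorStreamer` yields decoded text chunks that the Gradio generator forwards with `yield`. Below is a minimal, self-contained sketch of that pattern; the "gpt2" checkpoint and tokenizer are illustrative assumptions, not the app's Phi-4 model or processor.

```python
# Minimal sketch of the streaming pattern introduced in this commit,
# shown with a small text-only model so the mechanics are easy to see.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # assumption: any causal LM works
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt")

# The streamer decodes tokens as generate() produces them; skip_prompt drops
# the echoed input, mirroring the skip_prompt=True used in the commit.
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs on a worker thread while the
# main thread consumes partial text from the streamer.
generation_kwargs = {
    **inputs,
    "streamer": streamer,
    "max_new_tokens": 50,
    "pad_token_id": tokenizer.eos_token_id,
}
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

pieces = []
for new_text in streamer:              # yields decoded chunks as they arrive
    pieces.append(new_text)
    print("".join(pieces), end="\r")   # incremental output, like the Gradio yield
thread.join()
print()
```

In the app itself, each `yield "".join(outputs_phi4)` replaces the single `yield response` of the old code, so the chat UI updates progressively instead of waiting for the full Phi-4 answer.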