prithivMLmods committed on
Commit 2bd3ee0 · verified
1 Parent(s): 721c813

Update app.py

Files changed (1)
  1. app.py +81 -4
app.py CHANGED
@@ -35,6 +35,9 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 
+# Additional import for Phi-4 multimodality (audio support)
+import soundfile as sf
+
 # Global constants and helper functions
 
 MAX_SEED = np.iinfo(np.int32).max
@@ -424,7 +427,60 @@ def detect_objects(image: np.ndarray):
 
     return Image.fromarray(annotated_image)
 
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, and @yolo commands
+# ---------------------------
+# Phi-4 Multimodal Model Setup with Text Streaming
+# ---------------------------
+phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
+
+phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
+phi4_model = AutoModelForCausalLM.from_pretrained(
+    phi4_model_path,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+    _attn_implementation="eager",
+)
+
+def process_phi4(input_type: str, file, question: str, max_new_tokens: int = 200):
+    """
+    Process an image or audio input with the Phi-4 multimodal model.
+    Uses a text streamer to yield incremental outputs.
+    Expects input_type to be either 'image' or 'audio'.
+    """
+    user_prompt = '<|user|>'
+    assistant_prompt = '<|assistant|>'
+    prompt_suffix = '<|end|>'
+
+    if not file or not question:
+        yield "Please upload a file and provide a question."
+        return
+
+    if input_type.lower() == "image":
+        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
+        image = Image.open(file)
+        inputs = phi4_processor(text=prompt, images=image, return_tensors='pt').to(phi4_model.device)
+    elif input_type.lower() == "audio":
+        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
+        audio, samplerate = sf.read(file)
+        inputs = phi4_processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
+    else:
+        yield "Invalid input type selected."
+        return
+
+    # Setup text streamer using TextIteratorStreamer for incremental generation
+    streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    yield "🤔 Thinking..."
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer
+
+# Chat Generation Function with support for @tts, @image, @3d, @web, @ragent, @yolo, and now @phi4 commands
 
 @spaces.GPU
 def generate(
@@ -442,8 +498,9 @@ def generate(
     - "@image": triggers image generation using the SDXL pipeline.
     - "@3d": triggers 3D model generation using the ShapE pipeline.
     - "@web": triggers a web search or webpage visit.
-    - "@rAgent": initiates a reasoning chain using Llama mode OpenAI.
+    - "@ragent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
+    - **New:** "@phi4": processes image or audio inputs with the Phi-4 multimodal model and streams text output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -539,6 +596,24 @@ def generate(
         yield gr.Image(result_img)
         return
 
+    # --- Phi-4 Multimodal branch with text streaming ---
+    if text.strip().lower().startswith("@phi4"):
+        # Expected format: "@phi4 [image|audio] <your question>"
+        parts = text.strip().split(maxsplit=2)
+        if len(parts) < 3:
+            yield "Error: Please provide input type and a question. Format: '@phi4 [image|audio] <your question>'"
+            return
+        input_type = parts[1]
+        question = parts[2]
+        if not files or len(files) == 0:
+            yield "Error: Please attach an image or audio file for Phi-4 processing."
+            return
+        file_input = files[0]
+        yield "🔄 Processing multimodal input with Phi-4..."
+        for partial in process_phi4(input_type, file_input, question):
+            yield partial
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
@@ -627,12 +702,14 @@ demo = gr.ChatInterface(
         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
     ],
     examples=[
+        [{"text": "@phi4 Solve the problem", "files": ["examples/math.webp"]}],
+        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
         ["@tts2 What causes rainbows to form?"],
         ["@image Chocolate dripping from a donut"],
         ["@3d A birthday cupcake with cherry"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@rAgent Explain how a binary search algorithm works."],
+        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
@@ -641,7 +718,7 @@ demo = gr.ChatInterface(
     description=DESCRIPTION,
     css=css,
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1-♀, @tts2-♂, @image-image gen, @3d-3d mesh gen, @rAgent-coding, @web-websearch, @yolo-object detection, default-{text gen}{image-text-text}"),
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "audio"], file_count="multiple", placeholder="@tts1, @tts2, @image, @3d, @ragent, @web, @yolo, @phi4, or plain text"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
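
For reference, a minimal sketch of how the new @phi4 command is expected to flow outside the Gradio UI. It assumes app.py can be imported without launching the interface and that an examples/harvard.wav file exists locally; both the import path and the file path are illustrative assumptions, not part of the commit.

# Usage sketch (assumptions: app.py is importable, examples/harvard.wav exists).
from app import process_phi4

def run_phi4_command(text: str, files: list[str]) -> str:
    # Mirror the "@phi4 [image|audio] <question>" parsing done in generate()
    parts = text.strip().split(maxsplit=2)
    if len(parts) < 3 or not files:
        raise ValueError("Expected '@phi4 [image|audio] <question>' plus an attached file")
    input_type, question = parts[1], parts[2]
    last = ""
    # process_phi4 is a generator; each yield is the accumulated output so far
    for partial in process_phi4(input_type, files[0], question):
        last = partial
    return last

if __name__ == "__main__":
    print(run_phi4_command("@phi4 audio Transcribe the audio to text.",
                           ["examples/harvard.wav"]))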