Spaces:

prithivMLmods
/

Agent-Dino

Running on Zero

App Files Files Community

prithivMLmods committed on 6 days ago

Commit

f2d8164

verified ·

1 Parent(s): e6f15d9

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -124

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ import numpy as np
 from PIL import Image
 import edge_tts
 import trimesh
-import soundfile as sf  # For audio file reading
 import supervision as sv
 from ultralytics import YOLO as YOLODetector
@@ -36,13 +36,7 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
-# Additional imports for the new DeepseekR1 feature and FastAPI endpoints
-import openai
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
 os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
@@ -62,72 +56,14 @@ def glb_to_data_url(glb_path: str) -> str:
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
-# ---------------------------
-# Sambanova DeepseekR1 Clients and Chat Function
-# ---------------------------
-sambanova_client = openai.OpenAI(
-    api_key=os.environ.get("SAMBANOVA_API_KEY"),
-    base_url="https://api.sambanova.ai/v1",
-)
-sambanova_client2 = openai.OpenAI(
-    api_key=os.environ.get("SAMBANOVA_API_KEY_2"),
-    base_url="https://api.sambanova.ai/v1",
-)
-sambanova_client3 = openai.OpenAI(
-    api_key=os.environ.get("SAMBANOVA_API_KEY_3"),
-    base_url="https://api.sambanova.ai/v1",
-)
-def chat_response(prompt: str) -> str:
-    """
-    Generate a chat response using the primary Sambanova API.
-    If it fails, fallback to the second, and then the third API.
-    """
-    messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": prompt},
-    ]
-    errors = {}
-    try:
-        response = sambanova_client.chat.completions.create(
-            model="DeepSeek-R1-Distill-Llama-70B",
-            messages=messages,
-            temperature=0.1,
-            top_p=0.1
-        )
-        return response.choices[0].message.content
-    except Exception as e:
-        errors['client1'] = str(e)
-        try:
-            response2 = sambanova_client2.chat.completions.create(
-                model="DeepSeek-R1-Distill-Llama-70B",
-                messages=messages,
-                temperature=0.1,
-                top_p=0.1
-            )
-            return response2.choices[0].message.content
-        except Exception as e2:
-            errors['client2'] = str(e2)
-            try:
-                response3 = sambanova_client3.chat.completions.create(
-                    model="DeepSeek-R1-Distill-Llama-70B",
-                    messages=messages,
-                    temperature=0.1,
-                    top_p=0.1
-                )
-                return response3.choices[0].message.content
-            except Exception as e3:
-                errors['client3'] = str(e3)
-                return f"Primary error: {errors['client1']}; Second error: {errors['client2']}; Third error: {errors['client3']}"
-# ---------------------------
 # Model class for Text-to-3D Generation (ShapE)
-# ---------------------------
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
         if torch.cuda.is_available():
             try:
                 self.pipe.text_encoder = self.pipe.text_encoder.half()
@@ -136,6 +72,7 @@ class Model:
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
         if torch.cuda.is_available():
             text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
             if text_encoder_img is not None:
@@ -143,6 +80,7 @@ class Model:
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
         mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
@@ -177,9 +115,8 @@ class Model:
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
-# ---------------------------
 # New Tools for Web Functionality using DuckDuckGo and smolagents
-# ---------------------------
 from typing import Any, Optional
 from smolagents.tools import Tool
 import duckduckgo_search
@@ -231,21 +168,27 @@ class VisitWebpageTool(Tool):
                 "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
             ) from e
         try:
             response = requests.get(url, timeout=20)
-            response.raise_for_status()
             markdown_content = markdownify(response.text).strip()
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
             return "The request timed out. Please try again later or check the URL."
         except RequestException as e:
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
-# ---------------------------
 # rAgent Reasoning using Llama mode OpenAI
-# ---------------------------
 from openai import OpenAI
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
@@ -270,6 +213,7 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
     Uses the Llama mode OpenAI model to perform a structured reasoning chain.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
@@ -293,10 +237,12 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
 # ------------------------------------------------------------------------------
 # New Phi-4 Multimodal Feature (Image & Audio)
 # ------------------------------------------------------------------------------
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
 phi4_prompt_suffix = '<|end|>'
 phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
 phi4_model = AutoModelForCausalLM.from_pretrained(
@@ -330,9 +276,9 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# ---------------------------
 # Load Models and Pipelines for Chat, Image, and Multimodal Processing
-# ---------------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -342,11 +288,13 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 TTS_VOICES = [
-    "en-US-JennyNeural",
-    "en-US-GuyNeural",
 ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -355,15 +303,20 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Convert text to speech using Edge TTS and save the audio to a file.

     Args:
         text: The text to synthesize.
         voice: Voice name handed to ``edge_tts.Communicate``.
         output_file: Path the audio is saved to; defaults to "output.mp3".

     Returns:
         The same ``output_file`` path that was passed in.
     """
     # NOTE(review): callers that rely on the default all write to the same
     # "output.mp3" path, so a later call overwrites an earlier result.
     communicate = edge_tts.Communicate(text, voice)
     # Wait for the save to finish before handing the path back.
     await communicate.save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
     """
     cleaned = []
     for msg in chat_history:
@@ -371,14 +324,14 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
-# ---------------------------
 # Stable Diffusion XL Pipeline for Image Generation
-# ---------------------------
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
@@ -436,6 +389,7 @@ def generate_image_fn(
         options["use_resolution_binning"] = True
     images = []
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
@@ -450,9 +404,8 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
-# ---------------------------
 # Text-to-3D Generation using the ShapE Pipeline
-# ---------------------------
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
@@ -470,9 +423,7 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
-# ---------------------------
 # YOLO Object Detection Setup
-# ---------------------------
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
 yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
@@ -492,9 +443,8 @@ def detect_objects(image: np.ndarray):
     return Image.fromarray(annotated_image)
-# ---------------------------
-# Chat Generation Function with Special Commands
-# ---------------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -513,8 +463,7 @@ def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
-      - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
-      - **"@deepseekr1": queries the Sambanova DeepSeek-R1 model with fallback APIs.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -530,6 +479,7 @@ def generate(
             num_steps=64,
             randomize_seed=True,
         )
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
@@ -563,6 +513,7 @@ def generate(
     # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
@@ -570,6 +521,7 @@ def generate(
             content = visitor.forward(url)
             yield content
         else:
             query = web_command
             yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
@@ -581,24 +533,18 @@ def generate(
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
-    # --- DeepSeek-R1 branch ---
-    if text.strip().lower().startswith("@deepseekr1"):
-        prompt = text[len("@deepseekr1"):].strip()
-        # Directly return the response from the API
-        response = chat_response(prompt)
-        yield response
-        return
     # --- YOLO Object Detection branch ---
     if text.strip().lower().startswith("@yolo"):
         yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
             yield "Error: Please attach an image for YOLO object detection."
             return
         input_file = files[0]
         try:
             if isinstance(input_file, str):
@@ -622,12 +568,15 @@ def generate(
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
         input_file = files[0]
         try:
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
@@ -649,8 +598,10 @@ def generate(
             yield "Invalid file type for @phi4 multimodal processing."
             return
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
@@ -658,14 +609,16 @@ def generate(
             "num_logits_to_keep": 0,
         }
         thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield "🤔 Processing with Phi-4..."
         for new_text in streamer:
             buffer += new_text
-            time.sleep(0.01)
             yield buffer
         return
@@ -745,9 +698,8 @@ def generate(
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
-# ---------------------------
 # Gradio Chat Interface Setup and Launch
-# ---------------------------
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -779,39 +731,18 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, @deepseekr1, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
 if not os.path.exists("static"):
     os.makedirs("static")
 from fastapi.staticfiles import StaticFiles
 demo.app.mount("/static", StaticFiles(directory="static"), name="static")
-# ---------------------------
-# Mount FastAPI Middleware and Endpoint for DeepSeek-R1
-# ---------------------------
-demo.app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@demo.app.post("/chat")
-async def chat_endpoint(prompt: str):
-    """
-    FastAPI endpoint for the Sambanova DeepSeek-R1 chatbot.
-    """
-    result = chat_response(prompt)
-    return {"response": result}
-# ---------------------------
-# Main Execution
-# ---------------------------
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)

 from PIL import Image
 import edge_tts
 import trimesh
+import soundfile as sf  # New import for audio file reading
 import supervision as sv
 from ultralytics import YOLO as YOLODetector
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
 # Model class for Text-to-3D Generation (ShapE)
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
         self.pipe.to(self.device)
+        # Ensure the text encoder is in half precision to avoid dtype mismatches.
         if torch.cuda.is_available():
             try:
                 self.pipe.text_encoder = self.pipe.text_encoder.half()
         self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
         self.pipe_img.to(self.device)
+        # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
         if torch.cuda.is_available():
             text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
             if text_encoder_img is not None:
     def to_glb(self, ply_path: str) -> str:
         mesh = trimesh.load(ply_path)
+        # Rotate the mesh for proper orientation
         rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
         mesh.apply_transform(rot)
         rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
 # New Tools for Web Functionality using DuckDuckGo and smolagents
 from typing import Any, Optional
 from smolagents.tools import Tool
 import duckduckgo_search
                 "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
             ) from e
         try:
+            # Send a GET request to the URL with a 20-second timeout
             response = requests.get(url, timeout=20)
+            response.raise_for_status()  # Raise an exception for bad status codes
+            # Convert the HTML content to Markdown
             markdown_content = markdownify(response.text).strip()
+            # Remove multiple line breaks
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
             return "The request timed out. Please try again later or check the URL."
         except RequestException as e:
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
 # rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
     Uses the Llama mode OpenAI model to perform a structured reasoning chain.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    # Incorporate conversation history (if any)
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
 # ------------------------------------------------------------------------------
 # New Phi-4 Multimodal Feature (Image & Audio)
 # ------------------------------------------------------------------------------
+# Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
 phi4_prompt_suffix = '<|end|>'
+# Load Phi-4 multimodal model and processor using unique variable names
 phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
 phi4_model = AutoModelForCausalLM.from_pretrained(
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Models and Pipelines for Chat, Image, and Multimodal Processing
+# Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
+# Voices for text-to-speech
 TTS_VOICES = [
+    "en-US-JennyNeural",  # @tts1
+    "en-US-GuyNeural",    # @tts2
 ]
+# Load multimodal processor and model (e.g. for OCR and image processing)
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
+# Asynchronous text-to-speech
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Convert text to speech using Edge TTS and save the audio to a file.

     Args:
         text: The text to synthesize.
         voice: Voice name handed to ``edge_tts.Communicate``.
         output_file: Path the audio is saved to; defaults to "output.mp3".

     Returns:
         The same ``output_file`` path that was passed in.
     """
     # NOTE(review): callers that rely on the default all write to the same
     # "output.mp3" path, so a later call overwrites an earlier result.
     communicate = edge_tts.Communicate(text, voice)
     # Wait for the save to finish before handing the path back.
     await communicate.save(output_file)
     return output_file
+# Utility function to clean conversation history
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
+    This helps prevent errors when concatenating previous messages.
     """
     cleaned = []
     for msg in chat_history:
             cleaned.append(msg)
     return cleaned
 # Stable Diffusion XL Pipeline for Image Generation
+# Model In Use : SG161222/RealVisXL_V5.0_Lightning
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
         options["use_resolution_binning"] = True
     images = []
+    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 # Text-to-3D Generation using the ShapE Pipeline
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
 yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
     return Image.fromarray(annotated_image)
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
     input_dict: dict,
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
+      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
             num_steps=64,
             randomize_seed=True,
         )
+        # Copy the GLB file to a static folder.
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
     # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
+        # If the command starts with "visit", then treat the rest as a URL
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
             content = visitor.forward(url)
             yield content
         else:
+            # Otherwise, treat the rest as a search query.
             query = web_command
             yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
+        # Pass the current chat history (cleaned) to help inform the chain.
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
     # --- YOLO Object Detection branch ---
     if text.strip().lower().startswith("@yolo"):
         yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
             yield "Error: Please attach an image for YOLO object detection."
             return
+        # Use the first attached image
         input_file = files[0]
         try:
             if isinstance(input_file, str):
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
+        # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
+            # If file is already a PIL Image, treat as image
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
+                # Try opening as image; if it fails, assume audio
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
             yield "Invalid file type for @phi4 multimodal processing."
             return
+        # Initialize the streamer
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
+        # Prepare generation kwargs
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
             "num_logits_to_keep": 0,
         }
+        # Start generation in a separate thread
         thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
         thread.start()
+        # Stream the response
         buffer = ""
         yield "🤔 Processing with Phi-4..."
         for new_text in streamer:
             buffer += new_text
+            time.sleep(0.01)  # Small delay to simulate real-time streaming
             yield buffer
         return
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 # Gradio Chat Interface Setup and Launch
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
+        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
+# Ensure the static folder exists
 if not os.path.exists("static"):
     os.makedirs("static")
 from fastapi.staticfiles import StaticFiles
 demo.app.mount("/static", StaticFiles(directory="static"), name="static")
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)