Spaces:

prithivMLmods
/

Agent-Dino

Running on Zero

App Files Files Community

prithivMLmods committed 6 days ago

Commit

eaab4db

verified ·

1 Parent(s): 90baac7

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -52

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ import numpy as np
 from PIL import Image
 import edge_tts
 import trimesh
-import soundfile as sf  # New import for audio file reading
 import supervision as sv
 from ultralytics import YOLO as YOLODetector
@@ -36,7 +36,13 @@ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
 os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
@@ -56,8 +62,67 @@ def glb_to_data_url(glb_path: str) -> str:
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
-# Model class for Text-to-3D Generation (ShapE)
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -115,8 +180,9 @@ class Model:
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
 # New Tools for Web Functionality using DuckDuckGo and smolagents
 from typing import Any, Optional
 from smolagents.tools import Tool
 import duckduckgo_search
@@ -168,27 +234,21 @@ class VisitWebpageTool(Tool):
                 "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
             ) from e
         try:
-            # Send a GET request to the URL with a 20-second timeout
             response = requests.get(url, timeout=20)
-            response.raise_for_status()  # Raise an exception for bad status codes
-            # Convert the HTML content to Markdown
             markdown_content = markdownify(response.text).strip()
-            # Remove multiple line breaks
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
             return "The request timed out. Please try again later or check the URL."
         except RequestException as e:
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
-# rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
@@ -213,7 +273,6 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
     Uses the Llama mode OpenAI model to perform a structured reasoning chain.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    # Incorporate conversation history (if any)
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
@@ -237,12 +296,10 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
 # ------------------------------------------------------------------------------
 # New Phi-4 Multimodal Feature (Image & Audio)
 # ------------------------------------------------------------------------------
-# Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
 phi4_prompt_suffix = '<|end|>'
-# Load Phi-4 multimodal model and processor using unique variable names
 phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
 phi4_model = AutoModelForCausalLM.from_pretrained(
@@ -276,9 +333,9 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Models and Pipelines for Chat, Image, and Multimodal Processing
-# Load the text-only model and tokenizer (for pure text chat)
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -288,13 +345,11 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-# Voices for text-to-speech
 TTS_VOICES = [
-    "en-US-JennyNeural",  # @tts1
-    "en-US-GuyNeural",    # @tts2
 ]
-# Load multimodal processor and model (e.g. for OCR and image processing)
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -303,20 +358,15 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-# Asynchronous text-to-speech
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """Convert text to speech using Edge TTS and save as MP3"""
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
     return output_file
-# Utility function to clean conversation history
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
     """
     cleaned = []
     for msg in chat_history:
@@ -324,14 +374,14 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 # Stable Diffusion XL Pipeline for Image Generation
-# Model In Use : SG161222/RealVisXL_V5.0_Lightning
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
@@ -389,7 +439,6 @@ def generate_image_fn(
         options["use_resolution_binning"] = True
     images = []
-    # Process in batches
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
@@ -404,8 +453,9 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 # Text-to-3D Generation using the ShapE Pipeline
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
@@ -423,7 +473,9 @@ def generate_3d_fn(
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
 # YOLO Object Detection Setup
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
 yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
@@ -443,8 +495,9 @@ def detect_objects(image: np.ndarray):
     return Image.fromarray(annotated_image)
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -463,7 +516,8 @@ def generate(
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
-      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
@@ -479,7 +533,6 @@ def generate(
             num_steps=64,
             randomize_seed=True,
         )
-        # Copy the GLB file to a static folder.
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
@@ -513,7 +566,6 @@ def generate(
     # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
-        # If the command starts with "visit", then treat the rest as a URL
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
@@ -521,7 +573,6 @@ def generate(
             content = visitor.forward(url)
             yield content
         else:
-            # Otherwise, treat the rest as a search query.
             query = web_command
             yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
@@ -533,18 +584,24 @@ def generate(
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
-        # Pass the current chat history (cleaned) to help inform the chain.
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
     # --- YOLO Object Detection branch ---
     if text.strip().lower().startswith("@yolo"):
         yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
             yield "Error: Please attach an image for YOLO object detection."
             return
-        # Use the first attached image
         input_file = files[0]
         try:
             if isinstance(input_file, str):
@@ -568,15 +625,12 @@ def generate(
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
-        # Determine input type (Image or Audio) from the first file
         input_file = files[0]
         try:
-            # If file is already a PIL Image, treat as image
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
-                # Try opening as image; if it fails, assume audio
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
@@ -598,10 +652,8 @@ def generate(
             yield "Invalid file type for @phi4 multimodal processing."
             return
-        # Initialize the streamer
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
-        # Prepare generation kwargs
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
@@ -609,16 +661,14 @@ def generate(
             "num_logits_to_keep": 0,
         }
-        # Start generation in a separate thread
         thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
         thread.start()
-        # Stream the response
         buffer = ""
         yield "🤔 Processing with Phi-4..."
         for new_text in streamer:
             buffer += new_text
-            time.sleep(0.01)  # Small delay to simulate real-time streaming
             yield buffer
         return
@@ -698,8 +748,9 @@ def generate(
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 # Gradio Chat Interface Setup and Launch
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -731,18 +782,39 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
-# Ensure the static folder exists
 if not os.path.exists("static"):
     os.makedirs("static")
 from fastapi.staticfiles import StaticFiles
 demo.app.mount("/static", StaticFiles(directory="static"), name="static")
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)

 from PIL import Image
 import edge_tts
 import trimesh
+import soundfile as sf  # For audio file reading
 import supervision as sv
 from ultralytics import YOLO as YOLODetector
 from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply
+# Additional imports for the new DeepseekR1 feature and FastAPI endpoints
+import openai
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
 os.system('pip install backoff')
 # Global constants and helper functions
 MAX_SEED = np.iinfo(np.int32).max
     b64_data = base64.b64encode(data).decode("utf-8")
     return f"data:model/gltf-binary;base64,{b64_data}"
+# ---------------------------
+# Sambanova DeepseekR1 Clients and Chat Function
+# ---------------------------
+# All three clients target the same Sambanova base URL and differ only in
+# the environment variable their API key is read from.
+# NOTE(review): os.environ.get returns None for an unset variable; presumably
+# the OpenAI client then fails at construction (i.e. at import time) —
+# confirm the behavior when fewer than three keys are configured.
+sambanova_client = openai.OpenAI(
+    api_key=os.environ.get("SAMBANOVA_API_KEY"),
+    base_url="https://api.sambanova.ai/v1",
+)
+sambanova_client2 = openai.OpenAI(
+    api_key=os.environ.get("SAMBANOVA_API_KEY_2"),
+    base_url="https://api.sambanova.ai/v1",
+)
+sambanova_client3 = openai.OpenAI(
+    api_key=os.environ.get("SAMBANOVA_API_KEY_3"),
+    base_url="https://api.sambanova.ai/v1",
+)
+def chat_response(prompt: str) -> str:
+    """
+    Generate a chat response using the primary Sambanova API.
+    If it fails, fallback to the second, and then the third API.
+    """
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
+    ]
+    errors = {}
+    try:
+        response = sambanova_client.chat.completions.create(
+            model="DeepSeek-R1-Distill-Llama-70B",
+            messages=messages,
+            temperature=0.1,
+            top_p=0.1
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        errors['client1'] = str(e)
+        try:
+            response2 = sambanova_client2.chat.completions.create(
+                model="DeepSeek-R1-Distill-Llama-70B",
+                messages=messages,
+                temperature=0.1,
+                top_p=0.1
+            )
+            return response2.choices[0].message.content
+        except Exception as e2:
+            errors['client2'] = str(e2)
+            try:
+                response3 = sambanova_client3.chat.completions.create(
+                    model="DeepSeek-R1-Distill-Llama-70B",
+                    messages=messages,
+                    temperature=0.1,
+                    top_p=0.1
+                )
+                return response3.choices[0].message.content
+            except Exception as e3:
+                errors['client3'] = str(e3)
+                return f"Primary error: {errors['client1']}; Second error: {errors['client2']}; Third error: {errors['client3']}"
+# ---------------------------
+# Model class for Text-to-3D Generation (ShapE)
+# ---------------------------
 class Model:
     def __init__(self):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         export_to_ply(images[0], ply_path.name)
         return self.to_glb(ply_path.name)
+# ---------------------------
 # New Tools for Web Functionality using DuckDuckGo and smolagents
+# ---------------------------
 from typing import Any, Optional
 from smolagents.tools import Tool
 import duckduckgo_search
                 "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
             ) from e
         try:
             response = requests.get(url, timeout=20)
+            response.raise_for_status()
             markdown_content = markdownify(response.text).strip()
             markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
             return truncate_content(markdown_content, 10000)
         except requests.exceptions.Timeout:
             return "The request timed out. Please try again later or check the URL."
         except RequestException as e:
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
+# ---------------------------
+# rAgent Reasoning using Llama mode OpenAI
+# ---------------------------
 from openai import OpenAI
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
     Uses the Llama mode OpenAI model to perform a structured reasoning chain.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for msg in history:
         if msg.get("role") == "user":
             messages.append({"role": "user", "content": msg["content"]})
 # ------------------------------------------------------------------------------
 # New Phi-4 Multimodal Feature (Image & Audio)
 # ------------------------------------------------------------------------------
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
 phi4_prompt_suffix = '<|end|>'
 phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
 phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
 phi4_model = AutoModelForCausalLM.from_pretrained(
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# ---------------------------
 # Load Models and Pipelines for Chat, Image, and Multimodal Processing
+# ---------------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 TTS_VOICES = [
+    "en-US-JennyNeural",
+    "en-US-GuyNeural",
 ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     """
     Convert text to speech using Edge TTS and save as MP3.

     Args:
         text: Text to synthesize.
         voice: Voice name handed to edge_tts.Communicate.
         output_file: Path the audio is saved to.

     Returns:
         output_file, after the save has completed.
     """
     # NOTE(review): every call that keeps the default writes to the same
     # "output.mp3" path — confirm concurrent requests cannot overwrite each
     # other's audio.
     communicate = edge_tts.Communicate(text, voice)
     await communicate.save(output_file)
     return output_file
 def clean_chat_history(chat_history):
     """
     Filter out any chat entries whose "content" is not a string.
     """
     cleaned = []
     for msg in chat_history:
             cleaned.append(msg)
     return cleaned
+# ---------------------------
 # Stable Diffusion XL Pipeline for Image Generation
+# ---------------------------
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))
 sd_pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_ID_SD,
         options["use_resolution_binning"] = True
     images = []
     for i in range(0, num_images, BATCH_SIZE):
         batch_options = options.copy()
         batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
+# ---------------------------
 # Text-to-3D Generation using the ShapE Pipeline
+# ---------------------------
 @spaces.GPU(duration=120, enable_queue=True)
 def generate_3d_fn(
     prompt: str,
     glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
     return glb_path, seed
+# ---------------------------
 # YOLO Object Detection Setup
+# ---------------------------
 YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
 YOLO_CHECKPOINT_NAME = "images/demo.pt"
 yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
     return Image.fromarray(annotated_image)
+# ---------------------------
+# Chat Generation Function with Special Commands
+# ---------------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
       - "@web": triggers a web search or webpage visit.
       - "@rAgent": initiates a reasoning chain using Llama mode.
       - "@yolo": triggers object detection using YOLO.
+      - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
+      - **"@deepseekr1": queries the Sambanova DeepSeek-R1 model with fallback APIs.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
             num_steps=64,
             randomize_seed=True,
         )
         static_folder = os.path.join(os.getcwd(), "static")
         if not os.path.exists(static_folder):
             os.makedirs(static_folder)
     # --- Web Search/Visit branch ---
     if text.strip().lower().startswith("@web"):
         web_command = text[len("@web"):].strip()
         if web_command.lower().startswith("visit"):
             url = web_command[len("visit"):].strip()
             yield "🌍 Visiting webpage..."
             content = visitor.forward(url)
             yield content
         else:
             query = web_command
             yield "🧤 Performing a web search ..."
             searcher = DuckDuckGoSearchTool()
     if text.strip().lower().startswith("@ragent"):
         prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
             yield partial
         return
+    # --- DeepSeek-R1 branch ---
+    if text.strip().lower().startswith("@deepseekr1"):
+        prompt = text[len("@deepseekr1"):].strip()
+        yield "🔍 Querying DeepSeek-R1..."
+        response = chat_response(prompt)
+        yield response
+        return
     # --- YOLO Object Detection branch ---
     if text.strip().lower().startswith("@yolo"):
         yield "🔍 Running object detection with YOLO..."
         if not files or len(files) == 0:
             yield "Error: Please attach an image for YOLO object detection."
             return
         input_file = files[0]
         try:
             if isinstance(input_file, str):
         if not question:
             yield "Error: Please provide a question after @phi4."
             return
         input_file = files[0]
         try:
             if isinstance(input_file, Image.Image):
                 input_type = "Image"
                 file_for_phi4 = input_file
             else:
                 try:
                     file_for_phi4 = Image.open(input_file)
                     input_type = "Image"
             yield "Invalid file type for @phi4 multimodal processing."
             return
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = {
             **inputs,
             "streamer": streamer,
             "num_logits_to_keep": 0,
         }
         thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield "🤔 Processing with Phi-4..."
         for new_text in streamer:
             buffer += new_text
+            time.sleep(0.01)
             yield buffer
         return
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
+# ---------------------------
 # Gradio Chat Interface Setup and Launch
+# ---------------------------
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
+        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, @deepseekr1, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,
 )
 if not os.path.exists("static"):
     os.makedirs("static")
 from fastapi.staticfiles import StaticFiles
 demo.app.mount("/static", StaticFiles(directory="static"), name="static")
+# ---------------------------
+# Mount FastAPI Middleware and Endpoint for DeepSeek-R1
+# ---------------------------
+# NOTE(review): allow_credentials=True is combined with wildcard origins,
+# methods and headers. FastAPI's CORS documentation says none of these may
+# be ["*"] when credentials are allowed — confirm whether credentials are
+# actually needed, or list the permitted origins explicitly.
+demo.app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@demo.app.post("/chat")
+async def chat_endpoint(prompt: str):
+    """
+    FastAPI endpoint for the Sambanova DeepSeek-R1 chatbot.
+    """
+    result = chat_response(prompt)
+    return {"response": result}
+# ---------------------------
+# Main Execution
+# ---------------------------
 if __name__ == "__main__":
     demo.queue(max_size=20).launch(share=True)