Spaces:

prithivMLmods
/

Agent-Dino

Running on Zero

App Files Files Community

prithivMLmods committed 9 days ago

Commit

d0a3095

verified ·

1 Parent(s): f344c9a

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -135

app.py CHANGED Viewed

@@ -186,7 +186,7 @@ class VisitWebpageTool(Tool):
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
 # rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
@@ -199,12 +199,12 @@ ragent_client = OpenAI(
 SYSTEM_PROMPT = """
-            "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
-            "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
-            "2. **Code**: Write Python code to implement your solution.\n"
-            "3. **Observation**: Analyze the output of the code and summarize the results.\n"
-            "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
-            f"Task: {task}"
 """
@@ -222,18 +222,21 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
     messages.append({"role": "user", "content": prompt})
     response = ""
     stream = ragent_client.chat.completions.create(
-        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-        messages=messages,
     )
     for message in stream:
-        token = message.choices[0].delta.content
-        response += token
-        yield response
 # Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
@@ -250,20 +253,24 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
     _attn_implementation="eager",
 )
 DESCRIPTION = """
 # Agent Dino 🌠"""
 css = '''
 h1 {
-    text-align: center;
-    display: block;
 }
 #duplicate-button {
-    margin: auto;
-    color: #fff;
-    background: #1565c0;
-    border-radius: 100vh;
 }
 '''
@@ -292,7 +299,7 @@ TTS_VOICES = [
 ]
 # Load multimodal processor and model (e.g. for OCR and image processing)
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -321,10 +328,10 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
-# Stable Diffusion XL Pipeline for Image Generation
 # Model In Use : SG161222/RealVisXL_V5.0_Lightning
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
@@ -430,96 +437,17 @@ def detect_objects(image: np.ndarray):
     """Runs object detection on the input image."""
     results = yolo_detector(image, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(results).with_nms()
     box_annotator = sv.BoxAnnotator()
     label_annotator = sv.LabelAnnotator()
     annotated_image = image.copy()
     annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
     annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
     return Image.fromarray(annotated_image)
-# GRPO Model Loading and Functions
-grpo_model_name = "prithivMLmods/SmolLM2-360M-Grpo-r999"
-grpo_tokenizer = AutoTokenizer.from_pretrained(grpo_model_name)
-grpo_model = AutoModelForCausalLM.from_pretrained(grpo_model_name).to(device)
-def get_user_prompt(prompt: str) -> str:
-    match = re.search(r"<\|im_start\|>user\s*(.*?)\s*<\|im_end\|>", prompt, re.DOTALL)
-    return match.group(1).strip() if match else "\n".join(
-        line.strip()[4:].strip() if line.strip().lower().startswith("user") else line
-        for line in prompt.splitlines() if not line.strip().lower().startswith("system")
-    ).strip()
-def get_assistant_response(text: str) -> str:
-    match = re.search(r"<\|im_start\|>assistant\s*(.*?)\s*<\|im_end\|>", text, re.DOTALL)
-    return match.group(1).strip() if match else "\n".join(
-        line for line in text.splitlines() if not line.strip().lower().startswith("assistant")
-    ).strip()
-def generate_grpo_fn(prompt: str):
-    messages = [
-        {"role": "system", "content": "Please respond in this specific format ONLY:\n<thinking>\n input your reasoning behind your answer in between these reasoning tags.\n</thinking>\n<answer>\nyour answer in between these answer tags.\n</answer>\n"},
-        {"role": "user", "content": prompt}
-    ]
-    input_text = grpo_tokenizer.apply_chat_template(messages, tokenize=False)
-    inputs = grpo_tokenizer.encode(input_text, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(grpo_tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        "input_ids": inputs,
-        "streamer": streamer,
-        "max_new_tokens": 200,
-        "temperature": 0.2,
-        "top_p": 0.9,
-        "do_sample": True,
-        "use_cache": False
-    }
-    thread = Thread(target=grpo_model.generate, kwargs=generation_kwargs)
-    thread.start()
-    outputs = []
-    thinking_started = False
-    answer_started = False
-    collected_thinking = ""
-    collected_answer = ""
-    for new_text in streamer:
-        outputs.append(new_text)
-        full_output = "".join(outputs)
-        if "<thinking>" in full_output and not thinking_started:
-            thinking_started = True
-            thinking_start_index = full_output.find("<thinking>") + len("<thinking>")
-            collected_thinking = full_output[thinking_start_index:]
-        elif thinking_started and "</thinking>" not in full_output:
-            collected_thinking += new_text
-        elif thinking_started and "</thinking>" in full_output and not answer_started:
-            thinking_ended_index = full_output.find("</thinking>")
-            collected_thinking = full_output[thinking_start_index:thinking_ended_index]
-            answer_started = True
-            answer_start_index = full_output.find("<answer>") + len("<answer>")
-            collected_answer = full_output[answer_start_index:]
-        elif answer_started and "</answer>" not in full_output:
-            collected_answer += new_text
-        elif answer_started and "</answer>" in full_output:
-            answer_ended_index = full_output.find("</answer>")
-            collected_answer = full_output[answer_start_index:answer_ended_index]
-        if answer_started:
-            # Yield only the answer part once answer section started
-            yield collected_answer.strip()
-        else:
-            # While in thinking phase or before, yield full output for streaming effect
-            yield "".join(outputs).replace("<thinking>", "").replace("</thinking>", "").replace("<answer>", "").replace("</answer>", "").strip()
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, @phi4 and @grpo commands
 @spaces.GPU
 def generate(
@@ -533,27 +461,17 @@ def generate(
 ):
     """
     Generates chatbot responses with support for multimodal input and special commands:
-         - "@tts1" or "@tts2": triggers text-to-speech.
-         - "@image": triggers image generation using the SDXL pipeline.
-         - "@3d": triggers 3D model generation using the ShapE pipeline.
-         - "@web": triggers a web search or webpage visit.
-         - "@rAgent": initiates a reasoning chain using Llama mode.
-         - "@yolo": triggers object detection using YOLO.
-         - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
-         - "@grpo": triggers text generation using GRPO model with structured output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # --- GRPO Generation branch ---
-    if text.strip().lower().startswith("@grpo"):
-        prompt = text[len("@grpo"):].strip()
-        yield "💡 Thinking using GRPO model..."
-        for partial_response in generate_grpo_fn(prompt):
-            yield partial_response
-        return
     # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
@@ -572,7 +490,7 @@ def generate(
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
@@ -616,8 +534,8 @@ def generate(
         return
     # --- rAgent Reasoning branch ---
-    if text.strip().lower().startswith("@ragent"):
-        prompt = text[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
         # Pass the current chat history (cleaned) to help inform the chain.
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
@@ -686,7 +604,7 @@ def generate(
         # Initialize the streamer
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
         # Prepare generation kwargs
         generation_kwargs = {
             **inputs,
@@ -712,7 +630,7 @@ def generate(
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -803,10 +721,9 @@ demo = gr.ChatInterface(
         ["@tts2 What causes rainbows to form?"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
-        ["@grpo If there are 12 cookies in a dozen and you have 5 dozen, how many cookies do you have?"],
     ],
     cache_examples=False,
     type="messages",
@@ -814,10 +731,10 @@ demo = gr.ChatInterface(
     css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(
-        label="Query Input",
         file_types=["image", "audio"],
-        file_count="multiple",
-        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, @grpo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,

             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
 # rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
 # System prompt for the rAgent reasoning chain, written as plain instruction text.
 # The previous literal embedded source-code artefacts (per-line double quotes,
 # doubled newlines, and an uninterpolated f"Task: {task}" line) that were sent
 # verbatim; the task text is appended separately as the user message.
 # NOTE(review): confirm no caller formats this string with a `task` field.
 SYSTEM_PROMPT = """
+You are an expert assistant who solves tasks using Python code. Follow these steps:
+1. **Thought**: Explain your reasoning and plan for solving the task.
+2. **Code**: Write Python code to implement your solution.
+3. **Observation**: Analyze the output of the code and summarize the results.
+4. **Final Answer**: Provide a concise conclusion or final result.
 """
     messages.append({"role": "user", "content": prompt})
     response = ""
     stream = ragent_client.chat.completions.create(
+         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+         max_tokens=max_tokens,
+         stream=True,
+         temperature=temperature,
+         top_p=top_p,
+         messages=messages,
     )
     for message in stream:
+         # Streamed chunks may carry delta.content=None (e.g. the closing chunk);
+         # coerce to "" so the accumulation below cannot raise TypeError.
+         token = message.choices[0].delta.content or ""
+         response += token
+         # Yield the cumulative text so the UI re-renders the growing answer.
+         yield response
+# ------------------------------------------------------------------------------
+# New Phi-4 Multimodal Feature (Image & Audio)
+# ------------------------------------------------------------------------------
 # Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 phi4_assistant_prompt = '<|assistant|>'
     _attn_implementation="eager",
 )
+# ------------------------------------------------------------------------------
+# Gradio UI configuration
+# ------------------------------------------------------------------------------
 DESCRIPTION = """
 # Agent Dino 🌠"""
 css = '''
 h1 {
+  text-align: center;
+  display: block;
 }
 #duplicate-button {
+  margin: auto;
+  color: #fff;
+  background: #1565c0;
+  border-radius: 100vh;
 }
 '''
 ]
 # Load multimodal processor and model (e.g. for OCR and image processing)
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
             cleaned.append(msg)
     return cleaned
+# Stable Diffusion XL Pipeline for Image Generation
 # Model In Use : SG161222/RealVisXL_V5.0_Lightning
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
     """Runs object detection on the input image."""
     results = yolo_detector(image, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(results).with_nms()
     box_annotator = sv.BoxAnnotator()
     label_annotator = sv.LabelAnnotator()
     annotated_image = image.copy()
     annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
     annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
     return Image.fromarray(annotated_image)
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
 ):
     """
     Generates chatbot responses with support for multimodal input and special commands:
+      - "@tts1" or "@tts2": triggers text-to-speech.
+      - "@image": triggers image generation using the SDXL pipeline.
+      - "@3d": triggers 3D model generation using the ShapE pipeline.
+      - "@web": triggers a web search or webpage visit.
+      - "@rAgent": initiates a reasoning chain using Llama mode.
+      - "@yolo": triggers object detection using YOLO.
+      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
     # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
         return
     # --- rAgent Reasoning branch ---
+    # The input is lowercased before matching, so the prefix literal must be
+    # lowercase as well; "@rAgent" could never match and made this branch dead.
+    if text.strip().lower().startswith("@ragent"):
+        # Slice from the stripped text so leading whitespace does not shift the cut.
+        prompt = text.strip()[len("@ragent"):].strip()
         yield "📝 Initiating reasoning chain using Llama mode..."
         # Pass the current chat history (cleaned) to help inform the chain.
         for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
         # Initialize the streamer
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
         # Prepare generation kwargs
         generation_kwargs = {
             **inputs,
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         ["@tts2 What causes rainbows to form?"],
         [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
         [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
+        ["@ragent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
     cache_examples=False,
     type="messages",
     css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(
+        label="Query Input",
         file_types=["image", "audio"],
+        file_count="multiple",
+        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,