Spaces:

prithivMLmods
/

Agent-Dino

Running on Zero

App Files Files Community

prithivMLmods committed 9 days ago

Commit

f344c9a

verified ·

1 Parent(s): 59d9c9e

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -42

app.py CHANGED Viewed

@@ -186,7 +186,7 @@ class VisitWebpageTool(Tool):
             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
 # rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
@@ -199,12 +199,12 @@ ragent_client = OpenAI(
 SYSTEM_PROMPT = """
-        "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
-        "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
-        "2. **Code**: Write Python code to implement your solution.\n"
-        "3. **Observation**: Analyze the output of the code and summarize the results.\n"
-        "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
-        f"Task: {task}"
 """
@@ -222,17 +222,17 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
     messages.append({"role": "user", "content": prompt})
     response = ""
     stream = ragent_client.chat.completions.create(
-         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-         messages=messages,
     )
     for message in stream:
-         token = message.choices[0].delta.content
-         response += token
-         yield response
 # Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
@@ -255,15 +255,15 @@ DESCRIPTION = """
 css = '''
 h1 {
-  text-align: center;
-  display: block;
 }
 #duplicate-button {
-  margin: auto;
-  color: #fff;
-  background: #1565c0;
-  border-radius: 100vh;
 }
 '''
@@ -292,7 +292,7 @@ TTS_VOICES = [
 ]
 # Load multimodal processor and model (e.g. for OCR and image processing)
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -321,10 +321,10 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
-# Stable Diffusion XL Pipeline for Image Generation
 # Model In Use : SG161222/RealVisXL_V5.0_Lightning
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
@@ -430,17 +430,96 @@ def detect_objects(image: np.ndarray):
     """Runs object detection on the input image."""
     results = yolo_detector(image, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(results).with_nms()
     box_annotator = sv.BoxAnnotator()
     label_annotator = sv.LabelAnnotator()
     annotated_image = image.copy()
     annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
     annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
     return Image.fromarray(annotated_image)
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
 @spaces.GPU
 def generate(
@@ -454,17 +533,27 @@ def generate(
 ):
     """
     Generates chatbot responses with support for multimodal input and special commands:
-      - "@tts1" or "@tts2": triggers text-to-speech.
-      - "@image": triggers image generation using the SDXL pipeline.
-      - "@3d": triggers 3D model generation using the ShapE pipeline.
-      - "@web": triggers a web search or webpage visit.
-      - "@rAgent": initiates a reasoning chain using Llama mode.
-      - "@yolo": triggers object detection using YOLO.
-      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
     # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
@@ -483,7 +572,7 @@ def generate(
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
@@ -597,7 +686,7 @@ def generate(
         # Initialize the streamer
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
         # Prepare generation kwargs
         generation_kwargs = {
             **inputs,
@@ -623,7 +712,7 @@ def generate(
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
@@ -717,6 +806,7 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
     ],
     cache_examples=False,
     type="messages",
@@ -724,10 +814,10 @@ demo = gr.ChatInterface(
     css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(
-        label="Query Input",
         file_types=["image", "audio"],
-        file_count="multiple",
-        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,

             return f"Error fetching the webpage: {str(e)}"
         except Exception as e:
             return f"An unexpected error occurred: {str(e)}"
 # rAgent Reasoning using Llama mode OpenAI
 from openai import OpenAI
 SYSTEM_PROMPT = """
+            "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
+            "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
+            "2. **Code**: Write Python code to implement your solution.\n"
+            "3. **Observation**: Analyze the output of the code and summarize the results.\n"
+            "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
+            f"Task: {task}"
 """
     messages.append({"role": "user", "content": prompt})
     response = ""
     stream = ragent_client.chat.completions.create(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p,
+        messages=messages,
     )
     for message in stream:
+        token = message.choices[0].delta.content
+        response += token
+        yield response
 # Define prompt structure for Phi-4
 phi4_user_prompt = '<|user|>'
 css = '''
 h1 {
+    text-align: center;
+    display: block;
 }
 #duplicate-button {
+    margin: auto;
+    color: #fff;
+    background: #1565c0;
+    border-radius: 100vh;
 }
 '''
 ]
 # Load multimodal processor and model (e.g. for OCR and image processing)
+MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
             cleaned.append(msg)
     return cleaned
+# Stable Diffusion XL Pipeline for Image Generation
 # Model In Use : SG161222/RealVisXL_V5.0_Lightning
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
     """Runs object detection on the input image."""
     results = yolo_detector(image, verbose=False)[0]
     detections = sv.Detections.from_ultralytics(results).with_nms()
     box_annotator = sv.BoxAnnotator()
     label_annotator = sv.LabelAnnotator()
     annotated_image = image.copy()
     annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
     annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
     return Image.fromarray(annotated_image)
+# GRPO Model Loading and Functions
+grpo_model_name = "prithivMLmods/SmolLM2-360M-Grpo-r999"
+grpo_tokenizer = AutoTokenizer.from_pretrained(grpo_model_name)
+grpo_model = AutoModelForCausalLM.from_pretrained(grpo_model_name).to(device)
def get_user_prompt(prompt: str) -> str:
    """Extract the user's message from a chat-formatted prompt.

    If the prompt contains a ChatML user turn
    (``<|im_start|>user ... <|im_end|>``), the text of the first such turn is
    returned. Otherwise the prompt is treated as plain text with optional
    role labels:

    * lines that are a ``system`` label (``system`` alone or ``system: ...``)
      are dropped;
    * a leading ``user`` label (``user`` alone or ``user:``) is removed and
      the remainder of that line is kept;
    * every other line is kept unchanged.

    A label is only recognised when it stands alone on the line or is
    followed by a colon, so ordinary content such as "Users love cats" or
    "Systems thinking" is no longer truncated or discarded.

    Args:
        prompt: Raw prompt text, with or without ChatML markers.

    Returns:
        The extracted user text with surrounding whitespace stripped.
    """
    tagged = re.search(r"<\|im_start\|>user\s*(.*?)\s*<\|im_end\|>", prompt, re.DOTALL)
    if tagged:
        return tagged.group(1).strip()

    kept = []
    for line in prompt.splitlines():
        if re.match(r"\s*system\s*(?::|$)", line, re.IGNORECASE):
            continue
        label = re.match(r"\s*user\s*(?::|$)", line, re.IGNORECASE)
        if label:
            # Keep whatever follows the label (an empty string for a bare label).
            kept.append(line[label.end():].strip())
        else:
            kept.append(line)
    return "\n".join(kept).strip()
def get_assistant_response(text: str) -> str:
    """Extract the assistant's reply from chat-formatted model output.

    If the text contains a ChatML assistant turn
    (``<|im_start|>assistant ... <|im_end|>``), the text of the first such
    turn is returned. Otherwise the text is treated as plain output with an
    optional ``assistant`` role label: a bare label line is dropped, and for
    a line of the form ``assistant: reply`` only the label is removed so the
    reply itself is preserved.

    A label is only recognised when it stands alone on the line or is
    followed by a colon, so content such as "Assistants are tools" is kept.

    Args:
        text: Raw decoded model output, with or without ChatML markers.

    Returns:
        The extracted reply with surrounding whitespace stripped.
    """
    tagged = re.search(r"<\|im_start\|>assistant\s*(.*?)\s*<\|im_end\|>", text, re.DOTALL)
    if tagged:
        return tagged.group(1).strip()

    kept = []
    for line in text.splitlines():
        label = re.match(r"\s*assistant\s*(?::|$)", line, re.IGNORECASE)
        if label is None:
            kept.append(line)
            continue
        remainder = line[label.end():].strip()
        if remainder:
            # "assistant: reply" -> keep the reply, drop only the label.
            kept.append(remainder)
    return "\n".join(kept).strip()
def _grpo_display_text(full_output: str) -> str:
    """Return the text to display for the GRPO output streamed so far.

    While no answer section is available (the reasoning block is still open,
    or ``<answer>`` has not arrived yet) the whole output is returned with
    the four structural tags removed. Once ``<answer>`` has been seen, only
    the text after it is returned, cut at ``</answer>`` when that tag is
    present. The result is always whitespace-stripped.
    """
    thinking_close_at = full_output.find("</thinking>")
    if "<thinking>" in full_output and thinking_close_at == -1:
        # Reasoning block opened but not closed yet: no answer to show.
        answer_at = -1
    else:
        # Look for the answer only after the reasoning block, if there is one.
        answer_at = full_output.find("<answer>", max(thinking_close_at, 0))

    if answer_at == -1:
        visible = full_output
        for tag in ("<thinking>", "</thinking>", "<answer>", "</answer>"):
            visible = visible.replace(tag, "")
        return visible.strip()

    start = answer_at + len("<answer>")
    end = full_output.find("</answer>", start)
    answer = full_output[start:] if end == -1 else full_output[start:end]
    return answer.strip()


def generate_grpo_fn(prompt: str):
    """Stream a response from the GRPO model for ``prompt``.

    The model is instructed to reply as ``<thinking>...</thinking>`` followed
    by ``<answer>...</answer>``. Generation runs in a background thread and
    is consumed through a ``TextIteratorStreamer``; after every streamed
    chunk this generator yields the text to display so far (see
    ``_grpo_display_text``): the tag-free running output during the
    reasoning phase, then only the answer once ``<answer>`` has appeared.

    The displayed text is recomputed from the full accumulated output on
    every chunk. The previous implementation computed the answer offset at
    the moment ``</thinking>`` arrived, when ``<answer>`` had usually not
    been streamed yet, so ``str.find`` returned -1 and the "answer" slice
    started inside the reasoning text.

    Args:
        prompt: The user's question (without the ``@grpo`` prefix).

    Yields:
        The progressively updated display string.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "Please respond in this specific format ONLY:\n"
                "<thinking>\n"
                " input your reasoning behind your answer in between these reasoning tags.\n"
                "</thinking>\n"
                "<answer>\n"
                "your answer in between these answer tags.\n"
                "</answer>\n"
            ),
        },
        {"role": "user", "content": prompt},
    ]
    # NOTE(review): no generation prompt is appended to the template here;
    # confirm this is what the model expects.
    input_text = grpo_tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = grpo_tokenizer.encode(input_text, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        grpo_tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        "input_ids": inputs,
        "streamer": streamer,
        "max_new_tokens": 200,
        "temperature": 0.2,
        "top_p": 0.9,
        "do_sample": True,
        "use_cache": False,
    }
    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    thread = Thread(target=grpo_model.generate, kwargs=generation_kwargs)
    thread.start()

    full_output = ""
    for new_text in streamer:
        full_output += new_text
        yield _grpo_display_text(full_output)

    # The streamer is exhausted, so generation has finished; reap the worker.
    thread.join()
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, @phi4 and @grpo commands
 @spaces.GPU
 def generate(
 ):
     """
     Generates chatbot responses with support for multimodal input and special commands:
+         - "@tts1" or "@tts2": triggers text-to-speech.
+         - "@image": triggers image generation using the SDXL pipeline.
+         - "@3d": triggers 3D model generation using the ShapE pipeline.
+         - "@web": triggers a web search or webpage visit.
+         - "@rAgent": initiates a reasoning chain using Llama mode.
+         - "@yolo": triggers object detection using YOLO.
+         - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
+         - "@grpo": triggers text generation using GRPO model with structured output.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    # --- GRPO Generation branch ---
+    if text.strip().lower().startswith("@grpo"):
+        prompt = text[len("@grpo"):].strip()
+        yield "💡 Thinking using GRPO model..."
+        for partial_response in generate_grpo_fn(prompt):
+            yield partial_response
+        return
     # --- 3D Generation branch ---
     if text.strip().lower().startswith("@3d"):
         prompt = text[len("@3d"):].strip()
         new_filename = f"mesh_{uuid.uuid4()}.glb"
         new_filepath = os.path.join(static_folder, new_filename)
         shutil.copy(glb_path, new_filepath)
         yield gr.File(new_filepath)
         return
         # Initialize the streamer
         streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
         # Prepare generation kwargs
         generation_kwargs = {
             **inputs,
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@grpo If there are 12 cookies in a dozen and you have 5 dozen, how many cookies do you have?"],
     ],
     cache_examples=False,
     type="messages",
     css=css,
     fill_height=True,
     textbox=gr.MultimodalTextbox(
+        label="Query Input",
         file_types=["image", "audio"],
+        file_count="multiple",
+        placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, @grpo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,