Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	merge
Browse files- app.py +196 -4
 - navigation.py +0 -192
 
    	
        app.py
    CHANGED
    
    | 
         @@ -14,8 +14,6 @@ from PIL import Image, ImageDraw 
     | 
|
| 14 | 
         
             
            from transformers import AutoModelForImageTextToText, AutoProcessor
         
     | 
| 15 | 
         
             
            from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
         
     | 
| 16 | 
         | 
| 17 | 
         
            -
            from . import navigation
         
     | 
| 18 | 
         
            -
             
     | 
| 19 | 
         
             
            # --- Configuration ---
         
     | 
| 20 | 
         
             
            MODEL_ID = "Hcompany/Holo1-7B"
         
     | 
| 21 | 
         | 
| 
         @@ -84,6 +82,200 @@ def run_inference_localization( 
     | 
|
| 84 | 
         
             
                return decoded_output[0] if decoded_output else ""
         
     | 
| 85 | 
         | 
| 86 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 87 | 
         
             
            # --- Gradio processing function ---
         
     | 
| 88 | 
         
             
            def navigate(input_pil_image: Image.Image, task: str) -> str:
         
     | 
| 89 | 
         
             
                if not model_loaded or not processor or not model:
         
     | 
| 
         @@ -116,7 +308,7 @@ def navigate(input_pil_image: Image.Image, task: str) -> str: 
     | 
|
| 116 | 
         
             
                    return f"Error resizing image: {e}", input_pil_image.copy().convert("RGB")
         
     | 
| 117 | 
         | 
| 118 | 
         
             
                # 2. Create the prompt using the resized image (for correct image tagging context) and task
         
     | 
| 119 | 
         
            -
                prompt =  
     | 
| 120 | 
         | 
| 121 | 
         
             
                # 3. Run inference
         
     | 
| 122 | 
         
             
                #    Pass `messages` (which includes the image object for template processing)
         
     | 
| 
         @@ -128,7 +320,7 @@ def navigate(input_pil_image: Image.Image, task: str) -> str: 
     | 
|
| 128 | 
         
             
                    return f"Error during model inference: {e}", resized_image.copy().convert("RGB")
         
     | 
| 129 | 
         | 
| 130 | 
         
             
                return navigation_str
         
     | 
| 131 | 
         
            -
                # return  
     | 
| 132 | 
         | 
| 133 | 
         | 
| 134 | 
         
             
            # --- Load Example Data ---
         
     | 
| 
         | 
|
| 14 | 
         
             
            from transformers import AutoModelForImageTextToText, AutoProcessor
         
     | 
| 15 | 
         
             
            from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
         
     | 
| 16 | 
         | 
| 
         | 
|
| 
         | 
|
| 17 | 
         
             
            # --- Configuration ---
         
     | 
| 18 | 
         
             
            MODEL_ID = "Hcompany/Holo1-7B"
         
     | 
| 19 | 
         | 
| 
         | 
|
| 82 | 
         
             
                return decoded_output[0] if decoded_output else ""
         
     | 
| 83 | 
         | 
| 84 | 
         | 
| 85 | 
         
            +
            from typing import Literal
         
     | 
| 86 | 
         
            +
             
     | 
| 87 | 
         
            +
            from pydantic import BaseModel, Field
         
     | 
| 88 | 
         
            +
             
     | 
| 89 | 
         
            +
            SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
         
     | 
| 90 | 
         
            +
            In each iteration, you will receive an Observation that includes the last  screenshots of a web browser and the current memory of the agent.
         
     | 
| 91 | 
         
            +
            You have also information about the step that the agent is trying to achieve to solve the task.
         
     | 
| 92 | 
         
            +
            Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the following action.
         
     | 
| 93 | 
         
            +
            You should detail your thought (i.e. reasoning steps) before taking the action.
         
     | 
| 94 | 
         
            +
            Also detail in the notes field of the action the extracted information relevant to solve the task.
         
     | 
| 95 | 
         
            +
            Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
         
     | 
| 96 | 
         
            +
            This will be evaluated by an evaluator and should match all the criteria or requirements of the task.
         
     | 
| 97 | 
         
            +
             
     | 
| 98 | 
         
            +
            Guidelines:
         
     | 
| 99 | 
         
            +
            - store in the notes all the relevant information to solve the task that fulfill the task criteria. Be precise
         
     | 
| 100 | 
         
            +
            - Use both the task and the step information to decide what to do
         
     | 
| 101 | 
         
            +
            - if you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
         
     | 
| 102 | 
         
            +
            - If there is a cookies notice, always accept all the cookies first
         
     | 
| 103 | 
         
            +
            - The observation is the screenshot of the current page and the memory of the agent.
         
     | 
| 104 | 
         
            +
            - If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
         
     | 
| 105 | 
         
            +
            - If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
         
     | 
| 106 | 
         
            +
            - If you see buttons that allow to navigate directly to relevant information, like jump to ... or go to ... , use them to navigate faster.
         
     | 
| 107 | 
         
            +
            - In the answer action, give as many details a possible relevant to answering the task.
         
     | 
| 108 | 
         
            +
            - if you want to write, don't click before. Directly use the write action
         
     | 
| 109 | 
         
            +
            - to write, identify the web element which is type and the text it already contains
         
     | 
| 110 | 
         
            +
            - If you want to use a search bar, directly write text in the search bar
         
     | 
| 111 | 
         
            +
            - Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
         
     | 
| 112 | 
         
            +
            - Don't scroll if you are at the end of the webpage
         
     | 
| 113 | 
         
            +
            - Only refresh if you identify a rate limit problem
         
     | 
| 114 | 
         
            +
            - If you are looking for a single flights, click on round-trip to select 'one way'
         
     | 
| 115 | 
         
            +
            - Never try to login, enter email or password. If there is a need to login, then go back.
         
     | 
| 116 | 
         
            +
            - If you are facing a captcha on a website, try to solve it.
         
     | 
| 117 | 
         
            +
             
     | 
| 118 | 
         
            +
            - if you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
         
     | 
| 119 | 
         
            +
            - The current date is {timestamp}.
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
            # <output_json_format>
         
     | 
| 122 | 
         
            +
            # ```json
         
     | 
| 123 | 
         
            +
            # {output_format}
         
     | 
| 124 | 
         
            +
            # ```
         
     | 
| 125 | 
         
            +
            # </output_json_format>
         
     | 
| 126 | 
         
            +
             
     | 
| 127 | 
         
            +
            """
         
     | 
| 128 | 
         
            +
             
     | 
| 129 | 
         
            +
             
     | 
| 130 | 
         
            +
            class ClickElementAction(BaseModel):
         
     | 
| 131 | 
         
            +
                """Click at absolute coordinates of a web element with its description"""
         
     | 
| 132 | 
         
            +
             
     | 
| 133 | 
         
            +
                action: Literal["click_element"] = Field(description="Click at absolute coordinates of a web element")
         
     | 
| 134 | 
         
            +
                element: str = Field(description="text description of the element")
         
     | 
| 135 | 
         
            +
                x: int = Field(description="The x coordinate, number of pixels from the left edge.")
         
     | 
| 136 | 
         
            +
                y: int = Field(description="The y coordinate, number of pixels from the top edge.")
         
     | 
| 137 | 
         
            +
             
     | 
| 138 | 
         
            +
                def log(self):
         
     | 
| 139 | 
         
            +
                    return f"I have clicked on the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
         
     | 
| 140 | 
         
            +
             
     | 
| 141 | 
         
            +
             
     | 
| 142 | 
         
            +
            class WriteElementAction(BaseModel):
         
     | 
| 143 | 
         
            +
                """Write content at absolute coordinates of a web element identified by its description, then press Enter."""
         
     | 
| 144 | 
         
            +
             
     | 
| 145 | 
         
            +
                action: Literal["write_element_abs"] = Field(description="Write content at absolute coordinates of a web page")
         
     | 
| 146 | 
         
            +
                content: str = Field(description="Content to write")
         
     | 
| 147 | 
         
            +
                element: str = Field(description="Text description of the element")
         
     | 
| 148 | 
         
            +
                x: int = Field(description="The x coordinate, number of pixels from the left edge.")
         
     | 
| 149 | 
         
            +
                y: int = Field(description="The y coordinate, number of pixels from the top edge.")
         
     | 
| 150 | 
         
            +
             
     | 
| 151 | 
         
            +
                def log(self):
         
     | 
| 152 | 
         
            +
                    return f"I have written '{self.content}' in the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
         
     | 
| 153 | 
         
            +
             
     | 
| 154 | 
         
            +
             
     | 
| 155 | 
         
            +
            class ScrollAction(BaseModel):
         
     | 
| 156 | 
         
            +
                """Scroll action with no required element"""
         
     | 
| 157 | 
         
            +
             
     | 
| 158 | 
         
            +
                action: Literal["scroll"] = Field(description="Scroll the page or a specific element")
         
     | 
| 159 | 
         
            +
                direction: Literal["down", "up", "left", "right"] = Field(description="The direction to scroll in")
         
     | 
| 160 | 
         
            +
             
     | 
| 161 | 
         
            +
                def log(self):
         
     | 
| 162 | 
         
            +
                    return f"I have scrolled {self.direction}"
         
     | 
| 163 | 
         
            +
             
     | 
| 164 | 
         
            +
             
     | 
| 165 | 
         
            +
            class GoBackAction(BaseModel):
         
     | 
| 166 | 
         
            +
                """Action to navigate back in browser history"""
         
     | 
| 167 | 
         
            +
             
     | 
| 168 | 
         
            +
                action: Literal["go_back"] = Field(description="Navigate to the previous page")
         
     | 
| 169 | 
         
            +
             
     | 
| 170 | 
         
            +
                def log(self):
         
     | 
| 171 | 
         
            +
                    return "I have gone back to the previous page"
         
     | 
| 172 | 
         
            +
             
     | 
| 173 | 
         
            +
             
     | 
| 174 | 
         
            +
            class RefreshAction(BaseModel):
         
     | 
| 175 | 
         
            +
                """Action to refresh the current page"""
         
     | 
| 176 | 
         
            +
             
     | 
| 177 | 
         
            +
                action: Literal["refresh"] = Field(description="Refresh the current page")
         
     | 
| 178 | 
         
            +
             
     | 
| 179 | 
         
            +
                def log(self):
         
     | 
| 180 | 
         
            +
                    return "I have refreshed the page"
         
     | 
| 181 | 
         
            +
             
     | 
| 182 | 
         
            +
             
     | 
| 183 | 
         
            +
            class GotoAction(BaseModel):
         
     | 
| 184 | 
         
            +
                """Action to go to a particular URL"""
         
     | 
| 185 | 
         
            +
             
     | 
| 186 | 
         
            +
                action: Literal["goto"] = Field(description="Goto a particular URL")
         
     | 
| 187 | 
         
            +
                url: str = Field(description="A url starting with http:// or https://")
         
     | 
| 188 | 
         
            +
             
     | 
| 189 | 
         
            +
                def log(self):
         
     | 
| 190 | 
         
            +
                    return f"I have navigated to the URL {self.url}"
         
     | 
| 191 | 
         
            +
             
     | 
| 192 | 
         
            +
             
     | 
| 193 | 
         
            +
            class WaitAction(BaseModel):
         
     | 
| 194 | 
         
            +
                """Action to wait for a particular amount of time"""
         
     | 
| 195 | 
         
            +
             
     | 
| 196 | 
         
            +
                action: Literal["wait"] = Field(description="Wait for a particular amount of time")
         
     | 
| 197 | 
         
            +
                seconds: int = Field(default=2, ge=0, le=10, description="The number of seconds to wait")
         
     | 
| 198 | 
         
            +
             
     | 
| 199 | 
         
            +
                def log(self):
         
     | 
| 200 | 
         
            +
                    return f"I have waited for {self.seconds} seconds"
         
     | 
| 201 | 
         
            +
             
     | 
| 202 | 
         
            +
             
     | 
| 203 | 
         
            +
            class RestartAction(BaseModel):
         
     | 
| 204 | 
         
            +
                """Restart the task from the beginning."""
         
     | 
| 205 | 
         
            +
             
     | 
| 206 | 
         
            +
                action: Literal["restart"] = "restart"
         
     | 
| 207 | 
         
            +
             
     | 
| 208 | 
         
            +
                def log(self):
         
     | 
| 209 | 
         
            +
                    return "I have restarted the task from the beginning"
         
     | 
| 210 | 
         
            +
             
     | 
| 211 | 
         
            +
             
     | 
| 212 | 
         
            +
            class AnswerAction(BaseModel):
         
     | 
| 213 | 
         
            +
                """Return a final answer to the task. This is the last action to call in an episode."""
         
     | 
| 214 | 
         
            +
             
     | 
| 215 | 
         
            +
                action: Literal["answer"] = "answer"
         
     | 
| 216 | 
         
            +
                content: str = Field(description="The answer content")
         
     | 
| 217 | 
         
            +
             
     | 
| 218 | 
         
            +
                def log(self):
         
     | 
| 219 | 
         
            +
                    return f"I have answered the task with '{self.content}'"
         
     | 
| 220 | 
         
            +
             
     | 
| 221 | 
         
            +
             
     | 
| 222 | 
         
            +
            ActionSpace = (
         
     | 
| 223 | 
         
            +
                ClickElementAction
         
     | 
| 224 | 
         
            +
                | WriteElementAction
         
     | 
| 225 | 
         
            +
                | ScrollAction
         
     | 
| 226 | 
         
            +
                | GoBackAction
         
     | 
| 227 | 
         
            +
                | RefreshAction
         
     | 
| 228 | 
         
            +
                | WaitAction
         
     | 
| 229 | 
         
            +
                | RestartAction
         
     | 
| 230 | 
         
            +
                | AnswerAction
         
     | 
| 231 | 
         
            +
                | GotoAction
         
     | 
| 232 | 
         
            +
            )
         
     | 
| 233 | 
         
            +
             
     | 
| 234 | 
         
            +
             
     | 
| 235 | 
         
            +
            class NavigationStep(BaseModel):
         
     | 
| 236 | 
         
            +
                note: str = Field(
         
     | 
| 237 | 
         
            +
                    default="",
         
     | 
| 238 | 
         
            +
                    description="Task-relevant information extracted from the previous observation. Keep empty if no new info.",
         
     | 
| 239 | 
         
            +
                )
         
     | 
| 240 | 
         
            +
                thought: str = Field(description="Reasoning about next steps (<4 lines)")
         
     | 
| 241 | 
         
            +
                action: ActionSpace = Field(description="Next action to take")
         
     | 
| 242 | 
         
            +
             
     | 
| 243 | 
         
            +
             
     | 
| 244 | 
         
            +
            def get_navigation_prompt(task, image, step=1):
         
     | 
| 245 | 
         
            +
                """
         
     | 
| 246 | 
         
            +
                Get the prompt for the navigation task.
         
     | 
| 247 | 
         
            +
                - task: The task to complete
         
     | 
| 248 | 
         
            +
                - image: The current screenshot of the web page
         
     | 
| 249 | 
         
            +
                - step: The current step of the task
         
     | 
| 250 | 
         
            +
                """
         
     | 
| 251 | 
         
            +
                system_prompt = SYSTEM_PROMPT.format(
         
     | 
| 252 | 
         
            +
                    output_format=NavigationStep.model_json_schema(),
         
     | 
| 253 | 
         
            +
                    timestamp="2025-06-04 14:16:03",
         
     | 
| 254 | 
         
            +
                )
         
     | 
| 255 | 
         
            +
                return [
         
     | 
| 256 | 
         
            +
                    {
         
     | 
| 257 | 
         
            +
                        "role": "system",
         
     | 
| 258 | 
         
            +
                        "content": [
         
     | 
| 259 | 
         
            +
                            {"type": "text", "text": system_prompt},
         
     | 
| 260 | 
         
            +
                        ],
         
     | 
| 261 | 
         
            +
                    },
         
     | 
| 262 | 
         
            +
                    {
         
     | 
| 263 | 
         
            +
                        "role": "user",
         
     | 
| 264 | 
         
            +
                        "content": [
         
     | 
| 265 | 
         
            +
                            {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
         
     | 
| 266 | 
         
            +
                            {"type": "text", "text": f"<observation step={step}>\n"},
         
     | 
| 267 | 
         
            +
                            {"type": "text", "text": "<screenshot>\n"},
         
     | 
| 268 | 
         
            +
                            {
         
     | 
| 269 | 
         
            +
                                "type": "image",
         
     | 
| 270 | 
         
            +
                                "image": image,
         
     | 
| 271 | 
         
            +
                            },
         
     | 
| 272 | 
         
            +
                            {"type": "text", "text": "\n</screenshot>\n"},
         
     | 
| 273 | 
         
            +
                            {"type": "text", "text": "\n</observation>\n"},
         
     | 
| 274 | 
         
            +
                        ],
         
     | 
| 275 | 
         
            +
                    },
         
     | 
| 276 | 
         
            +
                ]
         
     | 
| 277 | 
         
            +
             
     | 
| 278 | 
         
            +
             
     | 
| 279 | 
         
             
            # --- Gradio processing function ---
         
     | 
| 280 | 
         
             
            def navigate(input_pil_image: Image.Image, task: str) -> str:
         
     | 
| 281 | 
         
             
                if not model_loaded or not processor or not model:
         
     | 
| 
         | 
|
| 308 | 
         
             
                    return f"Error resizing image: {e}", input_pil_image.copy().convert("RGB")
         
     | 
| 309 | 
         | 
| 310 | 
         
             
                # 2. Create the prompt using the resized image (for correct image tagging context) and task
         
     | 
| 311 | 
         
            +
                prompt = get_navigation_prompt(task, resized_image, step=1)
         
     | 
| 312 | 
         | 
| 313 | 
         
             
                # 3. Run inference
         
     | 
| 314 | 
         
             
                #    Pass `messages` (which includes the image object for template processing)
         
     | 
| 
         | 
|
| 320 | 
         
             
                    return f"Error during model inference: {e}", resized_image.copy().convert("RGB")
         
     | 
| 321 | 
         | 
| 322 | 
         
             
                return navigation_str
         
     | 
| 323 | 
         
            +
                # return NavigationStep(**json.loads(navigation_str))
         
     | 
| 324 | 
         | 
| 325 | 
         | 
| 326 | 
         
             
            # --- Load Example Data ---
         
     | 
    	
        navigation.py
    DELETED
    
    | 
         @@ -1,192 +0,0 @@ 
     | 
|
| 1 | 
         
            -
            from typing import Literal
         
     | 
| 2 | 
         
            -
             
     | 
| 3 | 
         
            -
            from pydantic import BaseModel, Field
         
     | 
| 4 | 
         
            -
             
     | 
| 5 | 
         
            -
            SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
         
     | 
| 6 | 
         
            -
            In each iteration, you will receive an Observation that includes the last  screenshots of a web browser and the current memory of the agent.
         
     | 
| 7 | 
         
            -
            You have also information about the step that the agent is trying to achieve to solve the task.
         
     | 
| 8 | 
         
            -
            Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the following action.
         
     | 
| 9 | 
         
            -
            You should detail your thought (i.e. reasoning steps) before taking the action.
         
     | 
| 10 | 
         
            -
            Also detail in the notes field of the action the extracted information relevant to solve the task.
         
     | 
| 11 | 
         
            -
            Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
         
     | 
| 12 | 
         
            -
            This will be evaluated by an evaluator and should match all the criteria or requirements of the task.
         
     | 
| 13 | 
         
            -
             
     | 
| 14 | 
         
            -
            Guidelines:
         
     | 
| 15 | 
         
            -
            - store in the notes all the relevant information to solve the task that fulfill the task criteria. Be precise
         
     | 
| 16 | 
         
            -
            - Use both the task and the step information to decide what to do
         
     | 
| 17 | 
         
            -
            - if you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
         
     | 
| 18 | 
         
            -
            - If there is a cookies notice, always accept all the cookies first
         
     | 
| 19 | 
         
            -
            - The observation is the screenshot of the current page and the memory of the agent.
         
     | 
| 20 | 
         
            -
            - If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
         
     | 
| 21 | 
         
            -
            - If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
         
     | 
| 22 | 
         
            -
            - If you see buttons that allow to navigate directly to relevant information, like jump to ... or go to ... , use them to navigate faster.
         
     | 
| 23 | 
         
            -
            - In the answer action, give as many details a possible relevant to answering the task.
         
     | 
| 24 | 
         
            -
            - if you want to write, don't click before. Directly use the write action
         
     | 
| 25 | 
         
            -
            - to write, identify the web element which is type and the text it already contains
         
     | 
| 26 | 
         
            -
            - If you want to use a search bar, directly write text in the search bar
         
     | 
| 27 | 
         
            -
            - Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
         
     | 
| 28 | 
         
            -
            - Don't scroll if you are at the end of the webpage
         
     | 
| 29 | 
         
            -
            - Only refresh if you identify a rate limit problem
         
     | 
| 30 | 
         
            -
            - If you are looking for a single flights, click on round-trip to select 'one way'
         
     | 
| 31 | 
         
            -
            - Never try to login, enter email or password. If there is a need to login, then go back.
         
     | 
| 32 | 
         
            -
            - If you are facing a captcha on a website, try to solve it.
         
     | 
| 33 | 
         
            -
             
     | 
| 34 | 
         
            -
            - if you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
         
     | 
| 35 | 
         
            -
            - The current date is {timestamp}.
         
     | 
| 36 | 
         
            -
             
     | 
| 37 | 
         
            -
            # <output_json_format>
         
     | 
| 38 | 
         
            -
            # ```json
         
     | 
| 39 | 
         
            -
            # {output_format}
         
     | 
| 40 | 
         
            -
            # ```
         
     | 
| 41 | 
         
            -
            # </output_json_format>
         
     | 
| 42 | 
         
            -
             
     | 
| 43 | 
         
            -
            """
         
     | 
| 44 | 
         
            -
             
     | 
| 45 | 
         
            -
             
     | 
| 46 | 
         
            -
            class ClickElementAction(BaseModel):
                """Click at absolute coordinates of a web element with its description"""

                # The docstring above and the Field descriptions below are kept verbatim:
                # pydantic copies them into the JSON schema generated for this model.
                action: Literal["click_element"] = Field(description="Click at absolute coordinates of a web element")
                # Human-readable description of the click target.
                element: str = Field(description="text description of the element")
                # Pixel coordinates measured from the left / top edges.
                x: int = Field(description="The x coordinate, number of pixels from the left edge.")
                y: int = Field(description="The y coordinate, number of pixels from the top edge.")

                def log(self) -> str:
                    """Return a first-person sentence describing the click that was performed."""
                    summary = f"I have clicked on the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
                    return summary
         
     | 
| 56 | 
         
            -
             
     | 
| 57 | 
         
            -
             
     | 
| 58 | 
         
            -
            class WriteElementAction(BaseModel):
                """Write content at absolute coordinates of a web element identified by its description, then press Enter."""

                # Docstring and Field descriptions are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                action: Literal["write_element_abs"] = Field(description="Write content at absolute coordinates of a web page")
                # Text to type into the target element.
                content: str = Field(description="Content to write")
                # Human-readable description of the target element.
                element: str = Field(description="Text description of the element")
                # Pixel coordinates measured from the left / top edges.
                x: int = Field(description="The x coordinate, number of pixels from the left edge.")
                y: int = Field(description="The y coordinate, number of pixels from the top edge.")

                def log(self) -> str:
                    """Return a first-person sentence describing what was written and where."""
                    summary = f"I have written '{self.content}' in the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
                    return summary
         
     | 
| 69 | 
         
            -
             
     | 
| 70 | 
         
            -
             
     | 
| 71 | 
         
            -
            class ScrollAction(BaseModel):
                """Scroll action with no required element"""

                # Docstring and Field descriptions are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                action: Literal["scroll"] = Field(description="Scroll the page or a specific element")
                # Only these four directions validate.
                direction: Literal["down", "up", "left", "right"] = Field(description="The direction to scroll in")

                def log(self) -> str:
                    """Return a first-person sentence naming the scroll direction."""
                    summary = f"I have scrolled {self.direction}"
                    return summary
         
     | 
| 79 | 
         
            -
             
     | 
| 80 | 
         
            -
             
     | 
| 81 | 
         
            -
            class GoBackAction(BaseModel):
                """Action to navigate back in browser history"""

                # Docstring and Field description are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                action: Literal["go_back"] = Field(description="Navigate to the previous page")

                def log(self) -> str:
                    """Return the fixed first-person sentence for a back-navigation."""
                    summary = "I have gone back to the previous page"
                    return summary
         
     | 
| 88 | 
         
            -
             
     | 
| 89 | 
         
            -
             
     | 
| 90 | 
         
            -
            class RefreshAction(BaseModel):
                """Action to refresh the current page"""

                # Docstring and Field description are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                action: Literal["refresh"] = Field(description="Refresh the current page")

                def log(self) -> str:
                    """Return the fixed first-person sentence for a page refresh."""
                    summary = "I have refreshed the page"
                    return summary
         
     | 
| 97 | 
         
            -
             
     | 
| 98 | 
         
            -
             
     | 
| 99 | 
         
            -
            class GotoAction(BaseModel):
                """Action to go to a particular URL"""

                # Docstring and Field descriptions are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                action: Literal["goto"] = Field(description="Goto a particular URL")
                # Declared as a plain str; the scheme requirement lives only in the
                # description text, no validator enforces it here.
                url: str = Field(description="A url starting with http:// or https://")

                def log(self) -> str:
                    """Return a first-person sentence naming the URL that was opened."""
                    summary = f"I have navigated to the URL {self.url}"
                    return summary
         
     | 
| 107 | 
         
            -
             
     | 
| 108 | 
         
            -
             
     | 
| 109 | 
         
            -
            class WaitAction(BaseModel):
                """Action to wait for a particular amount of time"""

                # Docstring and Field descriptions are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                action: Literal["wait"] = Field(description="Wait for a particular amount of time")
                # Defaults to 2 and is constrained to the inclusive range 0..10.
                seconds: int = Field(default=2, ge=0, le=10, description="The number of seconds to wait")

                def log(self) -> str:
                    """Return a first-person sentence stating how long the wait was."""
                    summary = f"I have waited for {self.seconds} seconds"
                    return summary
         
     | 
| 117 | 
         
            -
             
     | 
| 118 | 
         
            -
             
     | 
| 119 | 
         
            -
            class RestartAction(BaseModel):
                """Restart the task from the beginning."""

                # The discriminating literal has a default here, so the field may be
                # omitted when constructing the model. The docstring is kept verbatim
                # because pydantic copies it into the generated JSON schema.
                action: Literal["restart"] = "restart"

                def log(self) -> str:
                    """Return the fixed first-person sentence for a task restart."""
                    summary = "I have restarted the task from the beginning"
                    return summary
         
     | 
| 126 | 
         
            -
             
     | 
| 127 | 
         
            -
             
     | 
| 128 | 
         
            -
            class AnswerAction(BaseModel):
                """Return a final answer to the task. This is the last action to call in an episode."""

                # Docstring and Field description are kept verbatim: pydantic copies
                # them into the JSON schema generated for this model.
                # The discriminating literal has a default, so it may be omitted.
                action: Literal["answer"] = "answer"
                # Free-text answer returned to the user.
                content: str = Field(description="The answer content")

                def log(self) -> str:
                    """Return a first-person sentence quoting the answer that was given."""
                    summary = f"I have answered the task with '{self.content}'"
                    return summary
         
     | 
| 136 | 
         
            -
             
     | 
| 137 | 
         
            -
             
     | 
| 138 | 
         
            -
            # Union of every action model a navigation step may carry. Member order is
            # left exactly as originally declared, since it determines the order of the
            # alternatives in the generated schema.
            ActionSpace = (
                ClickElementAction | WriteElementAction | ScrollAction
                | GoBackAction | RefreshAction | WaitAction
                | RestartAction | AnswerAction | GotoAction
            )
         
     | 
| 149 | 
         
            -
             
     | 
| 150 | 
         
            -
             
     | 
| 151 | 
         
            -
            class NavigationStep(BaseModel):
                # Structured output of one agent turn: an optional note, a short
                # rationale, and the action to execute next.
                # Deliberately no class docstring: pydantic would add it to the generated
                # JSON schema as a description, changing the prompt built from that schema.
                note: str = Field(default="", description="Task-relevant information extracted from the previous observation. Keep empty if no new info.")
                # Required; the length hint lives only in the description text.
                thought: str = Field(description="Reasoning about next steps (<4 lines)")
                # Exactly one member of the ActionSpace union.
                action: ActionSpace = Field(description="Next action to take")
         
     | 
| 158 | 
         
            -
             
     | 
| 159 | 
         
            -
             
     | 
| 160 | 
         
            -
            def get_navigation_prompt(task, image, step=1, *, timestamp="2025-06-04 14:16:03"):
                """
                Get the prompt for the navigation task.
                - task: The task to complete
                - image: The current screenshot of the web page
                - step: The current step of the task
                - timestamp: Keyword-only text substituted into the system prompt's
                  "The current date is ..." line. The default is the value that used to
                  be hard-coded here, so existing callers get an unchanged prompt; pass
                  the real current time to give the model an accurate date.

                Returns a list of two chat messages: a system message carrying the
                formatted SYSTEM_PROMPT, and a user message carrying the task, the step
                number and the screenshot wrapped in <observation>/<screenshot> tags.
                """
                # NOTE(review): model_json_schema() returns a dict and str.format() renders
                # it with str(), so the text inside the prompt's json fence is a Python
                # repr rather than strict JSON. Left unchanged on purpose - confirm what
                # the model expects before switching to json.dumps().
                system_prompt = SYSTEM_PROMPT.format(
                    output_format=NavigationStep.model_json_schema(),
                    timestamp=timestamp,
                )

                system_message = {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": system_prompt},
                    ],
                }

                # The user turn is a sequence of parts; the image part sits between the
                # opening and closing <screenshot> text parts.
                user_message = {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
                        {"type": "text", "text": f"<observation step={step}>\n"},
                        {"type": "text", "text": "<screenshot>\n"},
                        {
                            "type": "image",
                            "image": image,
                        },
                        {"type": "text", "text": "\n</screenshot>\n"},
                        {"type": "text", "text": "\n</observation>\n"},
                    ],
                }

                return [system_message, user_message]
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         |