	fix
app.py CHANGED

@@ -4,13 +4,14 @@ subprocess.run(
     "pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True
 )
 
-from typing import Any, List
+from typing import Any, List, Literal
 
 import gradio as gr
 import requests
 import spaces
 import torch
 from PIL import Image, ImageDraw
+from pydantic import BaseModel, Field
 from transformers import AutoModelForImageTextToText, AutoProcessor
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 
@@ -42,7 +43,7 @@ except Exception as e:
 # --- Helper functions from the model card (or adapted) ---
 
 
-
+@spaces.GPU(duration=20)
 def run_inference_localization(
     messages_for_template: List[dict[str, Any]], pil_image_for_processing: Image.Image
 ) -> str:
@@ -82,10 +83,6 @@ def run_inference_localization(
     return decoded_output[0] if decoded_output else ""
 
 
-from typing import Literal
-
-from pydantic import BaseModel, Field
-
 SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
 In each iteration, you will receive an Observation that includes the last  screenshots of a web browser and the current memory of the agent.
 You have also information about the step that the agent is trying to achieve to solve the task.
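
In short, the fix moves the stray mid-file imports (Literal, pydantic's BaseModel/Field) to the top of app.py and decorates the inference function with @spaces.GPU, which ZeroGPU Spaces require so that a GPU is attached for the duration of the call. A minimal sketch of that decorator pattern, with placeholder names, assuming the `spaces` package that ZeroGPU hardware provides:

import spaces
import torch

@spaces.GPU(duration=20)  # request a GPU for at most ~20 s per call
def gpu_task(x: torch.Tensor) -> torch.Tensor:
    # On ZeroGPU, CUDA is only guaranteed to be usable inside a function
    # decorated with @spaces.GPU; `gpu_task` is a placeholder name.
    return (x.to("cuda") * 2).cpu()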
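
For illustration only (the diff does not show the app's actual models): Literal combined with pydantic's BaseModel and Field is typically used to give the web-browsing agent a typed action schema, which is presumably why these imports exist. ClickAction and its fields below are hypothetical names, not code from this Space:

from typing import Literal
from pydantic import BaseModel, Field

class ClickAction(BaseModel):
    # Hypothetical action model; field names are illustrative only.
    action: Literal["click"] = "click"
    x: int = Field(description="Pixel x-coordinate of the click target")
    y: int = Field(description="Pixel y-coordinate of the click target")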