Sanket17 committed
Commit d865512 · 1 Parent(s): c13799b

updated file

Files changed (2):
  1. Dockerfile +3 -3
  2. app.py +20 -29
Dockerfile CHANGED
@@ -32,9 +32,9 @@ RUN chown -R user:user /code
 USER user
 
 # Download and cache models during build
-RUN python -c "from transformers import AutoProcessor, AutoModelForCausalLM; \
-processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True); \
-model = AutoModelForCausalLM.from_pretrained('microsoft/OmniParser', torch_dtype='float16', trust_remote_code=True)"
+RUN python -c "from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering; \
+processor = AutoProcessor.from_pretrained('microsoft/OmniParser', trust_remote_code=True); \
+model = AutoModelForVisualQuestionAnswering.from_pretrained('microsoft/OmniParser', trust_remote_code=True)"
 
 # Run the application
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
app.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 from typing import Optional, Dict, Any
 import torch
-from transformers import AutoProcessor, AutoModelForCausalLM
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
 from PIL import Image
 import io
 import base64
@@ -19,44 +19,36 @@ class OmniParser:
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         # Initialize processor and model
         self.processor = AutoProcessor.from_pretrained(
-            "microsoft/Florence-2-base",
-            trust_remote_code=True
+            "microsoft/OmniParser",
+            trust_remote_code=True,
+            cache_dir="/code/.cache"
         )
-        self.model = AutoModelForCausalLM.from_pretrained(
+        self.model = AutoModelForVisualQuestionAnswering.from_pretrained(
             "microsoft/OmniParser",
-            torch_dtype=torch.float16,
-            trust_remote_code=True
+            trust_remote_code=True,
+            cache_dir="/code/.cache"
         ).to(self.device)
 
     @torch.inference_mode()
     def process_image(
         self,
         image: Image.Image,
-        box_threshold: float = 0.05,
-        iou_threshold: float = 0.1
+        question: str = "What elements do you see in this GUI?",
     ) -> Dict[str, Any]:
         # Process image with the model
-        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
-        outputs = self.model.generate(**inputs)
+        inputs = self.processor(images=image, text=question, return_tensors="pt").to(self.device)
+        outputs = self.model(**inputs)
 
         # Decode the outputs
-        parsed_elements = self.processor.batch_decode(
-            outputs,
+        predicted_answer = self.processor.decode(
+            outputs.logits.argmax(-1)[0],
             skip_special_tokens=True
-        )[0]
-
-        # Get bounding boxes and process image
-        boxes = self._get_box_coordinates(outputs)
+        )
 
         return {
-            "parsed_elements": parsed_elements,
-            "box_coordinates": boxes
+            "parsed_elements": predicted_answer,
+            "box_coordinates": {}  # Placeholder for future box detection implementation
         }
-
-    def _get_box_coordinates(self, outputs) -> Dict[str, list]:
-        # Extract bounding box coordinates from model outputs
-        # This is a placeholder - implement actual box extraction logic
-        return {}
 
 # Initialize model
 model = OmniParser()
@@ -64,15 +56,16 @@ model = OmniParser()
 # Request/Response models
 class ParseRequest(BaseModel):
     image_data: str = Field(..., description="Base64 encoded image data")
-    box_threshold: float = Field(0.05, ge=0.01, le=1.0)
-    iou_threshold: float = Field(0.1, ge=0.01, le=1.0)
+    question: Optional[str] = Field(
+        default="What elements do you see in this GUI?",
+        description="Question to ask about the GUI"
+    )
 
 class ParseResponse(BaseModel):
     parsed_elements: str
     box_coordinates: dict
     output_image: Optional[str]
 
-# Utility functions
 def load_and_preprocess_image(image_data: bytes) -> Optional[Image.Image]:
     """Load and preprocess image from bytes data."""
     try:
@@ -87,7 +80,6 @@ def encode_output_image(image: Image.Image) -> str:
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
 
-# API endpoints
 @app.get("/")
 async def root():
     return {
@@ -105,8 +97,7 @@ async def parse_image(request: ParseRequest):
     # Process with model
     result = model.process_image(
         image=image,
-        box_threshold=request.box_threshold,
-        iou_threshold=request.iou_threshold
+        question=request.question
     )
 
     # Prepare response
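
For reference, a minimal client sketch that exercises the new question field. The route path for parse_image is not visible in this diff, so "/parse" and the host/port are assumptions; the field names come from ParseRequest and ParseResponse:

# client_example.py -- illustrative only, not part of this commit.
import base64
import requests

with open("screenshot.png", "rb") as f:  # any local screenshot
    image_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(
    "http://localhost:8000/parse",  # route path assumed; not shown in the diff
    json={
        "image_data": image_b64,
        "question": "What elements do you see in this GUI?",
    },
)
resp.raise_for_status()
print(resp.json()["parsed_elements"])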