updated file

- Dockerfile +3 -3
- app.py +20 -29
Dockerfile CHANGED
@@ -32,9 +32,9 @@ RUN chown -R user:user /code
 USER user
 
 # Download and cache models during build
-RUN python -c "from transformers import AutoProcessor,
-    processor = AutoProcessor.from_pretrained('microsoft/
-    model =
+RUN python -c "from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering; \
+    processor = AutoProcessor.from_pretrained('microsoft/OmniParser', trust_remote_code=True); \
+    model = AutoModelForVisualQuestionAnswering.from_pretrained('microsoft/OmniParser', trust_remote_code=True)"
 
 # Run the application
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
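Note that app.py (diff below) loads the processor and model with cache_dir="/code/.cache", while the RUN step above uses the default Hugging Face cache location; if those paths differ inside the image, the runtime load may download the weights again rather than reusing the build-time cache. Below is a minimal sketch of a pre-download script that mirrors the runtime cache location; the file name download_models.py is hypothetical, and the AutoModelForVisualQuestionAnswering call is taken from app.py as-is.

# download_models.py -- hypothetical helper mirroring app.py's cache_dir so the
# weights fetched during the Docker build are the ones the app finds at runtime.
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

CACHE_DIR = "/code/.cache"  # assumed to match the cache_dir used in app.py

AutoProcessor.from_pretrained(
    "microsoft/OmniParser", trust_remote_code=True, cache_dir=CACHE_DIR
)
AutoModelForVisualQuestionAnswering.from_pretrained(
    "microsoft/OmniParser", trust_remote_code=True, cache_dir=CACHE_DIR
)

The RUN instruction could then call this script (RUN python download_models.py), or the cache_dir argument could simply be added to the inline python -c command.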
app.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 from typing import Optional, Dict, Any
 import torch
-from transformers import AutoProcessor,
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
 from PIL import Image
 import io
 import base64
@@ -19,44 +19,36 @@ class OmniParser:
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         # Initialize processor and model
         self.processor = AutoProcessor.from_pretrained(
-            "microsoft/
-            trust_remote_code=True
+            "microsoft/OmniParser",
+            trust_remote_code=True,
+            cache_dir="/code/.cache"
         )
-        self.model =
+        self.model = AutoModelForVisualQuestionAnswering.from_pretrained(
             "microsoft/OmniParser",
-
-
+            trust_remote_code=True,
+            cache_dir="/code/.cache"
         ).to(self.device)
 
     @torch.inference_mode()
     def process_image(
         self,
         image: Image.Image,
-
-        iou_threshold: float = 0.1
+        question: str = "What elements do you see in this GUI?",
     ) -> Dict[str, Any]:
         # Process image with the model
-        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
-        outputs = self.model
+        inputs = self.processor(images=image, text=question, return_tensors="pt").to(self.device)
+        outputs = self.model(**inputs)
 
         # Decode the outputs
-
-            outputs,
+        predicted_answer = self.processor.decode(
+            outputs.logits.argmax(-1)[0],
             skip_special_tokens=True
-        )
-
-        # Get bounding boxes and process image
-        boxes = self._get_box_coordinates(outputs)
+        )
 
         return {
-            "parsed_elements":
-            "box_coordinates":
+            "parsed_elements": predicted_answer,
+            "box_coordinates": {}  # Placeholder for future box detection implementation
         }
-
-    def _get_box_coordinates(self, outputs) -> Dict[str, list]:
-        # Extract bounding box coordinates from model outputs
-        # This is a placeholder - implement actual box extraction logic
-        return {}
 
 # Initialize model
 model = OmniParser()
@@ -64,15 +56,16 @@ model = OmniParser()
 # Request/Response models
 class ParseRequest(BaseModel):
     image_data: str = Field(..., description="Base64 encoded image data")
-
-
+    question: Optional[str] = Field(
+        default="What elements do you see in this GUI?",
+        description="Question to ask about the GUI"
+    )
 
 class ParseResponse(BaseModel):
     parsed_elements: str
     box_coordinates: dict
     output_image: Optional[str]
 
-# Utility functions
 def load_and_preprocess_image(image_data: bytes) -> Optional[Image.Image]:
     """Load and preprocess image from bytes data."""
     try:
@@ -87,7 +80,6 @@ def encode_output_image(image: Image.Image) -> str:
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
 
-# API endpoints
 @app.get("/")
 async def root():
     return {
@@ -105,8 +97,7 @@ async def parse_image(request: ParseRequest):
     # Process with model
     result = model.process_image(
         image=image,
-
-        iou_threshold=request.iou_threshold
+        question=request.question
    )
 
     # Prepare response
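For completeness, here is a minimal client sketch that exercises the updated request model: it sends a base64-encoded screenshot plus the new question field and reads the fields defined by ParseResponse. The route path /parse and the POST method are assumptions, since the decorator for parse_image falls outside the hunks shown above, and screenshot.png is a hypothetical input file.

# client_example.py -- hypothetical client for the containerized service (port 8000
# per the Dockerfile CMD); the /parse path and POST method are assumptions.
import base64
import requests

with open("screenshot.png", "rb") as f:  # hypothetical input image
    image_b64 = base64.b64encode(f.read()).decode()

payload = {
    "image_data": image_b64,
    "question": "What elements do you see in this GUI?",  # optional; this is the default
}

resp = requests.post("http://localhost:8000/parse", json=payload, timeout=120)
resp.raise_for_status()
result = resp.json()
print(result["parsed_elements"])   # text answer decoded from the model
print(result["box_coordinates"])   # currently an empty placeholder dict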