Sanket17 committed
Commit d865512 · 1 Parent(s): c13799b

updated file

Files changed (2):
  1. Dockerfile +3 -3
  2. app.py +20 -29
Dockerfile CHANGED
@@ -32,9 +32,9 @@ RUN chown -R user:user /code
 USER user
 
 # Download and cache models during build
-RUN python -c "from transformers import AutoProcessor, AutoModelForCausalLM; \
-processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True); \
-model = AutoModelForCausalLM.from_pretrained('microsoft/OmniParser', torch_dtype='float16', trust_remote_code=True)"
+RUN python -c "from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering; \
+processor = AutoProcessor.from_pretrained('microsoft/OmniParser', trust_remote_code=True); \
+model = AutoModelForVisualQuestionAnswering.from_pretrained('microsoft/OmniParser', trust_remote_code=True)"
 
 # Run the application
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
app.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 from typing import Optional, Dict, Any
 import torch
-from transformers import AutoProcessor, AutoModelForCausalLM
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
 from PIL import Image
 import io
 import base64
@@ -19,44 +19,36 @@ class OmniParser:
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         # Initialize processor and model
         self.processor = AutoProcessor.from_pretrained(
-            "microsoft/Florence-2-base",
-            trust_remote_code=True
+            "microsoft/OmniParser",
+            trust_remote_code=True,
+            cache_dir="/code/.cache"
         )
-        self.model = AutoModelForCausalLM.from_pretrained(
+        self.model = AutoModelForVisualQuestionAnswering.from_pretrained(
             "microsoft/OmniParser",
-            torch_dtype=torch.float16,
-            trust_remote_code=True
+            trust_remote_code=True,
+            cache_dir="/code/.cache"
         ).to(self.device)
 
     @torch.inference_mode()
     def process_image(
         self,
         image: Image.Image,
-        box_threshold: float = 0.05,
-        iou_threshold: float = 0.1
+        question: str = "What elements do you see in this GUI?",
     ) -> Dict[str, Any]:
         # Process image with the model
-        inputs = self.processor(images=image, return_tensors="pt").to(self.device)
-        outputs = self.model.generate(**inputs)
+        inputs = self.processor(images=image, text=question, return_tensors="pt").to(self.device)
+        outputs = self.model(**inputs)
 
         # Decode the outputs
-        parsed_elements = self.processor.batch_decode(
-            outputs,
+        predicted_answer = self.processor.decode(
+            outputs.logits.argmax(-1)[0],
             skip_special_tokens=True
-        )[0]
-
-        # Get bounding boxes and process image
-        boxes = self._get_box_coordinates(outputs)
+        )
 
         return {
-            "parsed_elements": parsed_elements,
-            "box_coordinates": boxes
+            "parsed_elements": predicted_answer,
+            "box_coordinates": {}  # Placeholder for future box detection implementation
         }
-
-    def _get_box_coordinates(self, outputs) -> Dict[str, list]:
-        # Extract bounding box coordinates from model outputs
-        # This is a placeholder - implement actual box extraction logic
-        return {}
 
 # Initialize model
 model = OmniParser()
@@ -64,15 +56,16 @@ model = OmniParser()
 # Request/Response models
 class ParseRequest(BaseModel):
     image_data: str = Field(..., description="Base64 encoded image data")
-    box_threshold: float = Field(0.05, ge=0.01, le=1.0)
-    iou_threshold: float = Field(0.1, ge=0.01, le=1.0)
+    question: Optional[str] = Field(
+        default="What elements do you see in this GUI?",
+        description="Question to ask about the GUI"
+    )
 
 class ParseResponse(BaseModel):
     parsed_elements: str
     box_coordinates: dict
     output_image: Optional[str]
 
-# Utility functions
 def load_and_preprocess_image(image_data: bytes) -> Optional[Image.Image]:
     """Load and preprocess image from bytes data."""
     try:
@@ -87,7 +80,6 @@ def encode_output_image(image: Image.Image) -> str:
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
 
-# API endpoints
 @app.get("/")
 async def root():
     return {
@@ -105,8 +97,7 @@ async def parse_image(request: ParseRequest):
     # Process with model
     result = model.process_image(
         image=image,
-        box_threshold=request.box_threshold,
-        iou_threshold=request.iou_threshold
+        question=request.question
     )
 
     # Prepare response
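
For reference, a minimal client sketch that exercises the new question field. The route path for parse_image is not visible in this diff, so "/parse" and the host/port are assumptions; the field names come from ParseRequest and ParseResponse:

# client_example.py -- illustrative only, not part of this commit.
import base64
import requests

with open("screenshot.png", "rb") as f:  # any local screenshot
    image_b64 = base64.b64encode(f.read()).decode()

resp = requests.post(
    "http://localhost:8000/parse",  # route path assumed; not shown in the diff
    json={
        "image_data": image_b64,
        "question": "What elements do you see in this GUI?",
    },
)
resp.raise_for_status()
print(resp.json()["parsed_elements"])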