merve (HF Staff) committed (verified)
Commit 8f7ca70 · Parent: 1157b18

add new models

Files changed (2):
  1. app.py (+103, -22)
  2. requirements.txt (+3, -1)
app.py CHANGED
@@ -10,10 +10,43 @@ def extract_model_short_name(model_id):
 
 model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
 model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
+model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
+model_owlv2_id = "google/owlv2-large-patch14-ensemble"
 
 model_llmdet_name = extract_model_short_name(model_llmdet_id)
 model_mm_grounding_name = extract_model_short_name(model_mm_grounding_id)
+model_omdet_name = extract_model_short_name(model_omdet_id)
+model_owlv2_name = extract_model_short_name(model_owlv2_id)
 
+@spaces.GPU
+def detect_omdet(image: Image.Image, prompts: list, threshold: float):
+    t0 = time.perf_counter()
+    model_id = model_omdet_id
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    processor = AutoProcessor.from_pretrained(model_id)
+    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
+    texts = [prompts]
+    inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    results = processor.post_process_grounded_object_detection(
+        outputs,
+        threshold=threshold,
+        target_sizes=[image.size[::-1]]
+    )
+    result = results[0]
+    annotations = []
+    raw_results = []
+    for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
+        if score >= threshold:
+            label_name = prompts[label]
+            xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
+            annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
+            raw_results.append(f"Detected {label_name} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]")
+    elapsed_ms = (time.perf_counter() - t0) * 1000
+    time_taken = f"**Inference time ({model_omdet_name}):** {elapsed_ms:.0f} ms"
+    raw_text = "\n".join(raw_results) if raw_results else "No detections"
+    return annotations, raw_text, time_taken
 @spaces.GPU
 def detect_llmdet(image: Image.Image, prompts: list, threshold: float):
     t0 = time.perf_counter()
@@ -42,7 +75,6 @@ def detect_llmdet(image: Image.Image, prompts: list, threshold: float):
     time_taken = f"**Inference time ({model_llmdet_name}):** {elapsed_ms:.0f} ms"
     raw_text = "\n".join(raw_results) if raw_results else "No detections"
     return annotations, raw_text, time_taken
-
 @spaces.GPU
 def detect_mm_grounding(image: Image.Image, prompts: list, threshold: float):
     t0 = time.perf_counter()
@@ -72,14 +104,46 @@ def detect_mm_grounding(image: Image.Image, prompts: list, threshold: float):
     raw_text = "\n".join(raw_results) if raw_results else "No detections"
     return annotations, raw_text, time_taken
 
+@spaces.GPU
+def detect_owlv2(image: Image.Image, prompts: list, threshold: float):
+    t0 = time.perf_counter()
+    model_id = model_owlv2_id
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    processor = AutoProcessor.from_pretrained(model_id)
+    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
+    texts = [prompts]
+    inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    results = processor.post_process_grounded_object_detection(
+        outputs,
+        threshold=threshold,
+        target_sizes=[image.size[::-1]]
+    )
+    result = results[0]
+    annotations = []
+    raw_results = []
+    for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
+        if score >= threshold:
+            label_name = prompts[label]
+            xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
+            annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
+            raw_results.append(f"Detected {label_name} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]")
+    elapsed_ms = (time.perf_counter() - t0) * 1000
+    time_taken = f"**Inference time ({model_owlv2_name}):** {elapsed_ms:.0f} ms"
+    raw_text = "\n".join(raw_results) if raw_results else "No detections"
+    return annotations, raw_text, time_taken
+
 
-def run_detection(image, prompts_str, threshold):
+def run_detection(image, prompts_str, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet):
     if image is None:
         return (None, []), "No detections", "", (None, []), "No detections", ""
     prompts = [p.strip() for p in prompts_str.split(",")]
-    ann_llm, raw_llm, time_llm = detect_llmdet(image, prompts, threshold)
-    ann_mm, raw_mm, time_mm = detect_mm_grounding(image, prompts, threshold)
-    return (image, ann_llm), raw_llm, time_llm, (image, ann_mm), raw_mm, time_mm
+    ann_llm, raw_llm, time_llm = detect_llmdet(image, prompts, threshold_llm)
+    ann_mm, raw_mm, time_mm = detect_mm_grounding(image, prompts, threshold_mm)
+    ann_owlv2, raw_owlv2, time_owlv2 = detect_owlv2(image, prompts, threshold_owlv2)
+    ann_omdet, raw_omdet, time_omdet = detect_omdet(image, prompts, threshold_omdet)
+    return (image, ann_llm), raw_llm, time_llm, (image, ann_mm), raw_mm, time_mm, (image, ann_owlv2), raw_owlv2, time_owlv2, (image, ann_omdet), raw_omdet, time_omdet
 
 with gr.Blocks() as app:
     gr.Markdown("# Zero-Shot Object Detection Arena")
@@ -88,35 +152,52 @@ with gr.Blocks() as app:
     with gr.Column(scale=1):
         image = gr.Image(type="pil", label="Upload an image", height=400)
         prompts = gr.Textbox(label="Prompts (comma-separated)", value="a cat, a remote control")
-        threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, step=0.05, value=0.30)
+        with gr.Accordion("Per-model confidence thresholds", open=True):
+            threshold_llm = gr.Slider(label="Threshold for LLMDet", minimum=0.0, maximum=1.0, value=0.3)
+            threshold_mm = gr.Slider(label="Threshold for MM GroundingDINO Tiny", minimum=0.0, maximum=1.0, value=0.3)
+            threshold_owlv2 = gr.Slider(label="Threshold for OwlV2 Large", minimum=0.0, maximum=1.0, value=0.1)
+            threshold_omdet = gr.Slider(label="Threshold for OMDet Turbo Swin Tiny", minimum=0.0, maximum=1.0, value=0.2)
         generate_btn = gr.Button(value="Detect")
-    with gr.Column(scale=2):
-        output_image_llm = gr.AnnotatedImage(label=f"Annotated image for {model_llmdet_name}", height=400)
-        output_text_llm = gr.Textbox(label=f"Model detections for {model_llmdet_name}", lines=10)
-        output_time_llm = gr.Markdown()
-    with gr.Column(scale=2):
-        output_image_mm = gr.AnnotatedImage(label=f"Annotated image for {model_mm_grounding_name}", height=400)
-        output_text_mm = gr.Textbox(label=f"Model detections for {model_mm_grounding_name}", lines=10)
-        output_time_mm = gr.Markdown()
+    with gr.Row():
+        with gr.Column(scale=2):
+            output_image_llm = gr.AnnotatedImage(label=f"Annotated image for {model_llmdet_name}", height=400)
+            output_text_llm = gr.Textbox(label=f"Model detections for {model_llmdet_name}", lines=5)
+            output_time_llm = gr.Markdown()
+        with gr.Column(scale=2):
+            output_image_mm = gr.AnnotatedImage(label=f"Annotated image for {model_mm_grounding_name}", height=400)
+            output_text_mm = gr.Textbox(label=f"Model detections for {model_mm_grounding_name}", lines=5)
+            output_time_mm = gr.Markdown()
+    with gr.Row():
+        with gr.Column(scale=2):
+            output_image_owlv2 = gr.AnnotatedImage(label=f"Annotated image for {model_owlv2_name}", height=400)
+            output_text_owlv2 = gr.Textbox(label=f"Model detections for {model_owlv2_name}", lines=5)
+            output_time_owlv2 = gr.Markdown()
+        with gr.Column(scale=2):
+            output_image_omdet = gr.AnnotatedImage(label=f"Annotated image for {model_omdet_name}", height=400)
+            output_text_omdet = gr.Textbox(label=f"Model detections for {model_omdet_name}", lines=5)
+            output_time_omdet = gr.Markdown()
     gr.Markdown("### Examples")
     example_data = [
-        ["http://images.cocodataset.org/val2017/000000039769.jpg", "a cat, a remote control", 0.4],
-        ["http://images.cocodataset.org/val2017/000000000139.jpg", "a person, a tv, a remote", 0.3],
+        ["http://images.cocodataset.org/val2017/000000039769.jpg", "a cat, a remote control", 0.30, 0.30, 0.10, 0.30],
+        ["http://images.cocodataset.org/val2017/000000000139.jpg", "a person, a tv, a remote", 0.35, 0.30, 0.12, 0.30],
     ]
+
     gr.Examples(
         examples=example_data,
-        inputs=[image, prompts, threshold],
-        label="Click an example to populate the input",
+        inputs=[image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet],
+        label="Click an example to populate the inputs",
     )
+    inputs = [image, prompts, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet]
+    outputs = [output_image_llm, output_text_llm, output_time_llm, output_image_mm, output_text_mm, output_time_mm, output_image_owlv2, output_text_owlv2, output_time_owlv2, output_image_omdet, output_text_omdet, output_time_omdet]
     generate_btn.click(
         fn=run_detection,
-        inputs=[image, prompts, threshold],
-        outputs=[output_image_llm, output_text_llm, output_time_llm, output_image_mm, output_text_mm, output_time_mm],
+        inputs=inputs,
+        outputs=outputs,
    )
    image.upload(
        fn=run_detection,
-        inputs=[image, prompts, threshold],
-        outputs=[output_image_llm, output_text_llm, output_time_llm, output_image_mm, output_text_mm, output_time_mm],
+        inputs=inputs,
+        outputs=outputs,
    )
 
 app.launch()
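
Review note: detect_omdet and detect_owlv2 are line-for-line copies of each other (and of the existing detect_llmdet/detect_mm_grounding) apart from the model id and display name, and every call re-runs from_pretrained inside the GPU worker. Also note that the early return in run_detection still yields six values while the handlers now expect twelve outputs, so clicking Detect with no image will error. A minimal consolidation sketch, assuming the uniform post_process_grounded_object_detection(outputs, threshold=..., target_sizes=...) call and integer labels that the app already relies on; load_detector, detect, and the lru_cache are hypothetical, not part of this commit:

# Hypothetical refactor (not in this commit): one cached loader plus one
# generic detect() covering all four models.
import time
from functools import lru_cache

import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor


@lru_cache(maxsize=4)
def load_detector(model_id: str):
    # Load each processor/model pair once per worker instead of on every request.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device).eval()
    return processor, model, device


def detect(model_id: str, model_name: str, image: Image.Image, prompts: list, threshold: float):
    # Generic version of the four detect_* functions in the diff above.
    t0 = time.perf_counter()
    processor, model, device = load_detector(model_id)
    inputs = processor(images=image, text=[prompts], return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    result = processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[image.size[::-1]]
    )[0]
    annotations, raw_results = [], []
    for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
        xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
        annotations.append(((xmin, ymin, xmax, ymax), f"{prompts[label]} {score:.2f}"))
        raw_results.append(
            f"Detected {prompts[label]} with confidence {score:.2f} at location [{xmin}, {ymin}, {xmax}, {ymax}]"
        )
    elapsed_ms = (time.perf_counter() - t0) * 1000
    time_taken = f"**Inference time ({model_name}):** {elapsed_ms:.0f} ms"
    return annotations, "\n".join(raw_results) or "No detections", time_taken

Each per-model wrapper then reduces to one line, e.g. detect(model_omdet_id, model_omdet_name, image, prompts, threshold), under its own @spaces.GPU decorator.
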
requirements.txt CHANGED
@@ -4,4 +4,6 @@ pillow
 spaces
 gradio
 transformers
-accelerate
+accelerate
+scipy
+timm
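
The added scipy and timm are presumably import-time dependencies of the two new checkpoints (timm most plausibly for the Swin backbone behind omdet-turbo-swin-tiny-hf); the commit message doesn't say, so treat that as an assumption. A trivial smoke test for the updated environment:

# Hypothetical smoke test (not part of this commit): fail fast if any
# requirement is missing. "pillow" is omitted because it imports as PIL.
import importlib

for pkg in ("spaces", "gradio", "transformers", "accelerate", "scipy", "timm"):
    importlib.import_module(pkg)  # raises ImportError if the package is absent
print("all requirements importable")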