Spaces:

ariG23498
/

zero-shot-od

Running on Zero

App Files Files Community

ariG23498 HF Staff committed on Aug 11

Commit

cb90111

1 Parent(s): 9fc3606

label id vs label name

Browse files

Files changed (1) hide show

app.py +46 -19

app.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 from PIL import Image
 import time
@@ -12,27 +17,21 @@ def extract_model_short_name(model_id):
 model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
 processor_llmdet = AutoProcessor.from_pretrained(model_llmdet_id)
-model_llmdet = (
-    AutoModelForZeroShotObjectDetection.from_pretrained(model_llmdet_id)
-)
 model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
 processor_mm_grounding = AutoProcessor.from_pretrained(model_mm_grounding_id)
-model_mm_grounding = (
-    AutoModelForZeroShotObjectDetection.from_pretrained(model_mm_grounding_id)
 )
 model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
 processor_omdet = AutoProcessor.from_pretrained(model_omdet_id)
-model_omdet = (
-    AutoModelForZeroShotObjectDetection.from_pretrained(model_omdet_id)
-)
 model_owlv2_id = "google/owlv2-large-patch14-ensemble"
 processor_owlv2 = AutoProcessor.from_pretrained(model_owlv2_id)
-model_owlv2 = (
-    AutoModelForZeroShotObjectDetection.from_pretrained(model_owlv2_id)
-)
 model_llmdet_name = extract_model_short_name(model_llmdet_id)
 model_mm_grounding_name = extract_model_short_name(model_mm_grounding_id)
@@ -44,7 +43,7 @@ model_owlv2_name = extract_model_short_name(model_owlv2_id)
 def detect(model, processor, image: Image.Image, prompts: list, threshold: float):
     t0 = time.perf_counter()
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device).eval()
     texts = [prompts]
     inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
     with torch.inference_mode():
@@ -54,8 +53,23 @@ def detect(model, processor, image: Image.Image, prompts: list, threshold: float
     )
     result = results[0]
     annotations = []
-    for box, score, label_name in zip(result["boxes"], result["scores"], result["text_labels"]):
         if score >= threshold:
             xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
             annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
     elapsed_ms = (time.perf_counter() - t0) * 1000
@@ -64,13 +78,26 @@ def detect(model, processor, image: Image.Image, prompts: list, threshold: float
 def run_detection(
-    image: Image.Image, prompts_str: str, threshold_llm, threshold_mm, threshold_owlv2, threshold_omdet,
 ):
     prompts = [p.strip() for p in prompts_str.split(",")]
-    ann_llm, time_llm = detect(model_llmdet, processor_llmdet, image, prompts, threshold_llm)
-    ann_mm, time_mm = detect(model_mm_grounding, processor_mm_grounding, image, prompts, threshold_mm)
-    ann_owlv2, time_owlv2 = detect(model_owlv2, processor_owlv2, image, prompts, threshold_owlv2)
-    ann_omdet, time_omdet = detect(model_omdet, processor_omdet, image, prompts, threshold_omdet)
     return (
         (image, ann_llm),
         time_llm,

 import gradio as gr
 import spaces
 import torch
+from transformers import (
+    AutoProcessor,
+    AutoModelForZeroShotObjectDetection,
+    Owlv2ForObjectDetection,
+    OmDetTurboForObjectDetection,
+)
 from PIL import Image
 import time
 model_llmdet_id = "iSEE-Laboratory/llmdet_tiny"
 processor_llmdet = AutoProcessor.from_pretrained(model_llmdet_id)
+model_llmdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_llmdet_id)
 model_mm_grounding_id = "rziga/mm_grounding_dino_tiny_o365v1_goldg"
 processor_mm_grounding = AutoProcessor.from_pretrained(model_mm_grounding_id)
+model_mm_grounding = AutoModelForZeroShotObjectDetection.from_pretrained(
+    model_mm_grounding_id
 )
 model_omdet_id = "omlab/omdet-turbo-swin-tiny-hf"
 processor_omdet = AutoProcessor.from_pretrained(model_omdet_id)
+model_omdet = AutoModelForZeroShotObjectDetection.from_pretrained(model_omdet_id)
 model_owlv2_id = "google/owlv2-large-patch14-ensemble"
 processor_owlv2 = AutoProcessor.from_pretrained(model_owlv2_id)
+model_owlv2 = AutoModelForZeroShotObjectDetection.from_pretrained(model_owlv2_id)
 model_llmdet_name = extract_model_short_name(model_llmdet_id)
 model_mm_grounding_name = extract_model_short_name(model_mm_grounding_id)
 def detect(model, processor, image: Image.Image, prompts: list, threshold: float):
     t0 = time.perf_counter()
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device).eval()
     texts = [prompts]
     inputs = processor(images=image, text=texts, return_tensors="pt").to(device)
     with torch.inference_mode():
     )
     result = results[0]
     annotations = []
+    if isinstance(model, Owlv2ForObjectDetection) or isinstance(
+        model, OmDetTurboForObjectDetection
+    ):
+        key = "labels"
+        check = True
+    else:
+        key = "text_labels"
+        check = False
+    for box, score, label in zip(result["boxes"], result["scores"], result[key]):
         if score >= threshold:
+            if check:
+                label_id = label
+                label_name = prompts[label_id]
+            else:
+                label_name = label
             xmin, ymin, xmax, ymax = [int(x) for x in box.tolist()]
             annotations.append(((xmin, ymin, xmax, ymax), f"{label_name} {score:.2f}"))
     elapsed_ms = (time.perf_counter() - t0) * 1000
 def run_detection(
+    image: Image.Image,
+    prompts_str: str,
+    threshold_llm,
+    threshold_mm,
+    threshold_owlv2,
+    threshold_omdet,
 ):
     prompts = [p.strip() for p in prompts_str.split(",")]
+    ann_llm, time_llm = detect(
+        model_llmdet, processor_llmdet, image, prompts, threshold_llm
+    )
+    ann_mm, time_mm = detect(
+        model_mm_grounding, processor_mm_grounding, image, prompts, threshold_mm
+    )
+    ann_owlv2, time_owlv2 = detect(
+        model_owlv2, processor_owlv2, image, prompts, threshold_owlv2
+    )
+    ann_omdet, time_omdet = detect(
+        model_omdet, processor_omdet, image, prompts, threshold_omdet
+    )
     return (
         (image, ann_llm),
         time_llm,