Spaces:

SkalskiP
/

YOLO-World

Runtime error

App Files Files Community

SkalskiP committed on Feb 16, 2024

Commit

39840e5

1 Parent(s): f590b07

EfficientSAM support added

Browse files

Files changed (4) hide show

.gitattributes +2 -0
app.py +38 -8
utils/__init__.py +0 -0
utils/efficient_sam.py +47 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+efficient_sam_s_cpu.jit filter=lfs diff=lfs merge=lfs -text
+efficient_sam_s_gpu.jit filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,33 +1,63 @@
 from typing import List
 import gradio as gr
 import numpy as np
 import supervision as sv
 from inference.models import YOLOWorld
 MARKDOWN = """
-# YOLO-World 🌎
 Powered by Roboflow [Inference](https://github.com/roboflow/inference) and [Supervision](https://github.com/roboflow/supervision).
 """
-MODEL = YOLOWorld(model_id="yolo_world/l")
 BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
-LABEL_ANNOTATOR = sv.LabelAnnotator(text_color=sv.Color.BLACK)
 def process_categories(categories: str) -> List[str]:
     return [category.strip() for category in categories.split(',')]
-def process_image(input_image: np.ndarray, categories: str) -> np.ndarray:
     categories = process_categories(categories)
-    MODEL.set_classes(categories)
-    results = MODEL.infer(input_image, confidence=0.003)
-    detections = sv.Detections.from_inference(results).with_nms(0.1)
     output_image = input_image.copy()
     output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
-    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
     return output_image

 from typing import List
+import torch
 import gradio as gr
 import numpy as np
 import supervision as sv
 from inference.models import YOLOWorld
+from utils.efficient_sam import load, inference_with_box
 MARKDOWN = """
+# YOLO-World 🔥 [with Efficient-SAM]
+This is a demo of zero-shot instance segmentation using [YOLO-World](https://github.com/AILab-CVC/YOLO-World) and [Efficient-SAM](https://github.com/yformer/EfficientSAM).
 Powered by Roboflow [Inference](https://github.com/roboflow/inference) and [Supervision](https://github.com/roboflow/supervision).
 """
 # Use CUDA when torch reports it as available; otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Both models are created once at import time and reused by every process_image call.
+EFFICIENT_SAM_MODEL = load(device=DEVICE)
+YOLO_WORLD_MODEL = YOLOWorld(model_id="yolo_world/l")
 # Supervision annotators used by process_image to draw boxes, masks and labels.
 BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
+MASK_ANNOTATOR = sv.MaskAnnotator()
+LABEL_ANNOTATOR = sv.LabelAnnotator()
 def process_categories(categories: str) -> List[str]:
     """Split a comma-separated string into category names.

     Each piece has its surrounding whitespace removed. Empty pieces (for
     example from a trailing comma) are kept as empty strings.
     """
     return list(map(str.strip, categories.split(',')))
+def process_image(
+        input_image: np.ndarray,
+        categories: str,
+        confidence_threshold: float = 0.003,
+        iou_threshold: float = 0.5,
+        with_segmentation: bool = True,
+        with_confidence: bool = True
+) -> np.ndarray:
     categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    results = YOLO_WORLD_MODEL.infer(input_image, confidence=confidence_threshold)
+    detections = sv.Detections.from_inference(results).with_nms(iou_threshold)
+    if with_segmentation:
+        masks = []
+        for [x_min, y_min, x_max, y_max] in detections.xyxy:
+            box = np.array([[x_min, y_min], [x_max, y_max]])
+            mask = inference_with_box(input_image, box, EFFICIENT_SAM_MODEL, DEVICE)
+            masks.append(mask)
+        detections.mask = np.array(masks)
+    labels = [
+        f"{categories[class_id]}: {confidence:.2f}" if with_confidence else f"{categories[class_id]}"
+        for class_id, confidence in
+        zip(detections.class_id, detections.confidence)
+    ]
     output_image = input_image.copy()
+    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
     output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
     return output_image

utils/__init__.py ADDED Viewed

File without changes

utils/efficient_sam.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import torch
+import numpy as np
+from torchvision.transforms import ToTensor
 # TorchScript checkpoint file names; `load` picks one of them from the device type.
+GPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_gpu.jit"
+CPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_cpu.jit"
+def load(device: torch.device) -> torch.jit.ScriptModule:
+    if device.type == "cuda":
+        model = torch.jit.load(GPU_EFFICIENT_SAM_CHECKPOINT)
+    else:
+        model = torch.jit.load(CPU_EFFICIENT_SAM_CHECKPOINT)
+    model.eval()
+    return model
+def inference_with_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
+    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
+    img_tensor = ToTensor()(image)
+    predicted_logits, predicted_iou = model(
+        img_tensor[None, ...].to(device),
+        bbox.to(device),
+        bbox_labels.to(device),
+    )
+    predicted_logits = predicted_logits.cpu()
+    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
+    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
+    max_predicted_iou = -1
+    selected_mask_using_predicted_iou = None
+    for m in range(all_masks.shape[0]):
+        curr_predicted_iou = predicted_iou[m]
+        if (
+                curr_predicted_iou > max_predicted_iou
+                or selected_mask_using_predicted_iou is None
+        ):
+            max_predicted_iou = curr_predicted_iou
+            selected_mask_using_predicted_iou = all_masks[m]
+    return selected_mask_using_predicted_iou