CuriousDolphin committed
Commit 41eedc7 · 1 Parent(s): 1d636f3

first commit (wip)
.gitignore CHANGED
@@ -1,160 +1,3 @@
- # Byte-compiled / optimized / DLL files
- __pycache__/
- *.py[cod]
- *$py.class
-
- # C extensions
- *.so
-
- # Distribution / packaging
- .Python
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- share/python-wheels/
- *.egg-info/
- .installed.cfg
- *.egg
- MANIFEST
-
- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
- *.manifest
- *.spec
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .nox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- *.py,cover
- .hypothesis/
- .pytest_cache/
- cover/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- *.log
- local_settings.py
- db.sqlite3
- db.sqlite3-journal
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- .pybuilder/
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # IPython
- profile_default/
- ipython_config.py
-
- # pyenv
- # For a library or package, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- # .python-version
-
- # pipenv
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
- # install all needed dependencies.
- #Pipfile.lock
-
- # poetry
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
- #poetry.lock
-
- # pdm
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
- #pdm.lock
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
- # in version control.
- # https://pdm.fming.dev/#use-with-ide
- .pdm.toml
-
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
- __pypackages__/
-
- # Celery stuff
- celerybeat-schedule
- celerybeat.pid
-
- # SageMath parsed files
- *.sage.py
-
- # Environments
- .env
  .venv
+ __pycache__
+ data/cache
- env/
- venv/
- ENV/
- env.bak/
- venv.bak/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
- .dmypy.json
- dmypy.json
-
- # Pyre type checker
- .pyre/
-
- # pytype static type analyzer
- .pytype/
-
- # Cython debug symbols
- cython_debug/
-
- # PyCharm
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
- # and can be added to the global gitignore or merged into this file. For a more nuclear
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- #.idea/
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "flake8.args": [
+         "--ignore=E24,E128,E201,E202,E225,E231,E252,E265,E302,E303,E401,E402,E501,E731,W504,W605",
+         "--verbose"
+     ],
+ }
Dockerfile ADDED
File without changes
data/assets/000000039769.jpg ADDED
data/assets/dog_bike_car.jpeg ADDED
detr/__init__.py ADDED
File without changes
detr/detr.py ADDED
@@ -0,0 +1,316 @@
+ from functools import cache
+ import torch
+ import torchvision.transforms as T
+ import os
+ import numpy as np
+ from torch import nn
+ from torchvision.models import resnet50
+
+ from supervision import Detections, BoxAnnotator
+
+ torch.set_grad_enabled(False)
+
+
+ # https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_demo.ipynb#scrollTo=cfCcEYjg7y46
+
+ DETR_DEMO_WEIGHTS_URI = "https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth"
+
+ TORCH_HOME = os.path.abspath(os.curdir) + "/data/cache"
+
+ os.environ["TORCH_HOME"] = TORCH_HOME
+
+ print("Torch home:", TORCH_HOME)
+
+
+ # standard PyTorch mean-std input image normalization
+
+
+ def normalize_img(image):
+     transform = T.Compose(
+         [
+             T.Resize(800),
+             T.ToTensor(),
+             T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+         ]
+     )
+     return transform(image).unsqueeze(0)
+
+
+ # for output bounding box post-processing
+ def box_cxcywh_to_xyxy(x):
+     x_c, y_c, w, h = x.unbind(1)
+     b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
+     return torch.stack(b, dim=1)
+
+
+ def rescale_bboxes(out_bbox, size):
+     img_w, img_h = size
+     b = box_cxcywh_to_xyxy(out_bbox)
+     b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
+     return b
+
+
+ class DETRdemo(nn.Module):
+     """
+     Demo DETR implementation.
+
+     Demo implementation of DETR in minimal number of lines, with the
+     following differences wrt DETR in the paper:
+     * learned positional encoding (instead of sine)
+     * positional encoding is passed at input (instead of attention)
+     * fc bbox predictor (instead of MLP)
+     The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.
+     Only batch size 1 supported.
+     """
+
+     def __init__(
+         self,
+         num_classes,
+         hidden_dim=256,
+         nheads=8,
+         num_encoder_layers=6,
+         num_decoder_layers=6,
+     ):
+         super().__init__()
+
+         # create ResNet-50 backbone
+         self.backbone = resnet50()
+         del self.backbone.fc
+
+         # create conversion layer
+         self.conv = nn.Conv2d(2048, hidden_dim, 1)
+
+         # create a default PyTorch transformer
+         self.transformer = nn.Transformer(
+             hidden_dim, nheads, num_encoder_layers, num_decoder_layers
+         )
+
+         # prediction heads, one extra class for predicting non-empty slots
+         # note that in baseline DETR linear_bbox layer is 3-layer MLP
+         self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
+         self.linear_bbox = nn.Linear(hidden_dim, 4)
+
+         # output positional encodings (object queries)
+         self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
+
+         # spatial positional encodings
+         # note that in baseline DETR we use sine positional encodings
+         self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
+         self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
+
+     def forward(self, inputs):
+         # propagate inputs through ResNet-50 up to avg-pool layer
+         x = self.backbone.conv1(inputs)
+         x = self.backbone.bn1(x)
+         x = self.backbone.relu(x)
+         x = self.backbone.maxpool(x)
+
+         x = self.backbone.layer1(x)
+         x = self.backbone.layer2(x)
+         x = self.backbone.layer3(x)
+         x = self.backbone.layer4(x)
+
+         # convert from 2048 to 256 feature planes for the transformer
+         h = self.conv(x)
+
+         # construct positional encodings
+         H, W = h.shape[-2:]
+         pos = (
+             torch.cat(
+                 [
+                     self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
+                     self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
+                 ],
+                 dim=-1,
+             )
+             .flatten(0, 1)
+             .unsqueeze(1)
+         )
+
+         # propagate through the transformer
+         h = self.transformer(
+             pos + 0.1 * h.flatten(2).permute(2, 0, 1), self.query_pos.unsqueeze(1)
+         ).transpose(0, 1)
+
+         # finally project transformer outputs to class labels and bounding boxes
+         return {
+             "pred_logits": self.linear_class(h),
+             "pred_boxes": self.linear_bbox(h).sigmoid(),
+         }
+
+
+ class SimpleDetr:
+     @cache
+     def __init__(self):
+         self.model = DETRdemo(num_classes=91)
+         state_dict = torch.hub.load_state_dict_from_url(
+             url=DETR_DEMO_WEIGHTS_URI,
+             map_location="cpu",
+             check_hash=True,
+         )
+         self.model.load_state_dict(state_dict)
+         self.model.eval()
+         self.box_annotator: BoxAnnotator = BoxAnnotator()
+
+     def detect(self, image, conf):
+         # mean-std normalize the input image (batch-size: 1)
+         img = normalize_img(image)
+
+         # the demo model only supports, by default, images with aspect ratio between 0.5 and 2
+         # if you want to use images with an aspect ratio outside this range
+         # rescale your image so that the maximum size is at most 1333 for best results
+         assert (
+             img.shape[-2] <= 1600 and img.shape[-1] <= 1600
+         ), "demo model only supports images up to 1600 pixels on each side"
+
+         # propagate through the model
+         outputs = self.model(img)
+         # keep only predictions above the confidence threshold
+         scores = outputs["pred_logits"].softmax(-1)[0, :, :-1]
+         keep = scores.max(-1).values > conf
+         # convert boxes from [0; 1] to image scales
+         bboxes_scaled = rescale_bboxes(outputs["pred_boxes"][0, keep], image.size)
+         probas = scores[keep]
+         class_id = []
+         confidence = []
+         for prob in probas:
+             cls_id = prob.argmax()
+             c = prob[cls_id]
+             class_id.append(int(cls_id))
+             confidence.append(float(c))
+         print(class_id, confidence)
+         detections = Detections(
+             xyxy=bboxes_scaled.cpu().detach().numpy(),
+             class_id=np.array(class_id),
+             confidence=np.array(confidence),
+         )
+         annotated = self.box_annotator.annotate(
+             scene=np.array(image),
+             skip_label=False,
+             detections=detections,
+             labels=[
+                 f"{CLASSES[cls_id]} {conf:.2f}"
+                 for cls_id, conf in zip(detections.class_id, detections.confidence)
+             ],
+         )
+         return annotated
+
+
+ class PanopticDetrResenet101:
+     @cache
+     def __init__(self):
+         self.model, self.postprocessor = torch.hub.load(
+             "facebookresearch/detr",
+             "detr_resnet101_panoptic",
+             pretrained=True,
+             return_postprocessor=True,
+             num_classes=250,
+         )
+         self.model.eval()
+
+     def detect(self, image, conf):
+         # mean-std normalize the input image (batch-size: 1)
+         img = normalize_img(image)
+
+         outputs = self.model(img)
+         # keep only predictions above the confidence threshold
+         # compute the scores, excluding the "no-object" class (the last one)
+         scores = outputs["pred_logits"].softmax(-1)[..., :-1].max(-1)[0]
+         # threshold the confidence
+         keep = scores > conf
+
+
+ # COCO classes
+ CLASSES = [
+     "N/A",
+     "person",
+     "bicycle",
+     "car",
+     "motorcycle",
+     "airplane",
+     "bus",
+     "train",
+     "truck",
+     "boat",
+     "traffic light",
+     "fire hydrant",
+     "N/A",
+     "stop sign",
+     "parking meter",
+     "bench",
+     "bird",
+     "cat",
+     "dog",
+     "horse",
+     "sheep",
+     "cow",
+     "elephant",
+     "bear",
+     "zebra",
+     "giraffe",
+     "N/A",
+     "backpack",
+     "umbrella",
+     "N/A",
+     "N/A",
+     "handbag",
+     "tie",
+     "suitcase",
+     "frisbee",
+     "skis",
+     "snowboard",
+     "sports ball",
+     "kite",
+     "baseball bat",
+     "baseball glove",
+     "skateboard",
+     "surfboard",
+     "tennis racket",
+     "bottle",
+     "N/A",
+     "wine glass",
+     "cup",
+     "fork",
+     "knife",
+     "spoon",
+     "bowl",
+     "banana",
+     "apple",
+     "sandwich",
+     "orange",
+     "broccoli",
+     "carrot",
+     "hot dog",
+     "pizza",
+     "donut",
+     "cake",
+     "chair",
+     "couch",
+     "potted plant",
+     "bed",
+     "N/A",
+     "dining table",
+     "N/A",
+     "N/A",
+     "toilet",
+     "N/A",
+     "tv",
+     "laptop",
+     "mouse",
+     "remote",
+     "keyboard",
+     "cell phone",
+     "microwave",
+     "oven",
+     "toaster",
+     "sink",
+     "refrigerator",
+     "N/A",
+     "book",
+     "clock",
+     "vase",
+     "scissors",
+     "teddy bear",
+     "hair drier",
+     "toothbrush",
+ ]
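A minimal usage sketch for the module added above (not part of the commit): it assumes the repository root is the working directory so that data/cache and data/assets resolve, and the 0.7 threshold and output filename are illustrative.

from PIL import Image

from detr.detr import SimpleDetr

# instantiate the demo model; weights are downloaded into data/cache via TORCH_HOME
model = SimpleDetr()

# run detection on one of the bundled sample images
image = Image.open("data/assets/dog_bike_car.jpeg").convert("RGB")
annotated = model.detect(image, conf=0.7)  # numpy array with boxes and class labels drawn

Image.fromarray(annotated).save("annotated.jpg")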
detr/main_gradio.py ADDED
@@ -0,0 +1,66 @@
+ import gradio as gr
+ import supervision as sv
+ import os
+ from detr import SimpleDetr, PanopticDetrResenet101
+
+ ASSETS_DIR = os.path.abspath(os.curdir) + "/data/assets"
+
+ print("Assets:", ASSETS_DIR)
+
+
+ def run_inference(image, confidence, model_name, progress=gr.Progress(track_tqdm=True)):
+     progress(0.1, "loading model..")
+
+     if model_name == "detr_demo_boxes":
+         model = SimpleDetr()
+     else:
+         model = PanopticDetrResenet101()
+     progress(0.1, "Inference..")
+
+     annotated_img = model.detect(image, confidence)
+     return annotated_img, None, None
+
+
+ with gr.Blocks() as inference_gradio:
+     gr.Markdown("# DETR inference")
+     with gr.Row():
+         with gr.Column():
+             img_file = gr.Image(type="pil")
+             # with gr.Row():
+             model_name = gr.Dropdown(
+                 label="Model",
+                 scale=3,
+                 choices=["detr_demo_boxes", "detr_resnet101_panoptic"],
+                 value="detr_demo_boxes",
+             )
+
+             conf = gr.Slider(label="Confidence", minimum=0, maximum=0.99, value=0.5)
+
+             with gr.Row():
+                 start_btn = gr.Button("Start", variant="primary")
+
+         with gr.Column():
+             annotated_img = gr.Image(label="Annotated Image")
+             speed = gr.JSON(label="speed")
+             json_out = gr.JSON(label="output")
+     examples = gr.Examples(
+         examples=[
+             [path]
+             for path in sv.list_files_with_extensions(
+                 directory=ASSETS_DIR, extensions=["jpeg", "jpg"]
+             )
+         ],
+         inputs=[img_file],
+     )
+     start_btn.click(
+         fn=run_inference,
+         inputs=[img_file, conf, model_name],
+         outputs=[annotated_img, speed, json_out],
+     )
+
+ if __name__ == "__main__":
+     inference_gradio.queue(2).launch(
+         debug=True,
+         server_name="0.0.0.0",
+         server_port=7000,
+     )
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio==4.8.0
+ torch==2.1.1
+ numpy
+ matplotlib
+ torchvision
+ supervision==0.17.1