Spaces:
Runtime error
update
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff.
- .ipynb_checkpoints/README-checkpoint.md +11 -0
- .ipynb_checkpoints/README-checkpoint.txt +0 -1
- .ipynb_checkpoints/requirements-checkpoint.txt +0 -17
- .ipynb_checkpoints/test-checkpoint.ipynb +0 -113
- README.md +10 -0
- README.txt +0 -1
- app.py +0 -125
- groundingdino.egg-info/PKG-INFO +0 -213
- groundingdino.egg-info/SOURCES.txt +0 -46
- groundingdino.egg-info/requires.txt +0 -10
- groundingdino.egg-info/top_level.txt +0 -1
- groundingdino/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- groundingdino/.ipynb_checkpoints/version-checkpoint.py +0 -1
- groundingdino/__init__.py +0 -0
- groundingdino/__pycache__/__init__.cpython-310.pyc +0 -0
- groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py +0 -43
- groundingdino/config/GroundingDINO_SwinB_cfg.py +0 -43
- groundingdino/config/GroundingDINO_SwinT_OGC.py +0 -43
- groundingdino/config/__init__.py +0 -0
- groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py +23 -0
- groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py +649 -0
- groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py +44 -0
- groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py +258 -0
- groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py +285 -0
- groundingdino/datasets/__init__.py +23 -0
- groundingdino/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/coco.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/data_util.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/odvg.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc +0 -0
- groundingdino/datasets/__pycache__/transforms.cpython-310.pyc +0 -0
- groundingdino/datasets/coco.py +649 -0
- groundingdino/datasets/coco_eval.py +266 -0
- groundingdino/datasets/coco_panoptic.py +99 -0
- groundingdino/datasets/cocogrounding_eval.py +3 -1
- groundingdino/datasets/data_util.py +170 -0
- groundingdino/datasets/dataset.py +44 -0
- groundingdino/datasets/odvg.py +258 -0
- groundingdino/datasets/panoptic_eval.py +44 -0
- groundingdino/datasets/random_crop.py +135 -0
- groundingdino/datasets/sltransform.py +247 -0
- groundingdino/datasets/transforms.py +22 -48
- groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py +0 -18
- groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py +0 -66
- groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py +273 -0
- groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py +8 -8
.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1,11 @@
+---
+title: My Awesome Space
+emoji: 🚀
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 3.9.0
+app_file: app.py
+pinned: false
+---
+
.ipynb_checkpoints/README-checkpoint.txt
DELETED
@@ -1 +0,0 @@
-Peft-ed Grounding DINO on RSVG dataset
.ipynb_checkpoints/requirements-checkpoint.txt
DELETED
@@ -1,17 +0,0 @@
-cython
-submitit
-scipy
-termcolor
-addict
-yapf==0.40.1
-timm
-torch
-torchvision
-transformers
-numpy
-opencv-python
-supervision==0.6.0
-pycocotools
-pyyaml>3.10
-colorlog
-loralib
.ipynb_checkpoints/test-checkpoint.ipynb
DELETED
@@ -1,113 +0,0 @@
-{
-"cells": [
-{
-"cell_type": "code",
-"execution_count": 2,
-"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"final text_encoder_type: bert-base-uncased\n"
-]
-},
-{
-"data": {
-"application/json": {
-"ascii": false,
-"bar_format": null,
-"colour": null,
-"elapsed": 0.014210224151611328,
-"initial": 0,
-"n": 0,
-"ncols": null,
-"nrows": null,
-"postfix": null,
-"prefix": "Downloading model.safetensors",
-"rate": null,
-"total": 440449768,
-"unit": "B",
-"unit_divisor": 1000,
-"unit_scale": true
-},
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "5922f34578364d36afa13de9f01254bd",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading model.safetensors: 0%| | 0.00/440M [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"/root/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:881: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-" warnings.warn(\n",
-"/root/miniconda3/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
-" warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
-]
-},
-{
-"data": {
-"text/plain": [
-"True"
-]
-},
-"execution_count": 2,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"from groundingdino.util.inference import load_model, load_image, predict, annotate\n",
-"import cv2\n",
-"\n",
-"model = load_model(\"groundingdino/config/GroundingDINO_SwinT_OGC.py\", \"../04-06-segment-anything/weights/groundingdino_swint_ogc.pth\")\n",
-"IMAGE_PATH = \".asset/cat_dog.jpeg\"\n",
-"TEXT_PROMPT = \"chair . person . dog .\"\n",
-"BOX_TRESHOLD = 0.35\n",
-"TEXT_TRESHOLD = 0.25\n",
-"\n",
-"image_source, image = load_image(IMAGE_PATH)\n",
-"\n",
-"boxes, logits, phrases = predict(\n",
-" model=model,\n",
-" image=image,\n",
-" caption=TEXT_PROMPT,\n",
-" box_threshold=BOX_TRESHOLD,\n",
-" text_threshold=TEXT_TRESHOLD\n",
-")\n",
-"\n",
-"annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)\n",
-"cv2.imwrite(\"annotated_image.jpg\", annotated_frame)"
-]
-}
-],
-"metadata": {
-"kernelspec": {
-"display_name": "base",
-"language": "python",
-"name": "python3"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.8.10"
-}
-},
-"nbformat": 4,
-"nbformat_minor": 2
-}
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+title: My Awesome Space
+emoji: 🚀
+colorFrom: red
+colorTo: blue
+sdk: gradio
+sdk_version: 4.36.1
+app_file: app.py
+pinned: false
+---
README.txt
DELETED
@@ -1 +0,0 @@
-Peft-ed Grounding DINO on RSVG dataset
app.py
DELETED
@@ -1,125 +0,0 @@
-import argparse
-from functools import partial
-import cv2
-import requests
-import os
-from io import BytesIO
-from PIL import Image
-import numpy as np
-from pathlib import Path
-
-
-import warnings
-
-import torch
-
-# prepare the environment
-os.system("python setup.py build develop --user")
-os.system("pip install packaging==21.3")
-os.system("pip install gradio")
-
-
-warnings.filterwarnings("ignore")
-
-import gradio as gr
-
-from groundingdino.models import build_model
-from groundingdino.util.slconfig import SLConfig
-from groundingdino.util.utils import clean_state_dict
-from groundingdino.util.inference import annotate, load_image, predict
-import groundingdino.datasets.transforms as T
-
-from huggingface_hub import hf_hub_download
-
-
-
-# Use this command for evaluate the Grounding DINO model
-config_file = "groundingdino/config/GroundingDINO_SwinB_OGC.py"
-ckpt_repo_id = "Hasanmog/Peft-GroundingDINO"
-ckpt_filenmae = "Best.pth"
-
-
-def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
-    args = SLConfig.fromfile(model_config_path)
-    model = build_model(args)
-    args.device = device
-
-    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
-    checkpoint = torch.load(cache_file, map_location='cpu')
-    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
-    print("Model loaded from {} \n => {}".format(cache_file, log))
-    _ = model.eval()
-    return model
-
-def image_transform_grounding(init_image):
-    transform = T.Compose([
-        T.RandomResize([800], max_size=1333),
-        T.ToTensor(),
-        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-    ])
-    image, _ = transform(init_image, None) # 3, h, w
-    return init_image, image
-
-def image_transform_grounding_for_vis(init_image):
-    transform = T.Compose([
-        T.RandomResize([800], max_size=1333),
-    ])
-    image, _ = transform(init_image, None) # 3, h, w
-    return image
-
-model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
-
-def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
-    init_image = input_image.convert("RGB")
-    original_size = init_image.size
-
-    _, image_tensor = image_transform_grounding(init_image)
-    image_pil: Image = image_transform_grounding_for_vis(init_image)
-
-    # run grounding
-    boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
-    annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
-    image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
-
-
-    return image_with_box
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
-    parser.add_argument("--debug", action="store_true", help="using debug mode")
-    parser.add_argument("--share", action="store_true", help="share the app")
-    args = parser.parse_args()
-
-    block = gr.Blocks().queue()
-    with block:
-        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
-        gr.Markdown("### Open-World Detection with Grounding DINO")
-
-        with gr.Row():
-            with gr.Column():
-                input_image = gr.Image(source='upload', type="pil")
-                grounding_caption = gr.Textbox(label="Detection Prompt")
-                run_button = gr.Button(label="Run")
-                with gr.Accordion("Advanced options", open=False):
-                    box_threshold = gr.Slider(
-                        label="Box Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
-                    )
-                    text_threshold = gr.Slider(
-                        label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
-                    )
-
-            with gr.Column():
-                gallery = gr.outputs.Image(
-                    type="pil",
-                    # label="grounding results"
-                ).style(full_width=True, full_height=True)
-                # gallery = gr.Gallery(label="Generated images", show_label=False).style(
-                #     grid=[1], height="auto", container=True, full_width=True, full_height=True)
-
-        run_button.click(fn=run_grounding, inputs=[
-                input_image, grounding_caption, box_threshold, text_threshold], outputs=[gallery])
-
-
-    block.launch(server_name='0.0.0.0', server_port=7579, debug=args.debug, share=args.share)
-
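Note: the deleted app.py above builds its interface against the Gradio 3.x API (gr.Image(source='upload'), gr.Button(label=...), gr.outputs.Image with .style(...)), while the new README.md pins sdk_version: 4.36.1; those calls no longer exist in Gradio 4.x, which is a plausible source of the Space's "Runtime error". Below is a minimal, hypothetical sketch of how the same Blocks layout could be written for Gradio 4.x, reusing the run_grounding function defined in the deleted file; it is not the repository's actual replacement app.py (none is shown in this truncated diff).

    # Hypothetical Gradio 4.x rewrite of the UI section above -- a sketch, not the repo's actual code.
    import gradio as gr

    # run_grounding is assumed to be the same function defined in the deleted app.py above.
    with gr.Blocks() as demo:
        gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
        gr.Markdown("### Open-World Detection with Grounding DINO")
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil")   # the 'source=' kwarg was removed in 4.x
                grounding_caption = gr.Textbox(label="Detection Prompt")
                run_button = gr.Button("Run")        # the button label is now the positional value
                with gr.Accordion("Advanced options", open=False):
                    box_threshold = gr.Slider(0.0, 1.0, value=0.25, step=0.001, label="Box Threshold")
                    text_threshold = gr.Slider(0.0, 1.0, value=0.25, step=0.001, label="Text Threshold")
            with gr.Column():
                # gr.outputs.* and .style() are gone in 4.x; a plain Image component replaces them
                gallery = gr.Image(type="pil", label="grounding results")
        run_button.click(fn=run_grounding,
                         inputs=[input_image, grounding_caption, box_threshold, text_threshold],
                         outputs=[gallery])

    demo.queue().launch(server_name="0.0.0.0")
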
groundingdino.egg-info/PKG-INFO
DELETED
@@ -1,213 +0,0 @@
-Metadata-Version: 2.1
-Name: groundingdino
-Version: 0.1.0
-Summary: open-set object detector
-Home-page: https://github.com/IDEA-Research/GroundingDINO
-Author: International Digital Economy Academy, Shilong Liu
-License: Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-1. Definitions.
-
-"License" shall mean the terms and conditions for use, reproduction,
-and distribution as defined by Sections 1 through 9 of this document.
-
-"Licensor" shall mean the copyright owner or entity authorized by
-the copyright owner that is granting the License.
-
-"Legal Entity" shall mean the union of the acting entity and all
-other entities that control, are controlled by, or are under common
-control with that entity. For the purposes of this definition,
-"control" means (i) the power, direct or indirect, to cause the
-direction or management of such entity, whether by contract or
-otherwise, or (ii) ownership of fifty percent (50%) or more of the
-outstanding shares, or (iii) beneficial ownership of such entity.
-
-"You" (or "Your") shall mean an individual or Legal Entity
-exercising permissions granted by this License.
-
-"Source" form shall mean the preferred form for making modifications,
-including but not limited to software source code, documentation
-source, and configuration files.
-
-"Object" form shall mean any form resulting from mechanical
-transformation or translation of a Source form, including but
-not limited to compiled object code, generated documentation,
-and conversions to other media types.
-
-"Work" shall mean the work of authorship, whether in Source or
-Object form, made available under the License, as indicated by a
-copyright notice that is included in or attached to the work
-(an example is provided in the Appendix below).
-
-"Derivative Works" shall mean any work, whether in Source or Object
-form, that is based on (or derived from) the Work and for which the
-editorial revisions, annotations, elaborations, or other modifications
-represent, as a whole, an original work of authorship. For the purposes
-of this License, Derivative Works shall not include works that remain
-separable from, or merely link (or bind by name) to the interfaces of,
-the Work and Derivative Works thereof.
-
-"Contribution" shall mean any work of authorship, including
-the original version of the Work and any modifications or additions
-to that Work or Derivative Works thereof, that is intentionally
-submitted to Licensor for inclusion in the Work by the copyright owner
-or by an individual or Legal Entity authorized to submit on behalf of
-the copyright owner. For the purposes of this definition, "submitted"
-means any form of electronic, verbal, or written communication sent
-to the Licensor or its representatives, including but not limited to
-communication on electronic mailing lists, source code control systems,
-and issue tracking systems that are managed by, or on behalf of, the
-Licensor for the purpose of discussing and improving the Work, but
-excluding communication that is conspicuously marked or otherwise
-designated in writing by the copyright owner as "Not a Contribution."
-
-"Contributor" shall mean Licensor and any individual or Legal Entity
-on behalf of whom a Contribution has been received by Licensor and
-subsequently incorporated within the Work.
-
-2. Grant of Copyright License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-copyright license to reproduce, prepare Derivative Works of,
-publicly display, publicly perform, sublicense, and distribute the
-Work and such Derivative Works in Source or Object form.
-
-3. Grant of Patent License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-(except as stated in this section) patent license to make, have made,
-use, offer to sell, sell, import, and otherwise transfer the Work,
-where such license applies only to those patent claims licensable
-by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s)
-with the Work to which such Contribution(s) was submitted. If You
-institute patent litigation against any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Work
-or a Contribution incorporated within the Work constitutes direct
-or contributory patent infringement, then any patent licenses
-granted to You under this License for that Work shall terminate
-as of the date such litigation is filed.
-
-4. Redistribution. You may reproduce and distribute copies of the
-Work or Derivative Works thereof in any medium, with or without
-modifications, and in Source or Object form, provided that You
-meet the following conditions:
-
-(a) You must give any other recipients of the Work or
-Derivative Works a copy of this License; and
-
-(b) You must cause any modified files to carry prominent notices
-stating that You changed the files; and
-
-(c) You must retain, in the Source form of any Derivative Works
-that You distribute, all copyright, patent, trademark, and
-attribution notices from the Source form of the Work,
-excluding those notices that do not pertain to any part of
-the Derivative Works; and
-
-(d) If the Work includes a "NOTICE" text file as part of its
-distribution, then any Derivative Works that You distribute must
-include a readable copy of the attribution notices contained
-within such NOTICE file, excluding those notices that do not
-pertain to any part of the Derivative Works, in at least one
-of the following places: within a NOTICE text file distributed
-as part of the Derivative Works; within the Source form or
-documentation, if provided along with the Derivative Works; or,
-within a display generated by the Derivative Works, if and
-wherever such third-party notices normally appear. The contents
-of the NOTICE file are for informational purposes only and
-do not modify the License. You may add Your own attribution
-notices within Derivative Works that You distribute, alongside
-or as an addendum to the NOTICE text from the Work, provided
-that such additional attribution notices cannot be construed
-as modifying the License.
-
-You may add Your own copyright statement to Your modifications and
-may provide additional or different license terms and conditions
-for use, reproduction, or distribution of Your modifications, or
-for any such Derivative Works as a whole, provided Your use,
-reproduction, and distribution of the Work otherwise complies with
-the conditions stated in this License.
-
-5. Submission of Contributions. Unless You explicitly state otherwise,
-any Contribution intentionally submitted for inclusion in the Work
-by You to the Licensor shall be under the terms and conditions of
-this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify
-the terms of any separate license agreement you may have executed
-with Licensor regarding such Contributions.
-
-6. Trademarks. This License does not grant permission to use the trade
-names, trademarks, service marks, or product names of the Licensor,
-except as required for reasonable and customary use in describing the
-origin of the Work and reproducing the content of the NOTICE file.
-
-7. Disclaimer of Warranty. Unless required by applicable law or
-agreed to in writing, Licensor provides the Work (and each
-Contributor provides its Contributions) on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-implied, including, without limitation, any warranties or conditions
-of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-PARTICULAR PURPOSE. You are solely responsible for determining the
-appropriateness of using or redistributing the Work and assume any
-risks associated with Your exercise of permissions under this License.
-
-8. Limitation of Liability. In no event and under no legal theory,
-whether in tort (including negligence), contract, or otherwise,
-unless required by applicable law (such as deliberate and grossly
-negligent acts) or agreed to in writing, shall any Contributor be
-liable to You for damages, including any direct, indirect, special,
-incidental, or consequential damages of any character arising as a
-result of this License or out of the use or inability to use the
-Work (including but not limited to damages for loss of goodwill,
-work stoppage, computer failure or malfunction, or any and all
-other commercial damages or losses), even if such Contributor
-has been advised of the possibility of such damages.
-
-9. Accepting Warranty or Additional Liability. While redistributing
-the Work or Derivative Works thereof, You may choose to offer,
-and charge a fee for, acceptance of support, warranty, indemnity,
-or other liability obligations and/or rights consistent with this
-License. However, in accepting such obligations, You may act only
-on Your own behalf and on Your sole responsibility, not on behalf
-of any other Contributor, and only if You agree to indemnify,
-defend, and hold each Contributor harmless for any liability
-incurred by, or claims asserted against, such Contributor by reason
-of your accepting any such warranty or additional liability.
-
-END OF TERMS AND CONDITIONS
-
-APPENDIX: How to apply the Apache License to your work.
-
-To apply the Apache License to your work, attach the following
-boilerplate notice, with the fields enclosed by brackets "[]"
-replaced with your own identifying information. (Don't include
-the brackets!) The text should be enclosed in the appropriate
-comment syntax for the file format. We also recommend that a
-file or class name and description of purpose be included on the
-same "printed page" as the copyright notice for easier
-identification within third-party archives.
-
-Copyright 2023 - present, IDEA Research.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-Platform: UNKNOWN
-License-File: LICENSE
-
-UNKNOWN
-
groundingdino.egg-info/SOURCES.txt
DELETED
@@ -1,46 +0,0 @@
-LICENSE
-README.md
-setup.py
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/cuda_version.cu
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/vision.cpp
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp
-/home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.cu
-groundingdino/__init__.py
-groundingdino/version.py
-groundingdino.egg-info/PKG-INFO
-groundingdino.egg-info/SOURCES.txt
-groundingdino.egg-info/dependency_links.txt
-groundingdino.egg-info/requires.txt
-groundingdino.egg-info/top_level.txt
-groundingdino/config/GroundingDINO_SwinB_cfg.py
-groundingdino/config/GroundingDINO_SwinT_OGC.py
-groundingdino/config/__init__.py
-groundingdino/datasets/__init__.py
-groundingdino/datasets/cocogrounding_eval.py
-groundingdino/datasets/transforms.py
-groundingdino/models/__init__.py
-groundingdino/models/registry.py
-groundingdino/models/GroundingDINO/__init__.py
-groundingdino/models/GroundingDINO/bertwarper.py
-groundingdino/models/GroundingDINO/fuse_modules.py
-groundingdino/models/GroundingDINO/groundingdino.py
-groundingdino/models/GroundingDINO/ms_deform_attn.py
-groundingdino/models/GroundingDINO/transformer.py
-groundingdino/models/GroundingDINO/transformer_vanilla.py
-groundingdino/models/GroundingDINO/utils.py
-groundingdino/models/GroundingDINO/backbone/__init__.py
-groundingdino/models/GroundingDINO/backbone/backbone.py
-groundingdino/models/GroundingDINO/backbone/position_encoding.py
-groundingdino/models/GroundingDINO/backbone/swin_transformer.py
-groundingdino/util/__init__.py
-groundingdino/util/box_ops.py
-groundingdino/util/get_tokenlizer.py
-groundingdino/util/inference.py
-groundingdino/util/logger.py
-groundingdino/util/misc.py
-groundingdino/util/slconfig.py
-groundingdino/util/slio.py
-groundingdino/util/time_counter.py
-groundingdino/util/utils.py
-groundingdino/util/visualizer.py
-groundingdino/util/vl_utils.py
groundingdino.egg-info/requires.txt
DELETED
@@ -1,10 +0,0 @@
-addict
-numpy
-opencv-python
-pycocotools
-supervision
-timm
-torch
-torchvision
-transformers
-yapf
groundingdino.egg-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-groundingdino
groundingdino/.ipynb_checkpoints/__init__-checkpoint.py
DELETED
File without changes
groundingdino/.ipynb_checkpoints/version-checkpoint.py
DELETED
@@ -1 +0,0 @@
-__version__ = '0.1.0'
groundingdino/__init__.py
DELETED
File without changes
groundingdino/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (182 Bytes)
groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py
DELETED
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_B_384_22k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
groundingdino/config/GroundingDINO_SwinB_cfg.py
DELETED
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_B_384_22k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
groundingdino/config/GroundingDINO_SwinT_OGC.py
DELETED
@@ -1,43 +0,0 @@
-batch_size = 1
-modelname = "groundingdino"
-backbone = "swin_T_224_1k"
-position_embedding = "sine"
-pe_temperatureH = 20
-pe_temperatureW = 20
-return_interm_indices = [1, 2, 3]
-backbone_freeze_keywords = None
-enc_layers = 6
-dec_layers = 6
-pre_norm = False
-dim_feedforward = 2048
-hidden_dim = 256
-dropout = 0.0
-nheads = 8
-num_queries = 900
-query_dim = 4
-num_patterns = 0
-num_feature_levels = 4
-enc_n_points = 4
-dec_n_points = 4
-two_stage_type = "standard"
-two_stage_bbox_embed_share = False
-two_stage_class_embed_share = False
-transformer_activation = "relu"
-dec_pred_bbox_embed_share = True
-dn_box_noise_scale = 1.0
-dn_label_noise_ratio = 0.5
-dn_label_coef = 1.0
-dn_bbox_coef = 1.0
-embed_init_tgt = True
-dn_labelbook_size = 2000
-max_text_len = 256
-text_encoder_type = "bert-base-uncased"
-use_text_enhancer = True
-use_fusion_layer = True
-use_checkpoint = True
-use_transformer_ckpt = True
-use_text_cross_attention = True
-text_dropout = 0.0
-fusion_dropout = 0.0
-fusion_droppath = 0.1
-sub_sentence_present = True
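Note: the config modules above (the checkpoint copy and the SwinB/SwinT variants) are flat attribute files rather than executable scripts; in this codebase they are consumed through SLConfig, exactly as the deleted app.py does with SLConfig.fromfile followed by build_model. A minimal sketch of that loading path follows; the local checkpoint path is illustrative and not taken from this diff.

    # Sketch: turning a flat config module like GroundingDINO_SwinT_OGC.py into a model.
    # Mirrors load_model_hf() from the deleted app.py above; the weights path is a placeholder.
    import torch

    from groundingdino.models import build_model
    from groundingdino.util.slconfig import SLConfig
    from groundingdino.util.utils import clean_state_dict

    config_path = "groundingdino/config/GroundingDINO_SwinT_OGC.py"   # any of the configs above
    checkpoint_path = "weights/groundingdino_swint_ogc.pth"           # illustrative local path

    args = SLConfig.fromfile(config_path)   # exposes batch_size, backbone, hidden_dim, ... as attributes
    args.device = "cpu"
    model = build_model(args)

    state = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(clean_state_dict(state["model"]), strict=False)
    model.eval()
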
groundingdino/config/__init__.py
DELETED
File without changes
groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py
ADDED
@@ -0,0 +1,23 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch.utils.data
+import torchvision
+from .coco import build as build_coco
+
+
+def get_coco_api_from_dataset(dataset):
+    for _ in range(10):
+        # if isinstance(dataset, torchvision.datasets.CocoDetection):
+        #     break
+        if isinstance(dataset, torch.utils.data.Subset):
+            dataset = dataset.dataset
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
+        return dataset.coco
+
+
+def build_dataset(image_set, args, datasetinfo):
+    if datasetinfo["dataset_mode"] == 'coco':
+        return build_coco(image_set, args, datasetinfo)
+    if datasetinfo["dataset_mode"] == 'odvg':
+        from .odvg import build_odvg
+        return build_odvg(image_set, args, datasetinfo)
+    raise ValueError(f'dataset {args.dataset_file} not supported')
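Note: build_dataset above dispatches purely on datasetinfo["dataset_mode"], sending 'coco' to build_coco, 'odvg' to build_odvg, and raising for anything else. A rough, hypothetical sketch of the call shape follows; only the "dataset_mode" key comes from the code above, while the other keys and the args namespace are placeholders, since the concrete fields expected by build_coco/build_odvg live in training configs that are not part of this diff.

    # Hypothetical call shape for build_dataset(); keys other than "dataset_mode" are placeholders.
    from argparse import Namespace

    from groundingdino.datasets import build_dataset

    args = Namespace(dataset_file="rsvg_grounding")   # placeholder training options
    datasetinfo = {
        "dataset_mode": "coco",                       # or "odvg" for grounding-style annotations
        "root": "data/RSVG/images",                   # placeholder image root
        "anno": "data/RSVG/annotations/train.json",   # placeholder annotation file
    }

    train_dataset = build_dataset("train", args, datasetinfo)
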
groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py
ADDED
@@ -0,0 +1,649 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO dataset which returns image_id for evaluation.
+
+Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
+"""
+if __name__=="__main__":
+    # for debug only
+    import os, sys
+    sys.path.append(os.path.dirname(sys.path[0]))
+from torchvision.datasets.vision import VisionDataset
+
+import json
+from pathlib import Path
+import random
+import os
+from typing import Any, Callable, List, Optional, Tuple
+
+from PIL import Image
+
+import torch
+import torch.utils.data
+import torchvision
+from pycocotools import mask as coco_mask
+
+from datasets.data_util import preparing_dataset
+import datasets.transforms as T
+from util.box_ops import box_cxcywh_to_xyxy, box_iou
+
+__all__ = ['build']
+
+
+class label2compat():
+    def __init__(self) -> None:
+        self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
+        self.category_map = {int(k):v for k,v in self.category_map_str.items()}
+
+    def __call__(self, target, img=None):
+        labels = target['labels']
+        res = torch.zeros(labels.shape, dtype=labels.dtype)
+        for idx, item in enumerate(labels):
+            res[idx] = self.category_map[item.item()] - 1
+        target['label_compat'] = res
+        if img is not None:
+            return target, img
+        else:
+            return target
+
+
+class label_compat2onehot():
+    def __init__(self, num_class=80, num_output_objs=1):
+        self.num_class = num_class
+        self.num_output_objs = num_output_objs
+        if num_output_objs != 1:
+            raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")
+
+    def __call__(self, target, img=None):
+        labels = target['label_compat']
+        place_dict = {k:0 for k in range(self.num_class)}
+        if self.num_output_objs == 1:
+            res = torch.zeros(self.num_class)
+            for i in labels:
+                itm = i.item()
+                res[itm] = 1.0
+        else:
+            # compat with baseline
+            res = torch.zeros(self.num_class, self.num_output_objs)
+            for i in labels:
+                itm = i.item()
+                res[itm][place_dict[itm]] = 1.0
+                place_dict[itm] += 1
+        target['label_compat_onehot'] = res
+        if img is not None:
+            return target, img
+        else:
+            return target
+
+
+class box_label_catter():
+    def __init__(self):
+        pass
+
+    def __call__(self, target, img=None):
+        labels = target['label_compat']
+        boxes = target['boxes']
+        box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
+        target['box_label'] = box_label
+        if img is not None:
+            return target, img
+        else:
+            return target
+
+
+class RandomSelectBoxlabels():
+    def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
+                    prob_first_item = 0.0,
+                    prob_random_item = 0.0,
+                    prob_last_item = 0.8,
+                    prob_stop_sign = 0.2
+                ) -> None:
+        self.num_classes = num_classes
+        self.leave_one_out = leave_one_out
+        self.blank_prob = blank_prob
+
+        self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)
+
+
+    def get_state(self):
+        return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]
+
+    def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
+        sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
+        assert sum_prob - 1 < 1e-6, \
+            f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
+            + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
+            + f"prob_stop_sign:{prob_stop_sign}"
+
+        self.prob_first_item = prob_first_item
+        self.prob_random_item = prob_random_item
+        self.prob_last_item = prob_last_item
+        self.prob_stop_sign = prob_stop_sign
+
+
+    def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
+        box_label_known = torch.Tensor(0,5)
+        box_label_unknown = box_label
+        return box_label_known, box_label_unknown
+
+    def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
+        n_select = int(random.random() * box_label.shape[0])
+        box_label = box_label[torch.randperm(box_label.shape[0])]
+        box_label_known = box_label[:n_select]
+        box_label_unknown = box_label[n_select:]
+        return box_label_known, box_label_unknown
+
+    def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
+        box_label_perm = box_label[torch.randperm(box_label.shape[0])]
+        known_label_list = []
+        box_label_known = []
+        box_label_unknown = []
+        for item in box_label_perm:
+            label_i = item[4].item()
+            if label_i in known_label_list:
+                box_label_known.append(item)
+            else:
+                # first item
+                box_label_unknown.append(item)
+                known_label_list.append(label_i)
+        box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
+        box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
+        return box_label_known, box_label_unknown
+
+    def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
+        box_label_unknown = torch.Tensor(0,5)
+        box_label_known = box_label
+        return box_label_known, box_label_unknown
+
+    def __call__(self, target, img=None):
+        box_label = target['box_label'] # K, 5
+
+        dice_number = random.random()
+
+        if dice_number < self.prob_first_item:
+            box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
+        elif dice_number < self.prob_first_item + self.prob_random_item:
+            box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
+        elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
+            box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
+        else:
+            box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)
+
+        target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
+        target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
+        target['box_label_known'] = box_label_known
+        target['box_label_unknown'] = box_label_unknown
+
+        return target, img
+
+
+class RandomDrop():
+    def __init__(self, p=0.2) -> None:
+        self.p = p
+
+    def __call__(self, target, img=None):
+        known_box = target['box_label_known']
+        num_known_box = known_box.size(0)
+        idxs = torch.rand(num_known_box)
+        # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
+        target['box_label_known'] = known_box[idxs > self.p]
+        return target, img
+
+
+class BboxPertuber():
+    def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
+        self.max_ratio = max_ratio
+        self.generate_samples = generate_samples
+        self.samples = self.generate_pertube_samples()
+        self.idx = 0
+
+    def generate_pertube_samples(self):
+        import torch
+        samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
+        return samples
+
+    def __call__(self, target, img):
+        known_box = target['box_label_known'] # Tensor(K,5), K known bbox
+        K = known_box.shape[0]
+        known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
+        if K == 0:
+            pass
+        else:
+            if self.idx + K > self.generate_samples:
+                self.idx = 0
+            delta = self.samples[self.idx: self.idx + K, :]
+            known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
+            iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
+            known_box_pertube[:, 4].copy_(iou)
+            known_box_pertube[:, -1].copy_(known_box[:, -1])
+
+        target['box_label_known_pertube'] = known_box_pertube
+        return target, img
+
+
+class RandomCutout():
+    def __init__(self, factor=0.5) -> None:
+        self.factor = factor
+
+    def __call__(self, target, img=None):
+        unknown_box = target['box_label_unknown'] # Ku, 5
+        known_box = target['box_label_known_pertube'] # Kk, 6
+        Ku = unknown_box.size(0)
+
+        known_box_add = torch.zeros(Ku, 6) # Ku, 6
+        known_box_add[:, :5] = unknown_box
+        known_box_add[:, 5].uniform_(0.5, 1)
+
+
+        known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
+        known_box_add[:, 2:4] /= 2
+
+        target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
+        return target, img
+
+
+class RandomSelectBoxes():
+    def __init__(self, num_class=80) -> None:
+        Warning("This is such a slow function and will be deprecated soon!!!")
+        self.num_class = num_class
+
+    def __call__(self, target, img=None):
+        boxes = target['boxes']
+        labels = target['label_compat']
+
+        # transform to list of tensors
+        boxs_list = [[] for i in range(self.num_class)]
+        for idx, item in enumerate(boxes):
+            label = labels[idx].item()
+            boxs_list[label].append(item)
+        boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]
+
+        # random selection
+        box_known = []
+        box_unknown = []
+        for idx, item in enumerate(boxs_list_tensor):
+            ncnt = item.shape[0]
+            nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint
+
+            item = item[torch.randperm(ncnt)]
+            # random.shuffle(item)
+            box_known.append(item[:nselect])
+            box_unknown.append(item[nselect:])
+
+        # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
+        # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
+        # print('box_unknown_tensor:', box_unknown_tensor)
+        target['known_box'] = box_known
+        target['unknown_box'] = box_unknown
+        return target, img
+
+
+def label2onehot(label, num_classes):
+    """
+    label: Tensor(K)
+    """
+    res = torch.zeros(num_classes)
+    for i in label:
+        itm = int(i.item())
+        res[itm] = 1.0
+    return res
+
+
+class MaskCrop():
+    def __init__(self) -> None:
+        pass
+
+    def __call__(self, target, img):
+        known_box = target['known_box']
+        h,w = img.shape[1:] # h,w
+        # imgsize = target['orig_size'] # h,w
+
+        scale = torch.Tensor([w, h, w, h])
+
+        # _cnt = 0
+        for boxes in known_box:
+            if boxes.shape[0] == 0:
+                continue
+            box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
+            for box in box_xyxy:
+                x1, y1, x2, y2 = [int(i) for i in box.tolist()]
+                img[:, y1:y2, x1:x2] = 0
+                # _cnt += 1
+        # print("_cnt:", _cnt)
+        return target, img
+
+
+dataset_hook_register = {
+    'label2compat': label2compat,
+    'label_compat2onehot': label_compat2onehot,
+    'box_label_catter': box_label_catter,
+    'RandomSelectBoxlabels': RandomSelectBoxlabels,
+    'RandomSelectBoxes': RandomSelectBoxes,
+    'MaskCrop': MaskCrop,
+    'BboxPertuber': BboxPertuber,
+}
+
+
+class CocoDetection(torchvision.datasets.CocoDetection):
+    def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
+        super(CocoDetection, self).__init__(img_folder, ann_file)
+        self._transforms = transforms
+        self.prepare = ConvertCocoPolysToMask(return_masks)
+        self.aux_target_hacks = aux_target_hacks
+
+    def change_hack_attr(self, hackclassname, attrkv_dict):
+        target_class = dataset_hook_register[hackclassname]
+        for item in self.aux_target_hacks:
+            if isinstance(item, target_class):
+                for k,v in attrkv_dict.items():
+                    setattr(item, k, v)
+
+    def get_hack(self, hackclassname):
+        target_class = dataset_hook_register[hackclassname]
+        for item in self.aux_target_hacks:
+            if isinstance(item, target_class):
+                return item
+
+    def _load_image(self, id: int) -> Image.Image:
+        path = self.coco.loadImgs(id)[0]["file_name"]
+        abs_path = os.path.join(self.root, path)
+        return Image.open(abs_path).convert("RGB")
+
+    def __getitem__(self, idx):
+        """
+        Output:
+            - target: dict of multiple items
+                - boxes: Tensor[num_box, 4]. \
+                    Init type: x0,y0,x1,y1. unnormalized data.
+                    Final type: cx,cy,w,h. normalized data.
+        """
+        try:
+            img, target = super(CocoDetection, self).__getitem__(idx)
+        except:
+            print("Error idx: {}".format(idx))
+            idx += 1
+            img, target = super(CocoDetection, self).__getitem__(idx)
+        image_id = self.ids[idx]
+        target = {'image_id': image_id, 'annotations': target}
+        img, target = self.prepare(img, target)
+
+        if self._transforms is not None:
+            img, target = self._transforms(img, target)
+
+        # convert to needed format
+        if self.aux_target_hacks is not None:
+            for hack_runner in self.aux_target_hacks:
+                target, img = hack_runner(target, img=img)
+
+        return img, target
+
+
+def convert_coco_poly_to_mask(segmentations, height, width):
+    masks = []
+
for polygons in segmentations:
|
| 384 |
+
rles = coco_mask.frPyObjects(polygons, height, width)
|
| 385 |
+
mask = coco_mask.decode(rles)
|
| 386 |
+
if len(mask.shape) < 3:
|
| 387 |
+
mask = mask[..., None]
|
| 388 |
+
mask = torch.as_tensor(mask, dtype=torch.uint8)
|
| 389 |
+
mask = mask.any(dim=2)
|
| 390 |
+
masks.append(mask)
|
| 391 |
+
if masks:
|
| 392 |
+
masks = torch.stack(masks, dim=0)
|
| 393 |
+
else:
|
| 394 |
+
masks = torch.zeros((0, height, width), dtype=torch.uint8)
|
| 395 |
+
return masks
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
class ConvertCocoPolysToMask(object):
|
| 399 |
+
def __init__(self, return_masks=False):
|
| 400 |
+
self.return_masks = return_masks
|
| 401 |
+
|
| 402 |
+
def __call__(self, image, target):
|
| 403 |
+
w, h = image.size
|
| 404 |
+
|
| 405 |
+
image_id = target["image_id"]
|
| 406 |
+
image_id = torch.tensor([image_id])
|
| 407 |
+
|
| 408 |
+
anno = target["annotations"]
|
| 409 |
+
|
| 410 |
+
anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
|
| 411 |
+
|
| 412 |
+
boxes = [obj["bbox"] for obj in anno]
|
| 413 |
+
# guard against no boxes via resizing
|
| 414 |
+
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
|
| 415 |
+
boxes[:, 2:] += boxes[:, :2]
|
| 416 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
| 417 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
| 418 |
+
|
| 419 |
+
classes = [obj["category_id"] for obj in anno]
|
| 420 |
+
classes = torch.tensor(classes, dtype=torch.int64)
|
| 421 |
+
|
| 422 |
+
if self.return_masks:
|
| 423 |
+
segmentations = [obj["segmentation"] for obj in anno]
|
| 424 |
+
masks = convert_coco_poly_to_mask(segmentations, h, w)
|
| 425 |
+
|
| 426 |
+
keypoints = None
|
| 427 |
+
if anno and "keypoints" in anno[0]:
|
| 428 |
+
keypoints = [obj["keypoints"] for obj in anno]
|
| 429 |
+
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
|
| 430 |
+
num_keypoints = keypoints.shape[0]
|
| 431 |
+
if num_keypoints:
|
| 432 |
+
keypoints = keypoints.view(num_keypoints, -1, 3)
|
| 433 |
+
|
| 434 |
+
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
|
| 435 |
+
boxes = boxes[keep]
|
| 436 |
+
classes = classes[keep]
|
| 437 |
+
if self.return_masks:
|
| 438 |
+
masks = masks[keep]
|
| 439 |
+
if keypoints is not None:
|
| 440 |
+
keypoints = keypoints[keep]
|
| 441 |
+
|
| 442 |
+
target = {}
|
| 443 |
+
target["boxes"] = boxes
|
| 444 |
+
target["labels"] = classes
|
| 445 |
+
if self.return_masks:
|
| 446 |
+
target["masks"] = masks
|
| 447 |
+
target["image_id"] = image_id
|
| 448 |
+
if keypoints is not None:
|
| 449 |
+
target["keypoints"] = keypoints
|
| 450 |
+
|
| 451 |
+
# for conversion to coco api
|
| 452 |
+
area = torch.tensor([obj["area"] for obj in anno])
|
| 453 |
+
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
|
| 454 |
+
target["area"] = area[keep]
|
| 455 |
+
target["iscrowd"] = iscrowd[keep]
|
| 456 |
+
|
| 457 |
+
target["orig_size"] = torch.as_tensor([int(h), int(w)])
|
| 458 |
+
target["size"] = torch.as_tensor([int(h), int(w)])
|
| 459 |
+
|
| 460 |
+
return image, target
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
|
| 464 |
+
|
| 465 |
+
normalize = T.Compose([
|
| 466 |
+
T.ToTensor(),
|
| 467 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
| 468 |
+
])
|
| 469 |
+
|
| 470 |
+
# config the params for data aug
|
| 471 |
+
scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
|
| 472 |
+
max_size = 1333
|
| 473 |
+
scales2_resize = [400, 500, 600]
|
| 474 |
+
scales2_crop = [384, 600]
|
| 475 |
+
|
| 476 |
+
# update args from config files
|
| 477 |
+
scales = getattr(args, 'data_aug_scales', scales)
|
| 478 |
+
max_size = getattr(args, 'data_aug_max_size', max_size)
|
| 479 |
+
scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
|
| 480 |
+
scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
|
| 481 |
+
|
| 482 |
+
# resize them
|
| 483 |
+
data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
|
| 484 |
+
if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
|
| 485 |
+
data_aug_scale_overlap = float(data_aug_scale_overlap)
|
| 486 |
+
scales = [int(i*data_aug_scale_overlap) for i in scales]
|
| 487 |
+
max_size = int(max_size*data_aug_scale_overlap)
|
| 488 |
+
scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
|
| 489 |
+
scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
|
| 490 |
+
|
| 491 |
+
datadict_for_print = {
|
| 492 |
+
'scales': scales,
|
| 493 |
+
'max_size': max_size,
|
| 494 |
+
'scales2_resize': scales2_resize,
|
| 495 |
+
'scales2_crop': scales2_crop
|
| 496 |
+
}
|
| 497 |
+
# print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
|
| 498 |
+
|
| 499 |
+
if image_set == 'train':
|
| 500 |
+
if fix_size:
|
| 501 |
+
return T.Compose([
|
| 502 |
+
T.RandomHorizontalFlip(),
|
| 503 |
+
T.RandomResize([(max_size, max(scales))]),
|
| 504 |
+
# T.RandomResize([(512, 512)]),
|
| 505 |
+
normalize,
|
| 506 |
+
])
|
| 507 |
+
|
| 508 |
+
if strong_aug:
|
| 509 |
+
import datasets.sltransform as SLT
|
| 510 |
+
|
| 511 |
+
return T.Compose([
|
| 512 |
+
T.RandomHorizontalFlip(),
|
| 513 |
+
T.RandomSelect(
|
| 514 |
+
T.RandomResize(scales, max_size=max_size),
|
| 515 |
+
T.Compose([
|
| 516 |
+
T.RandomResize(scales2_resize),
|
| 517 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 518 |
+
T.RandomResize(scales, max_size=max_size),
|
| 519 |
+
])
|
| 520 |
+
),
|
| 521 |
+
SLT.RandomSelectMulti([
|
| 522 |
+
SLT.RandomCrop(),
|
| 523 |
+
SLT.LightingNoise(),
|
| 524 |
+
SLT.AdjustBrightness(2),
|
| 525 |
+
SLT.AdjustContrast(2),
|
| 526 |
+
]),
|
| 527 |
+
normalize,
|
| 528 |
+
])
|
| 529 |
+
|
| 530 |
+
return T.Compose([
|
| 531 |
+
T.RandomHorizontalFlip(),
|
| 532 |
+
T.RandomSelect(
|
| 533 |
+
T.RandomResize(scales, max_size=max_size),
|
| 534 |
+
T.Compose([
|
| 535 |
+
T.RandomResize(scales2_resize),
|
| 536 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 537 |
+
T.RandomResize(scales, max_size=max_size),
|
| 538 |
+
])
|
| 539 |
+
),
|
| 540 |
+
normalize,
|
| 541 |
+
])
|
| 542 |
+
|
| 543 |
+
if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
|
| 544 |
+
|
| 545 |
+
if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
|
| 546 |
+
print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
|
| 547 |
+
return T.Compose([
|
| 548 |
+
T.ResizeDebug((1280, 800)),
|
| 549 |
+
normalize,
|
| 550 |
+
])
|
| 551 |
+
|
| 552 |
+
return T.Compose([
|
| 553 |
+
T.RandomResize([max(scales)], max_size=max_size),
|
| 554 |
+
normalize,
|
| 555 |
+
])
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
raise ValueError(f'unknown {image_set}')
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
def get_aux_target_hacks_list(image_set, args):
|
| 563 |
+
if args.modelname in ['q2bs_mask', 'q2bs']:
|
| 564 |
+
aux_target_hacks_list = [
|
| 565 |
+
label2compat(),
|
| 566 |
+
label_compat2onehot(),
|
| 567 |
+
RandomSelectBoxes(num_class=args.num_classes)
|
| 568 |
+
]
|
| 569 |
+
if args.masked_data and image_set == 'train':
|
| 570 |
+
# aux_target_hacks_list.append()
|
| 571 |
+
aux_target_hacks_list.append(MaskCrop())
|
| 572 |
+
elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
|
| 573 |
+
aux_target_hacks_list = [
|
| 574 |
+
label2compat(),
|
| 575 |
+
label_compat2onehot(),
|
| 576 |
+
box_label_catter(),
|
| 577 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 578 |
+
prob_first_item=args.prob_first_item,
|
| 579 |
+
prob_random_item=args.prob_random_item,
|
| 580 |
+
prob_last_item=args.prob_last_item,
|
| 581 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 582 |
+
),
|
| 583 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 584 |
+
]
|
| 585 |
+
elif args.modelname in ['q2omask', 'q2osa']:
|
| 586 |
+
if args.coco_aug:
|
| 587 |
+
aux_target_hacks_list = [
|
| 588 |
+
label2compat(),
|
| 589 |
+
label_compat2onehot(),
|
| 590 |
+
box_label_catter(),
|
| 591 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 592 |
+
prob_first_item=args.prob_first_item,
|
| 593 |
+
prob_random_item=args.prob_random_item,
|
| 594 |
+
prob_last_item=args.prob_last_item,
|
| 595 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 596 |
+
),
|
| 597 |
+
RandomDrop(p=0.2),
|
| 598 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 599 |
+
RandomCutout(factor=0.5)
|
| 600 |
+
]
|
| 601 |
+
else:
|
| 602 |
+
aux_target_hacks_list = [
|
| 603 |
+
label2compat(),
|
| 604 |
+
label_compat2onehot(),
|
| 605 |
+
box_label_catter(),
|
| 606 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 607 |
+
prob_first_item=args.prob_first_item,
|
| 608 |
+
prob_random_item=args.prob_random_item,
|
| 609 |
+
prob_last_item=args.prob_last_item,
|
| 610 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 611 |
+
),
|
| 612 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 613 |
+
]
|
| 614 |
+
else:
|
| 615 |
+
aux_target_hacks_list = None
|
| 616 |
+
|
| 617 |
+
return aux_target_hacks_list
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
def build(image_set, args, datasetinfo):
|
| 621 |
+
img_folder = datasetinfo["root"]
|
| 622 |
+
ann_file = datasetinfo["anno"]
|
| 623 |
+
|
| 624 |
+
# copy to local path
|
| 625 |
+
if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
|
| 626 |
+
preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
|
| 627 |
+
|
| 628 |
+
try:
|
| 629 |
+
strong_aug = args.strong_aug
|
| 630 |
+
except:
|
| 631 |
+
strong_aug = False
|
| 632 |
+
print(img_folder, ann_file)
|
| 633 |
+
dataset = CocoDetection(img_folder, ann_file,
|
| 634 |
+
transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
|
| 635 |
+
return_masks=args.masks,
|
| 636 |
+
aux_target_hacks=None,
|
| 637 |
+
)
|
| 638 |
+
return dataset
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
if __name__ == "__main__":
|
| 642 |
+
# Objects365 Val example
|
| 643 |
+
dataset_o365 = CocoDetection(
|
| 644 |
+
'/path/Objects365/train/',
|
| 645 |
+
"/path/Objects365/slannos/anno_preprocess_train_v2.json",
|
| 646 |
+
transforms=None,
|
| 647 |
+
return_masks=False,
|
| 648 |
+
)
|
| 649 |
+
print('len(dataset_o365):', len(dataset_o365))
|
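Note (not part of the commit): a minimal usage sketch for the build() helper above. The paths are placeholders, and the Namespace only carries the fields that build() and make_coco_transforms() actually read; everything else falls back to defaults.

from argparse import Namespace

args = Namespace(fix_size=False, masks=False, strong_aug=False)
datasetinfo = {
    "root": "/path/to/images",                    # placeholder image folder
    "anno": "/path/to/instances_train.json",      # placeholder COCO-style annotation file
}
dataset = build("train", args, datasetinfo)
img, target = dataset[0]  # img: normalized CHW tensor; target["boxes"]: cx,cy,w,h in [0, 1]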
groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py ADDED
@@ -0,0 +1,44 @@
from __future__ import print_function

import torch
import torchvision.datasets as datasets
from torch.utils.data import Dataset
from PIL import Image
from .tsv_io import TSVFile
import numpy as np
import base64
import io


class TSVDataset(Dataset):
    """ TSV dataset for ImageNet 1K training
    """
    def __init__(self, tsv_file, transform=None, target_transform=None):
        self.tsv = TSVFile(tsv_file)
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        row = self.tsv.seek(index)
        image_data = base64.b64decode(row[-1])
        image = Image.open(io.BytesIO(image_data))
        image = image.convert('RGB')
        target = int(row[1])

        if self.transform is not None:
            img = self.transform(image)
        else:
            img = image
        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return self.tsv.num_rows()
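Note (not part of the commit): a sketch of the row layout TSVDataset.__getitem__ assumes, with the column positions inferred from the code above; the key and label values here are made up. The snippet round-trips a tiny image through the same base64 decoding path.

import base64, io
from PIL import Image

img = Image.new("RGB", (8, 8), color=(255, 0, 0))
buf = io.BytesIO()
img.save(buf, format="JPEG")
# one TSV row: [image_key, class_index, ..., base64-encoded image bytes]
row = ["img_0001", "3", base64.b64encode(buf.getvalue()).decode("ascii")]

decoded = Image.open(io.BytesIO(base64.b64decode(row[-1]))).convert("RGB")
label = int(row[1])
print(decoded.size, label)  # (8, 8) 3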
groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py ADDED
@@ -0,0 +1,258 @@
from torchvision.datasets.vision import VisionDataset
import os.path
from typing import Callable, Optional
import json
from PIL import Image
import torch
import random
import os, sys
sys.path.append(os.path.dirname(sys.path[0]))

import datasets.transforms as T

class ODVGDataset(VisionDataset):
    """
    Args:
        root (string): Root directory where images are downloaded to.
        anno (string): Path to json annotation file.
        label_map_anno (string): Path to json label mapping file. Only for Object Detection
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.PILToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(
        self,
        root: str,
        anno: str,
        label_map_anno: str = None,
        max_labels: int = 80,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, transforms, transform, target_transform)
        self.root = root
        self.dataset_mode = "OD" if label_map_anno else "VG"
        self.max_labels = max_labels
        if self.dataset_mode == "OD":
            self.load_label_map(label_map_anno)
        self._load_metas(anno)
        self.get_dataset_info()

    def load_label_map(self, label_map_anno):
        with open(label_map_anno, 'r') as file:
            self.label_map = json.load(file)
        self.label_index = set(self.label_map.keys())

    def _load_metas(self, anno):
        with open(anno, 'r') as f:
            self.metas = json.load(f)

    def get_dataset_info(self):
        print(f"  == total images: {len(self)}")
        if self.dataset_mode == "OD":
            print(f"  == total labels: {len(self.label_map)}")

    def __getitem__(self, index: int):
        meta = self.metas[index]
        rel_path = meta["filename"]
        abs_path = os.path.join(self.root, rel_path)
        if not os.path.exists(abs_path):
            raise FileNotFoundError(f"{abs_path} not found.")
        image = Image.open(abs_path).convert('RGB')
        w, h = image.size
        if self.dataset_mode == "OD":
            anno = meta["detection"]
            instances = [obj for obj in anno["instances"]]
            boxes = [obj["bbox"] for obj in instances]
            # generate vg_labels
            # pos bbox labels
            ori_classes = [str(obj["label"]) for obj in instances]
            pos_labels = set(ori_classes)
            # neg bbox labels
            neg_labels = self.label_index.difference(pos_labels)

            vg_labels = list(pos_labels)
            num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
            if num_to_add > 0:
                vg_labels.extend(random.sample(neg_labels, num_to_add))

            # shuffle
            for i in range(len(vg_labels)-1, 0, -1):
                j = random.randint(0, i)
                vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]

            caption_list = [self.label_map[lb] for lb in vg_labels]
            caption_dict = {item:index for index, item in enumerate(caption_list)}

            caption = ' . '.join(caption_list) + ' .'
            classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
            classes = torch.tensor(classes, dtype=torch.int64)
        elif self.dataset_mode == "VG":
            anno = meta["Grounding"]
            instances = [obj for obj in anno["regions"]]
            boxes = [obj["bbox"] for obj in instances]
            caption_list = [obj["phrase"] for obj in instances]
            c = list(zip(boxes, caption_list))
            random.shuffle(c)
            boxes[:], caption_list[:] = zip(*c)
            uni_caption_list = list(set(caption_list))
            label_map = {}
            for idx in range(len(uni_caption_list)):
                label_map[uni_caption_list[idx]] = idx
            classes = [label_map[cap] for cap in caption_list]
            caption = ' . '.join(uni_caption_list) + ' .'
            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
            classes = torch.tensor(classes, dtype=torch.int64)
            caption_list = uni_caption_list
        # print("caption_list" , caption_list)
        # print("caption" , caption)
        # print("boxes" , boxes)
        target = {}
        target["image_id"] = rel_path.strip(".jpg")
        target["size"] = torch.as_tensor([int(h), int(w)])
        target["cap_list"] = caption_list
        target["caption"] = caption
        target["boxes"] = boxes
        target["labels"] = classes
        # print(" image_id " , target["image_id"])
        # size, cap_list, caption, bboxes, labels

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target


    def __len__(self) -> int:
        return len(self.metas)


def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # config the params for data aug
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
    max_size = 1333
    scales2_resize = [400, 500, 600]
    scales2_crop = [384, 600]

    # update args from config files
    scales = getattr(args, 'data_aug_scales', scales)
    max_size = getattr(args, 'data_aug_max_size', max_size)
    scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
    scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)

    # resize them
    data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
    if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
        data_aug_scale_overlap = float(data_aug_scale_overlap)
        scales = [int(i*data_aug_scale_overlap) for i in scales]
        max_size = int(max_size*data_aug_scale_overlap)
        scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
        scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]

    # datadict_for_print = {
    #     'scales': scales,
    #     'max_size': max_size,
    #     'scales2_resize': scales2_resize,
    #     'scales2_crop': scales2_crop
    # }
    # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))

    if image_set == 'train':
        if fix_size:
            return T.Compose([
                T.RandomHorizontalFlip(),
                T.RandomResize([(max_size, max(scales))]),
                normalize,
            ])

        if strong_aug:
            import datasets.sltransform as SLT

            return T.Compose([
                T.RandomHorizontalFlip(),
                T.RandomSelect(
                    T.RandomResize(scales, max_size=max_size),
                    T.Compose([
                        T.RandomResize(scales2_resize),
                        T.RandomSizeCrop(*scales2_crop),
                        T.RandomResize(scales, max_size=max_size),
                    ])
                ),
                SLT.RandomSelectMulti([
                    SLT.RandomCrop(),
                    SLT.LightingNoise(),
                    SLT.AdjustBrightness(2),
                    SLT.AdjustContrast(2),
                ]),
                normalize,
            ])

        return T.Compose([
            T.RandomHorizontalFlip(),
            T.RandomSelect(
                T.RandomResize(scales, max_size=max_size),
                T.Compose([
                    T.RandomResize(scales2_resize),
                    T.RandomSizeCrop(*scales2_crop),
                    T.RandomResize(scales, max_size=max_size),
                ])
            ),
            normalize,
        ])

    if image_set in ['val', 'eval_debug', 'train_reg', 'test']:

        if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
            print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
            return T.Compose([
                T.ResizeDebug((1280, 800)),
                normalize,
            ])

        return T.Compose([
            T.RandomResize([max(scales)], max_size=max_size),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')

def build_odvg(image_set, args, datasetinfo):
    img_folder = datasetinfo["root"]
    ann_file = datasetinfo["anno"]
    label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
    try:
        strong_aug = args.strong_aug
    except:
        strong_aug = False  # False originally
    print(img_folder, ann_file, label_map)
    dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
            transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
    )
    return dataset


if __name__=="__main__":
    dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
    print(len(dataset_vg))
    data = dataset_vg[random.randint(0, 100)]
    print(data)
    dataset_od = ODVGDataset("pathl/V3Det/",
        "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
        "path/V3Det/annotations/v3det_label_map.json",
    )
    print(len(dataset_od))
    data = dataset_od[random.randint(0, 100)]
    print(data)
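Note (not part of the commit): a sketch of the OD-mode record shape that ODVGDataset.__getitem__ reads above; the file name, category names, and boxes below are made up for illustration.

label_map = {"0": "airplane", "1": "storage tank", "2": "baseball field"}

meta = {
    "filename": "P0001.jpg",
    "detection": {
        "instances": [
            {"bbox": [10.0, 20.0, 110.0, 120.0], "label": 0},
            {"bbox": [200.0, 40.0, 260.0, 90.0], "label": 2},
        ]
    },
}
# Positive labels for this record are {"0", "2"}; negatives are sampled from the remaining
# label_map keys (up to max_labels), shuffled, and joined into a caption such as
# "airplane . baseball field . storage tank ."; each box's class index then points into that caption list.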
groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py ADDED
@@ -0,0 +1,285 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Transforms and data augmentation for both image + bbox.
"""
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from util.box_ops import box_xyxy_to_cxcywh
from util.misc import interpolate


def crop(image, target, region):
    cropped_image = F.crop(image, *region)

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    fields = ["labels", "area"]

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for field in fields:
            target[field] = target[field][keep]

    return cropped_image, target


def hflip(image, target):
    flipped_image = F.hflip(image)

    w, h = image.size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def resize(image, target, size, max_size=None):
    # size can be min_size (scalar) or (w, h) tuple

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        else:
            return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(image.size, size, max_size)
    rescaled_image = F.resize(image, size)

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        target['masks'] = interpolate(
            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5

    return rescaled_image, target


def pad(image, target, padding):
    # assumes that we only pad on the bottom right corners
    padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class ResizeDebug(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        return resize(img, target, self.size)


class RandomCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):
        w = random.randint(self.min_size, min(img.width, self.max_size))
        h = random.randint(self.min_size, min(img.height, self.max_size))
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        image_width, image_height = img.size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class RandomHorizontalFlip(object):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return hflip(img, target)
        return img, target


class RandomResize(object):
    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    def __call__(self, img, target):
        return F.to_tensor(img), target


class RandomErasing(object):

    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None
        target = target.copy()
        h, w = image.shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += "    {0}".format(t)
        format_string += "\n)"
        return format_string
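Note (not part of the commit): a self-contained sketch of the box bookkeeping the Normalize transform above performs, using the same xyxy-to-cxcywh formula that the util.box_ops helper implements; the image size and box values are arbitrary.

import torch

def box_xyxy_to_cxcywh(b):
    # convert corner boxes to center/size form
    x0, y0, x1, y1 = b.unbind(-1)
    return torch.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], dim=-1)

h, w = 480, 640
boxes_xyxy = torch.tensor([[100.0, 150.0, 300.0, 350.0]])
boxes_norm = box_xyxy_to_cxcywh(boxes_xyxy) / torch.tensor([w, h, w, h], dtype=torch.float32)
print(boxes_norm)  # tensor([[0.3125, 0.5208, 0.3125, 0.4167]])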
groundingdino/datasets/__init__.py CHANGED
@@ -0,0 +1,23 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch.utils.data
import torchvision
from .coco import build as build_coco


def get_coco_api_from_dataset(dataset):
    for _ in range(10):
        # if isinstance(dataset, torchvision.datasets.CocoDetection):
        #     break
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco


def build_dataset(image_set, args, datasetinfo):
    if datasetinfo["dataset_mode"] == 'coco':
        return build_coco(image_set, args, datasetinfo)
    if datasetinfo["dataset_mode"] == 'odvg':
        from .odvg import build_odvg
        return build_odvg(image_set, args, datasetinfo)
    raise ValueError(f'dataset {args.dataset_file} not supported')
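Note (not part of the commit): the two datasetinfo shapes build_dataset dispatches on, with keys taken from what build() and build_odvg() read; the paths are placeholders.

coco_info = {
    "dataset_mode": "coco",
    "root": "/path/to/images",
    "anno": "/path/to/annotations.json",
}
odvg_info = {
    "dataset_mode": "odvg",
    "root": "/path/to/images",
    "anno": "/path/to/annotations_odvg.jsonl",
    "label_map": "/path/to/label_map.json",  # only for OD-style data; omit for VG (grounding) data
}
# train_set = build_dataset("train", args, coco_info)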
groundingdino/datasets/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc and b/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc differ
groundingdino/datasets/__pycache__/coco.cpython-310.pyc ADDED
Binary file (20.2 kB)
groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc ADDED
Binary file (7.42 kB)
groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc ADDED
Binary file (7.44 kB)
groundingdino/datasets/__pycache__/data_util.cpython-310.pyc ADDED
Binary file (4.55 kB)
groundingdino/datasets/__pycache__/odvg.cpython-310.pyc ADDED
Binary file (8.21 kB)
groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc ADDED
Binary file (1.87 kB)
groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc ADDED
Binary file (3.69 kB)
groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc ADDED
Binary file (7.68 kB)
groundingdino/datasets/__pycache__/transforms.cpython-310.pyc CHANGED
Binary files a/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc and b/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc differ
groundingdino/datasets/coco.py ADDED
@@ -0,0 +1,649 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
COCO dataset which returns image_id for evaluation.

Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
"""
if __name__=="__main__":
    # for debug only
    import os, sys
    sys.path.append(os.path.dirname(sys.path[0]))
from torchvision.datasets.vision import VisionDataset

import json
from pathlib import Path
import random
import os
from typing import Any, Callable, List, Optional, Tuple

from PIL import Image

import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask

from datasets.data_util import preparing_dataset
import datasets.transforms as T
from util.box_ops import box_cxcywh_to_xyxy, box_iou

__all__ = ['build']


class label2compat():
    def __init__(self) -> None:
        self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
        self.category_map = {int(k):v for k,v in self.category_map_str.items()}

    def __call__(self, target, img=None):
        labels = target['labels']
        res = torch.zeros(labels.shape, dtype=labels.dtype)
        for idx, item in enumerate(labels):
            res[idx] = self.category_map[item.item()] - 1
        target['label_compat'] = res
        if img is not None:
            return target, img
        else:
            return target


class label_compat2onehot():
    def __init__(self, num_class=80, num_output_objs=1):
        self.num_class = num_class
        self.num_output_objs = num_output_objs
        if num_output_objs != 1:
            raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")

    def __call__(self, target, img=None):
        labels = target['label_compat']
        place_dict = {k:0 for k in range(self.num_class)}
        if self.num_output_objs == 1:
            res = torch.zeros(self.num_class)
            for i in labels:
                itm = i.item()
                res[itm] = 1.0
        else:
            # compat with baseline
            res = torch.zeros(self.num_class, self.num_output_objs)
            for i in labels:
                itm = i.item()
                res[itm][place_dict[itm]] = 1.0
                place_dict[itm] += 1
        target['label_compat_onehot'] = res
        if img is not None:
            return target, img
        else:
            return target


class box_label_catter():
    def __init__(self):
        pass

    def __call__(self, target, img=None):
        labels = target['label_compat']
        boxes = target['boxes']
        box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
        target['box_label'] = box_label
        if img is not None:
            return target, img
        else:
            return target


class RandomSelectBoxlabels():
    def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
                    prob_first_item = 0.0,
                    prob_random_item = 0.0,
                    prob_last_item = 0.8,
                    prob_stop_sign = 0.2
                ) -> None:
        self.num_classes = num_classes
        self.leave_one_out = leave_one_out
        self.blank_prob = blank_prob

        self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)


    def get_state(self):
        return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]

    def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
        sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
        assert sum_prob - 1 < 1e-6, \
            f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
            + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
            + f"prob_stop_sign:{prob_stop_sign}"

        self.prob_first_item = prob_first_item
        self.prob_random_item = prob_random_item
        self.prob_last_item = prob_last_item
        self.prob_stop_sign = prob_stop_sign


    def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
        box_label_known = torch.Tensor(0,5)
        box_label_unknown = box_label
        return box_label_known, box_label_unknown

    def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
        n_select = int(random.random() * box_label.shape[0])
        box_label = box_label[torch.randperm(box_label.shape[0])]
        box_label_known = box_label[:n_select]
        box_label_unknown = box_label[n_select:]
        return box_label_known, box_label_unknown

    def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
        box_label_perm = box_label[torch.randperm(box_label.shape[0])]
        known_label_list = []
        box_label_known = []
        box_label_unknown = []
        for item in box_label_perm:
            label_i = item[4].item()
            if label_i in known_label_list:
                box_label_known.append(item)
            else:
                # first item
                box_label_unknown.append(item)
                known_label_list.append(label_i)
        box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
        box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
        return box_label_known, box_label_unknown

    def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
        box_label_unknown = torch.Tensor(0,5)
        box_label_known = box_label
        return box_label_known, box_label_unknown

    def __call__(self, target, img=None):
        box_label = target['box_label'] # K, 5

        dice_number = random.random()

        if dice_number < self.prob_first_item:
            box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
        elif dice_number < self.prob_first_item + self.prob_random_item:
            box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
        elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
            box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
        else:
            box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)

        target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
        target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
        target['box_label_known'] = box_label_known
        target['box_label_unknown'] = box_label_unknown

        return target, img


class RandomDrop():
    def __init__(self, p=0.2) -> None:
        self.p = p

    def __call__(self, target, img=None):
        known_box = target['box_label_known']
        num_known_box = known_box.size(0)
        idxs = torch.rand(num_known_box)
        # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
        target['box_label_known'] = known_box[idxs > self.p]
        return target, img


class BboxPertuber():
    def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
        self.max_ratio = max_ratio
        self.generate_samples = generate_samples
        self.samples = self.generate_pertube_samples()
        self.idx = 0

    def generate_pertube_samples(self):
        import torch
        samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
        return samples

    def __call__(self, target, img):
        known_box = target['box_label_known'] # Tensor(K,5), K known bbox
        K = known_box.shape[0]
        known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
        if K == 0:
            pass
        else:
            if self.idx + K > self.generate_samples:
                self.idx = 0
            delta = self.samples[self.idx: self.idx + K, :]
            known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
            iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
            known_box_pertube[:, 4].copy_(iou)
            known_box_pertube[:, -1].copy_(known_box[:, -1])

        target['box_label_known_pertube'] = known_box_pertube
        return target, img


class RandomCutout():
    def __init__(self, factor=0.5) -> None:
        self.factor = factor

    def __call__(self, target, img=None):
        unknown_box = target['box_label_unknown'] # Ku, 5
        known_box = target['box_label_known_pertube'] # Kk, 6
        Ku = unknown_box.size(0)

        known_box_add = torch.zeros(Ku, 6) # Ku, 6
        known_box_add[:, :5] = unknown_box
        known_box_add[:, 5].uniform_(0.5, 1)

        known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
        known_box_add[:, 2:4] /= 2

        target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
        return target, img


class RandomSelectBoxes():
    def __init__(self, num_class=80) -> None:
        Warning("This is such a slow function and will be deprecated soon!!!")
        self.num_class = num_class

    def __call__(self, target, img=None):
        boxes = target['boxes']
        labels = target['label_compat']

        # transform to list of tensors
        boxs_list = [[] for i in range(self.num_class)]
        for idx, item in enumerate(boxes):
            label = labels[idx].item()
            boxs_list[label].append(item)
        boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]

        # random selection
        box_known = []
        box_unknown = []
        for idx, item in enumerate(boxs_list_tensor):
            ncnt = item.shape[0]
            nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint

            item = item[torch.randperm(ncnt)]
            # random.shuffle(item)
            box_known.append(item[:nselect])
            box_unknown.append(item[nselect:])

        # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
        # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
        # print('box_unknown_tensor:', box_unknown_tensor)
        target['known_box'] = box_known
        target['unknown_box'] = box_unknown
        return target, img


def label2onehot(label, num_classes):
    """
    label: Tensor(K)
    """
    res = torch.zeros(num_classes)
    for i in label:
        itm = int(i.item())
        res[itm] = 1.0
    return res


class MaskCrop():
    def __init__(self) -> None:
        pass

    def __call__(self, target, img):
        known_box = target['known_box']
        h,w = img.shape[1:] # h,w
        # imgsize = target['orig_size'] # h,w

        scale = torch.Tensor([w, h, w, h])

        # _cnt = 0
        for boxes in known_box:
            if boxes.shape[0] == 0:
                continue
            box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
            for box in box_xyxy:
                x1, y1, x2, y2 = [int(i) for i in box.tolist()]
|
| 310 |
+
img[:, y1:y2, x1:x2] = 0
|
| 311 |
+
# _cnt += 1
|
| 312 |
+
# print("_cnt:", _cnt)
|
| 313 |
+
return target, img
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
dataset_hook_register = {
|
| 317 |
+
'label2compat': label2compat,
|
| 318 |
+
'label_compat2onehot': label_compat2onehot,
|
| 319 |
+
'box_label_catter': box_label_catter,
|
| 320 |
+
'RandomSelectBoxlabels': RandomSelectBoxlabels,
|
| 321 |
+
'RandomSelectBoxes': RandomSelectBoxes,
|
| 322 |
+
'MaskCrop': MaskCrop,
|
| 323 |
+
'BboxPertuber': BboxPertuber,
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
class CocoDetection(torchvision.datasets.CocoDetection):
|
| 328 |
+
def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
|
| 329 |
+
super(CocoDetection, self).__init__(img_folder, ann_file)
|
| 330 |
+
self._transforms = transforms
|
| 331 |
+
self.prepare = ConvertCocoPolysToMask(return_masks)
|
| 332 |
+
self.aux_target_hacks = aux_target_hacks
|
| 333 |
+
|
| 334 |
+
def change_hack_attr(self, hackclassname, attrkv_dict):
|
| 335 |
+
target_class = dataset_hook_register[hackclassname]
|
| 336 |
+
for item in self.aux_target_hacks:
|
| 337 |
+
if isinstance(item, target_class):
|
| 338 |
+
for k,v in attrkv_dict.items():
|
| 339 |
+
setattr(item, k, v)
|
| 340 |
+
|
| 341 |
+
def get_hack(self, hackclassname):
|
| 342 |
+
target_class = dataset_hook_register[hackclassname]
|
| 343 |
+
for item in self.aux_target_hacks:
|
| 344 |
+
if isinstance(item, target_class):
|
| 345 |
+
return item
|
| 346 |
+
|
| 347 |
+
def _load_image(self, id: int) -> Image.Image:
|
| 348 |
+
path = self.coco.loadImgs(id)[0]["file_name"]
|
| 349 |
+
abs_path = os.path.join(self.root, path)
|
| 350 |
+
return Image.open(abs_path).convert("RGB")
|
| 351 |
+
|
| 352 |
+
def __getitem__(self, idx):
|
| 353 |
+
"""
|
| 354 |
+
Output:
|
| 355 |
+
- target: dict of multiple items
|
| 356 |
+
- boxes: Tensor[num_box, 4]. \
|
| 357 |
+
Init type: x0,y0,x1,y1. unnormalized data.
|
| 358 |
+
Final type: cx,cy,w,h. normalized data.
|
| 359 |
+
"""
|
| 360 |
+
try:
|
| 361 |
+
img, target = super(CocoDetection, self).__getitem__(idx)
|
| 362 |
+
except:
|
| 363 |
+
print("Error idx: {}".format(idx))
|
| 364 |
+
idx += 1
|
| 365 |
+
img, target = super(CocoDetection, self).__getitem__(idx)
|
| 366 |
+
image_id = self.ids[idx]
|
| 367 |
+
target = {'image_id': image_id, 'annotations': target}
|
| 368 |
+
img, target = self.prepare(img, target)
|
| 369 |
+
|
| 370 |
+
if self._transforms is not None:
|
| 371 |
+
img, target = self._transforms(img, target)
|
| 372 |
+
|
| 373 |
+
# convert to needed format
|
| 374 |
+
if self.aux_target_hacks is not None:
|
| 375 |
+
for hack_runner in self.aux_target_hacks:
|
| 376 |
+
target, img = hack_runner(target, img=img)
|
| 377 |
+
|
| 378 |
+
return img, target
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def convert_coco_poly_to_mask(segmentations, height, width):
|
| 382 |
+
masks = []
|
| 383 |
+
for polygons in segmentations:
|
| 384 |
+
rles = coco_mask.frPyObjects(polygons, height, width)
|
| 385 |
+
mask = coco_mask.decode(rles)
|
| 386 |
+
if len(mask.shape) < 3:
|
| 387 |
+
mask = mask[..., None]
|
| 388 |
+
mask = torch.as_tensor(mask, dtype=torch.uint8)
|
| 389 |
+
mask = mask.any(dim=2)
|
| 390 |
+
masks.append(mask)
|
| 391 |
+
if masks:
|
| 392 |
+
masks = torch.stack(masks, dim=0)
|
| 393 |
+
else:
|
| 394 |
+
masks = torch.zeros((0, height, width), dtype=torch.uint8)
|
| 395 |
+
return masks
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
class ConvertCocoPolysToMask(object):
|
| 399 |
+
def __init__(self, return_masks=False):
|
| 400 |
+
self.return_masks = return_masks
|
| 401 |
+
|
| 402 |
+
def __call__(self, image, target):
|
| 403 |
+
w, h = image.size
|
| 404 |
+
|
| 405 |
+
image_id = target["image_id"]
|
| 406 |
+
image_id = torch.tensor([image_id])
|
| 407 |
+
|
| 408 |
+
anno = target["annotations"]
|
| 409 |
+
|
| 410 |
+
anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
|
| 411 |
+
|
| 412 |
+
boxes = [obj["bbox"] for obj in anno]
|
| 413 |
+
# guard against no boxes via resizing
|
| 414 |
+
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
|
| 415 |
+
boxes[:, 2:] += boxes[:, :2]
|
| 416 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
| 417 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
| 418 |
+
|
| 419 |
+
classes = [obj["category_id"] for obj in anno]
|
| 420 |
+
classes = torch.tensor(classes, dtype=torch.int64)
|
| 421 |
+
|
| 422 |
+
if self.return_masks:
|
| 423 |
+
segmentations = [obj["segmentation"] for obj in anno]
|
| 424 |
+
masks = convert_coco_poly_to_mask(segmentations, h, w)
|
| 425 |
+
|
| 426 |
+
keypoints = None
|
| 427 |
+
if anno and "keypoints" in anno[0]:
|
| 428 |
+
keypoints = [obj["keypoints"] for obj in anno]
|
| 429 |
+
keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
|
| 430 |
+
num_keypoints = keypoints.shape[0]
|
| 431 |
+
if num_keypoints:
|
| 432 |
+
keypoints = keypoints.view(num_keypoints, -1, 3)
|
| 433 |
+
|
| 434 |
+
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
|
| 435 |
+
boxes = boxes[keep]
|
| 436 |
+
classes = classes[keep]
|
| 437 |
+
if self.return_masks:
|
| 438 |
+
masks = masks[keep]
|
| 439 |
+
if keypoints is not None:
|
| 440 |
+
keypoints = keypoints[keep]
|
| 441 |
+
|
| 442 |
+
target = {}
|
| 443 |
+
target["boxes"] = boxes
|
| 444 |
+
target["labels"] = classes
|
| 445 |
+
if self.return_masks:
|
| 446 |
+
target["masks"] = masks
|
| 447 |
+
target["image_id"] = image_id
|
| 448 |
+
if keypoints is not None:
|
| 449 |
+
target["keypoints"] = keypoints
|
| 450 |
+
|
| 451 |
+
# for conversion to coco api
|
| 452 |
+
area = torch.tensor([obj["area"] for obj in anno])
|
| 453 |
+
iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
|
| 454 |
+
target["area"] = area[keep]
|
| 455 |
+
target["iscrowd"] = iscrowd[keep]
|
| 456 |
+
|
| 457 |
+
target["orig_size"] = torch.as_tensor([int(h), int(w)])
|
| 458 |
+
target["size"] = torch.as_tensor([int(h), int(w)])
|
| 459 |
+
|
| 460 |
+
return image, target
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
|
| 464 |
+
|
| 465 |
+
normalize = T.Compose([
|
| 466 |
+
T.ToTensor(),
|
| 467 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
| 468 |
+
])
|
| 469 |
+
|
| 470 |
+
# config the params for data aug
|
| 471 |
+
scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
|
| 472 |
+
max_size = 1333
|
| 473 |
+
scales2_resize = [400, 500, 600]
|
| 474 |
+
scales2_crop = [384, 600]
|
| 475 |
+
|
| 476 |
+
# update args from config files
|
| 477 |
+
scales = getattr(args, 'data_aug_scales', scales)
|
| 478 |
+
max_size = getattr(args, 'data_aug_max_size', max_size)
|
| 479 |
+
scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
|
| 480 |
+
scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
|
| 481 |
+
|
| 482 |
+
# resize them
|
| 483 |
+
data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
|
| 484 |
+
if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
|
| 485 |
+
data_aug_scale_overlap = float(data_aug_scale_overlap)
|
| 486 |
+
scales = [int(i*data_aug_scale_overlap) for i in scales]
|
| 487 |
+
max_size = int(max_size*data_aug_scale_overlap)
|
| 488 |
+
scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
|
| 489 |
+
scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
|
| 490 |
+
|
| 491 |
+
datadict_for_print = {
|
| 492 |
+
'scales': scales,
|
| 493 |
+
'max_size': max_size,
|
| 494 |
+
'scales2_resize': scales2_resize,
|
| 495 |
+
'scales2_crop': scales2_crop
|
| 496 |
+
}
|
| 497 |
+
# print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
|
| 498 |
+
|
| 499 |
+
if image_set == 'train':
|
| 500 |
+
if fix_size:
|
| 501 |
+
return T.Compose([
|
| 502 |
+
T.RandomHorizontalFlip(),
|
| 503 |
+
T.RandomResize([(max_size, max(scales))]),
|
| 504 |
+
# T.RandomResize([(512, 512)]),
|
| 505 |
+
normalize,
|
| 506 |
+
])
|
| 507 |
+
|
| 508 |
+
if strong_aug:
|
| 509 |
+
import datasets.sltransform as SLT
|
| 510 |
+
|
| 511 |
+
return T.Compose([
|
| 512 |
+
T.RandomHorizontalFlip(),
|
| 513 |
+
T.RandomSelect(
|
| 514 |
+
T.RandomResize(scales, max_size=max_size),
|
| 515 |
+
T.Compose([
|
| 516 |
+
T.RandomResize(scales2_resize),
|
| 517 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 518 |
+
T.RandomResize(scales, max_size=max_size),
|
| 519 |
+
])
|
| 520 |
+
),
|
| 521 |
+
SLT.RandomSelectMulti([
|
| 522 |
+
SLT.RandomCrop(),
|
| 523 |
+
SLT.LightingNoise(),
|
| 524 |
+
SLT.AdjustBrightness(2),
|
| 525 |
+
SLT.AdjustContrast(2),
|
| 526 |
+
]),
|
| 527 |
+
normalize,
|
| 528 |
+
])
|
| 529 |
+
|
| 530 |
+
return T.Compose([
|
| 531 |
+
T.RandomHorizontalFlip(),
|
| 532 |
+
T.RandomSelect(
|
| 533 |
+
T.RandomResize(scales, max_size=max_size),
|
| 534 |
+
T.Compose([
|
| 535 |
+
T.RandomResize(scales2_resize),
|
| 536 |
+
T.RandomSizeCrop(*scales2_crop),
|
| 537 |
+
T.RandomResize(scales, max_size=max_size),
|
| 538 |
+
])
|
| 539 |
+
),
|
| 540 |
+
normalize,
|
| 541 |
+
])
|
| 542 |
+
|
| 543 |
+
if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
|
| 544 |
+
|
| 545 |
+
if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
|
| 546 |
+
print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
|
| 547 |
+
return T.Compose([
|
| 548 |
+
T.ResizeDebug((1280, 800)),
|
| 549 |
+
normalize,
|
| 550 |
+
])
|
| 551 |
+
|
| 552 |
+
return T.Compose([
|
| 553 |
+
T.RandomResize([max(scales)], max_size=max_size),
|
| 554 |
+
normalize,
|
| 555 |
+
])
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
raise ValueError(f'unknown {image_set}')
|
| 560 |
+
|
| 561 |
+
|
| 562 |
+
def get_aux_target_hacks_list(image_set, args):
|
| 563 |
+
if args.modelname in ['q2bs_mask', 'q2bs']:
|
| 564 |
+
aux_target_hacks_list = [
|
| 565 |
+
label2compat(),
|
| 566 |
+
label_compat2onehot(),
|
| 567 |
+
RandomSelectBoxes(num_class=args.num_classes)
|
| 568 |
+
]
|
| 569 |
+
if args.masked_data and image_set == 'train':
|
| 570 |
+
# aux_target_hacks_list.append()
|
| 571 |
+
aux_target_hacks_list.append(MaskCrop())
|
| 572 |
+
elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
|
| 573 |
+
aux_target_hacks_list = [
|
| 574 |
+
label2compat(),
|
| 575 |
+
label_compat2onehot(),
|
| 576 |
+
box_label_catter(),
|
| 577 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 578 |
+
prob_first_item=args.prob_first_item,
|
| 579 |
+
prob_random_item=args.prob_random_item,
|
| 580 |
+
prob_last_item=args.prob_last_item,
|
| 581 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 582 |
+
),
|
| 583 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 584 |
+
]
|
| 585 |
+
elif args.modelname in ['q2omask', 'q2osa']:
|
| 586 |
+
if args.coco_aug:
|
| 587 |
+
aux_target_hacks_list = [
|
| 588 |
+
label2compat(),
|
| 589 |
+
label_compat2onehot(),
|
| 590 |
+
box_label_catter(),
|
| 591 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 592 |
+
prob_first_item=args.prob_first_item,
|
| 593 |
+
prob_random_item=args.prob_random_item,
|
| 594 |
+
prob_last_item=args.prob_last_item,
|
| 595 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 596 |
+
),
|
| 597 |
+
RandomDrop(p=0.2),
|
| 598 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 599 |
+
RandomCutout(factor=0.5)
|
| 600 |
+
]
|
| 601 |
+
else:
|
| 602 |
+
aux_target_hacks_list = [
|
| 603 |
+
label2compat(),
|
| 604 |
+
label_compat2onehot(),
|
| 605 |
+
box_label_catter(),
|
| 606 |
+
RandomSelectBoxlabels(num_classes=args.num_classes,
|
| 607 |
+
prob_first_item=args.prob_first_item,
|
| 608 |
+
prob_random_item=args.prob_random_item,
|
| 609 |
+
prob_last_item=args.prob_last_item,
|
| 610 |
+
prob_stop_sign=args.prob_stop_sign,
|
| 611 |
+
),
|
| 612 |
+
BboxPertuber(max_ratio=0.02, generate_samples=1000),
|
| 613 |
+
]
|
| 614 |
+
else:
|
| 615 |
+
aux_target_hacks_list = None
|
| 616 |
+
|
| 617 |
+
return aux_target_hacks_list
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
def build(image_set, args, datasetinfo):
|
| 621 |
+
img_folder = datasetinfo["root"]
|
| 622 |
+
ann_file = datasetinfo["anno"]
|
| 623 |
+
|
| 624 |
+
# copy to local path
|
| 625 |
+
if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
|
| 626 |
+
preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
|
| 627 |
+
|
| 628 |
+
try:
|
| 629 |
+
strong_aug = args.strong_aug
|
| 630 |
+
except:
|
| 631 |
+
strong_aug = False
|
| 632 |
+
print(img_folder, ann_file)
|
| 633 |
+
dataset = CocoDetection(img_folder, ann_file,
|
| 634 |
+
transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
|
| 635 |
+
return_masks=args.masks,
|
| 636 |
+
aux_target_hacks=None,
|
| 637 |
+
)
|
| 638 |
+
return dataset
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
if __name__ == "__main__":
|
| 642 |
+
# Objects365 Val example
|
| 643 |
+
dataset_o365 = CocoDetection(
|
| 644 |
+
'/path/Objects365/train/',
|
| 645 |
+
"/path/Objects365/slannos/anno_preprocess_train_v2.json",
|
| 646 |
+
transforms=None,
|
| 647 |
+
return_masks=False,
|
| 648 |
+
)
|
| 649 |
+
print('len(dataset_o365):', len(dataset_o365))
|
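For readers skimming the diff, here is a minimal usage sketch of the `build` entry point above. It is not part of the commit: the paths are placeholders and the `Namespace` fields are only the ones `build`/`make_coco_transforms` actually read; a real run also needs pycocotools and the annotation file on disk.

    from argparse import Namespace

    # Hypothetical arguments; real values come from the training config files.
    args = Namespace(fix_size=False, strong_aug=False, masks=False)
    datasetinfo = {
        "root": "/data/coco/val2017",                                   # placeholder path
        "anno": "/data/coco/annotations/instances_val2017.json",       # placeholder path
    }

    dataset = build("val", args, datasetinfo)   # CocoDetection with the 'val' transform stack
    img, target = dataset[0]                    # normalized image tensor + target dict (boxes, labels, ...)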
groundingdino/datasets/coco_eval.py
ADDED
@@ -0,0 +1,266 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+COCO evaluator that works in distributed mode.
+
+Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
+The difference is that there is less copy-pasting from pycocotools
+in the end of the file, as python3 can suppress prints with contextlib
+"""
+import os
+import contextlib
+import copy
+import numpy as np
+import torch
+
+from pycocotools.cocoeval import COCOeval
+from pycocotools.coco import COCO
+import pycocotools.mask as mask_util
+
+from util.misc import all_gather
+
+
+class CocoEvaluator(object):
+    def __init__(self, coco_gt, iou_types, useCats=True):
+        assert isinstance(iou_types, (list, tuple))
+        coco_gt = copy.deepcopy(coco_gt)
+        self.coco_gt = coco_gt
+
+        self.iou_types = iou_types
+        self.coco_eval = {}
+        for iou_type in iou_types:
+            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
+            self.coco_eval[iou_type].useCats = useCats
+
+        self.img_ids = []
+        self.eval_imgs = {k: [] for k in iou_types}
+        self.useCats = useCats
+
+    def update(self, predictions):
+        img_ids = list(np.unique(list(predictions.keys())))
+        self.img_ids.extend(img_ids)
+
+        for iou_type in self.iou_types:
+            results = self.prepare(predictions, iou_type)
+
+            # suppress pycocotools prints
+            with open(os.devnull, 'w') as devnull:
+                with contextlib.redirect_stdout(devnull):
+                    coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
+            coco_eval = self.coco_eval[iou_type]
+
+            coco_eval.cocoDt = coco_dt
+            coco_eval.params.imgIds = list(img_ids)
+            coco_eval.params.useCats = self.useCats
+            img_ids, eval_imgs = evaluate(coco_eval)
+
+            self.eval_imgs[iou_type].append(eval_imgs)
+
+    def synchronize_between_processes(self):
+        for iou_type in self.iou_types:
+            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
+            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
+
+    def accumulate(self):
+        for coco_eval in self.coco_eval.values():
+            coco_eval.accumulate()
+
+    def summarize(self):
+        for iou_type, coco_eval in self.coco_eval.items():
+            print("IoU metric: {}".format(iou_type))
+            coco_eval.summarize()
+
+    def prepare(self, predictions, iou_type):
+        if iou_type == "bbox":
+            return self.prepare_for_coco_detection(predictions)
+        elif iou_type == "segm":
+            return self.prepare_for_coco_segmentation(predictions)
+        elif iou_type == "keypoints":
+            return self.prepare_for_coco_keypoint(predictions)
+        else:
+            raise ValueError("Unknown iou type {}".format(iou_type))
+
+    def prepare_for_coco_detection(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            boxes = prediction["boxes"]
+            boxes = convert_to_xywh(boxes).tolist()
+            if not isinstance(prediction["scores"], list):
+                scores = prediction["scores"].tolist()
+            else:
+                scores = prediction["scores"]
+            if not isinstance(prediction["labels"], list):
+                labels = prediction["labels"].tolist()
+            else:
+                labels = prediction["labels"]
+
+            try:
+                coco_results.extend(
+                    [
+                        {
+                            "image_id": original_id,
+                            "category_id": labels[k],
+                            "bbox": box,
+                            "score": scores[k],
+                        }
+                        for k, box in enumerate(boxes)
+                    ]
+                )
+            except:
+                import ipdb; ipdb.set_trace()
+        return coco_results
+
+    def prepare_for_coco_segmentation(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            scores = prediction["scores"]
+            labels = prediction["labels"]
+            masks = prediction["masks"]
+
+            masks = masks > 0.5
+
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+
+            rles = [
+                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
+                for mask in masks
+            ]
+            for rle in rles:
+                rle["counts"] = rle["counts"].decode("utf-8")
+
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        "segmentation": rle,
+                        "score": scores[k],
+                    }
+                    for k, rle in enumerate(rles)
+                ]
+            )
+        return coco_results
+
+    def prepare_for_coco_keypoint(self, predictions):
+        coco_results = []
+        for original_id, prediction in predictions.items():
+            if len(prediction) == 0:
+                continue
+
+            boxes = prediction["boxes"]
+            boxes = convert_to_xywh(boxes).tolist()
+            scores = prediction["scores"].tolist()
+            labels = prediction["labels"].tolist()
+            keypoints = prediction["keypoints"]
+            keypoints = keypoints.flatten(start_dim=1).tolist()
+
+            coco_results.extend(
+                [
+                    {
+                        "image_id": original_id,
+                        "category_id": labels[k],
+                        'keypoints': keypoint,
+                        "score": scores[k],
+                    }
+                    for k, keypoint in enumerate(keypoints)
+                ]
+            )
+        return coco_results
+
+
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(1)
+    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
+
+
+def merge(img_ids, eval_imgs):
+    all_img_ids = all_gather(img_ids)
+    all_eval_imgs = all_gather(eval_imgs)
+
+    merged_img_ids = []
+    for p in all_img_ids:
+        merged_img_ids.extend(p)
+
+    merged_eval_imgs = []
+    for p in all_eval_imgs:
+        merged_eval_imgs.append(p)
+
+    merged_img_ids = np.array(merged_img_ids)
+    merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
+
+    # keep only unique (and in sorted order) images
+    merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
+    merged_eval_imgs = merged_eval_imgs[..., idx]
+
+    return merged_img_ids, merged_eval_imgs
+
+
+def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
+    img_ids, eval_imgs = merge(img_ids, eval_imgs)
+    img_ids = list(img_ids)
+    eval_imgs = list(eval_imgs.flatten())
+
+    coco_eval.evalImgs = eval_imgs
+    coco_eval.params.imgIds = img_ids
+    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
+
+
+#################################################################
+# From pycocotools, just removed the prints and fixed
+# a Python3 bug about unicode not defined
+#################################################################
+
+
+def evaluate(self):
+    '''
+    Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+    :return: None
+    '''
+    p = self.params
+    # add backward compatibility if useSegm is specified in params
+    if p.useSegm is not None:
+        p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
+        print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
+    p.imgIds = list(np.unique(p.imgIds))
+    if p.useCats:
+        p.catIds = list(np.unique(p.catIds))
+    p.maxDets = sorted(p.maxDets)
+    self.params = p
+
+    self._prepare()
+    # loop through images, area range, max detection number
+    catIds = p.catIds if p.useCats else [-1]
+
+    if p.iouType == 'segm' or p.iouType == 'bbox':
+        computeIoU = self.computeIoU
+    elif p.iouType == 'keypoints':
+        computeIoU = self.computeOks
+    self.ious = {
+        (imgId, catId): computeIoU(imgId, catId)
+        for imgId in p.imgIds
+        for catId in catIds}
+
+    evaluateImg = self.evaluateImg
+    maxDet = p.maxDets[-1]
+    evalImgs = [
+        evaluateImg(imgId, catId, areaRng, maxDet)
+        for catId in catIds
+        for areaRng in p.areaRng
+        for imgId in p.imgIds
+    ]
+    # this is NOT in the pycocotools code, but could be done outside
+    evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
+    self._paramsEval = copy.deepcopy(self.params)
+
+    return p.imgIds, evalImgs
+
+#################################################################
+# end of straight copy from pycocotools, just removing the prints
+#################################################################
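A hedged usage sketch of the evaluator's update/accumulate/summarize cycle (not part of the commit): the annotation path, image id 42, and category 1 are placeholders and must exist in the ground-truth file for COCO.loadRes to accept the results.

    import torch
    from pycocotools.coco import COCO

    coco_gt = COCO("annotations/instances_val2017.json")   # placeholder path
    evaluator = CocoEvaluator(coco_gt, iou_types=("bbox",))

    # Predictions are keyed by image id; boxes are xyxy and converted to xywh internally.
    predictions = {
        42: {"boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]),
             "scores": torch.tensor([0.9]),
             "labels": torch.tensor([1])},
    }
    evaluator.update(predictions)
    evaluator.synchronize_between_processes()   # gathers results across ranks via util.misc.all_gather
    evaluator.accumulate()
    evaluator.summarize()                       # prints the standard COCO AP table per IoU type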
groundingdino/datasets/coco_panoptic.py
ADDED
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import json
+from pathlib import Path
+
+import numpy as np
+import torch
+from PIL import Image
+
+from panopticapi.utils import rgb2id
+from util.box_ops import masks_to_boxes
+
+from .coco import make_coco_transforms
+
+
+class CocoPanoptic:
+    def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
+        with open(ann_file, 'r') as f:
+            self.coco = json.load(f)
+
+        # sort 'images' field so that they are aligned with 'annotations'
+        # i.e., in alphabetical order
+        self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
+        # sanity check
+        if "annotations" in self.coco:
+            for img, ann in zip(self.coco['images'], self.coco['annotations']):
+                assert img['file_name'][:-4] == ann['file_name'][:-4]
+
+        self.img_folder = img_folder
+        self.ann_folder = ann_folder
+        self.ann_file = ann_file
+        self.transforms = transforms
+        self.return_masks = return_masks
+
+    def __getitem__(self, idx):
+        ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
+        img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
+        ann_path = Path(self.ann_folder) / ann_info['file_name']
+
+        img = Image.open(img_path).convert('RGB')
+        w, h = img.size
+        if "segments_info" in ann_info:
+            masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
+            masks = rgb2id(masks)
+
+            ids = np.array([ann['id'] for ann in ann_info['segments_info']])
+            masks = masks == ids[:, None, None]
+
+            masks = torch.as_tensor(masks, dtype=torch.uint8)
+            labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
+
+        target = {}
+        target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
+        if self.return_masks:
+            target['masks'] = masks
+        target['labels'] = labels
+
+        target["boxes"] = masks_to_boxes(masks)
+
+        target['size'] = torch.as_tensor([int(h), int(w)])
+        target['orig_size'] = torch.as_tensor([int(h), int(w)])
+        if "segments_info" in ann_info:
+            for name in ['iscrowd', 'area']:
+                target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
+
+        if self.transforms is not None:
+            img, target = self.transforms(img, target)
+
+        return img, target
+
+    def __len__(self):
+        return len(self.coco['images'])
+
+    def get_height_and_width(self, idx):
+        img_info = self.coco['images'][idx]
+        height = img_info['height']
+        width = img_info['width']
+        return height, width
+
+
+def build(image_set, args):
+    img_folder_root = Path(args.coco_path)
+    ann_folder_root = Path(args.coco_panoptic_path)
+    assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
+    assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
+    mode = 'panoptic'
+    PATHS = {
+        "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
+        "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
+    }
+
+    img_folder, ann_file = PATHS[image_set]
+    img_folder_path = img_folder_root / img_folder
+    ann_folder = ann_folder_root / f'{mode}_{img_folder}'
+    ann_file = ann_folder_root / ann_file
+
+    dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
+                           transforms=make_coco_transforms(image_set), return_masks=args.masks)
+
+    return dataset
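For context on the mask-building step above, here is a tiny sketch (values invented, not from the commit) of what the rgb2id output and the id comparison produce: one boolean mask per segment id, which masks_to_boxes then turns into xyxy boxes.

    import numpy as np
    import torch

    # A 2x2 "panoptic PNG" already decoded to per-pixel segment ids (i.e. rgb2id output).
    ids_map = np.array([[7, 7],
                        [7, 3]], dtype=np.uint32)
    segment_ids = np.array([7, 3])

    masks = torch.as_tensor(ids_map == segment_ids[:, None, None], dtype=torch.uint8)
    print(masks.shape)   # torch.Size([2, 2, 2]) -- one binary mask per segment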
groundingdino/datasets/cocogrounding_eval.py
CHANGED
@@ -45,7 +45,7 @@ class CocoGroundingEvaluator(object):
     def update(self, predictions):
         img_ids = list(np.unique(list(predictions.keys())))
         self.img_ids.extend(img_ids)
-
+        # import pdb;pdb.set_trace()
         for iou_type in self.iou_types:
             results = self.prepare(predictions, iou_type)
 
@@ -223,6 +223,8 @@ def evaluate(self):
     """
     # tic = time.time()
     # print('Running per image evaluation...')
+
+    # import pdb;pdb.set_trace()
     p = self.params
     # add backward compatibility if useSegm is specified in params
     if p.useSegm is not None:
groundingdino/datasets/data_util.py
ADDED
@@ -0,0 +1,170 @@
+import os
+import os.path as osp
+import shutil
+import time
+import datetime
+
+import torch
+
+from util.slconfig import SLConfig
+
+class Error(OSError):
+    pass
+
+def slcopytree(src, dst, symlinks=False, ignore=None, copy_function=shutil.copyfile,
+             ignore_dangling_symlinks=False):
+    """
+    modified from shutil.copytree without copystat.
+
+    Recursively copy a directory tree.
+
+    The destination directory must not already exist.
+    If exception(s) occur, an Error is raised with a list of reasons.
+
+    If the optional symlinks flag is true, symbolic links in the
+    source tree result in symbolic links in the destination tree; if
+    it is false, the contents of the files pointed to by symbolic
+    links are copied. If the file pointed by the symlink doesn't
+    exist, an exception will be added in the list of errors raised in
+    an Error exception at the end of the copy process.
+
+    You can set the optional ignore_dangling_symlinks flag to true if you
+    want to silence this exception. Notice that this has no effect on
+    platforms that don't support os.symlink.
+
+    The optional ignore argument is a callable. If given, it
+    is called with the `src` parameter, which is the directory
+    being visited by copytree(), and `names` which is the list of
+    `src` contents, as returned by os.listdir():
+
+        callable(src, names) -> ignored_names
+
+    Since copytree() is called recursively, the callable will be
+    called once for each directory that is copied. It returns a
+    list of names relative to the `src` directory that should
+    not be copied.
+
+    The optional copy_function argument is a callable that will be used
+    to copy each file. It will be called with the source path and the
+    destination path as arguments. By default, copy2() is used, but any
+    function that supports the same signature (like copy()) can be used.
+
+    """
+    errors = []
+    if os.path.isdir(src):
+        names = os.listdir(src)
+        if ignore is not None:
+            ignored_names = ignore(src, names)
+        else:
+            ignored_names = set()
+
+        os.makedirs(dst)
+        for name in names:
+            if name in ignored_names:
+                continue
+            srcname = os.path.join(src, name)
+            dstname = os.path.join(dst, name)
+            try:
+                if os.path.islink(srcname):
+                    linkto = os.readlink(srcname)
+                    if symlinks:
+                        # We can't just leave it to `copy_function` because legacy
+                        # code with a custom `copy_function` may rely on copytree
+                        # doing the right thing.
+                        os.symlink(linkto, dstname)
+                    else:
+                        # ignore dangling symlink if the flag is on
+                        if not os.path.exists(linkto) and ignore_dangling_symlinks:
+                            continue
+                        # otherwise let the copy occurs. copy2 will raise an error
+                        if os.path.isdir(srcname):
+                            slcopytree(srcname, dstname, symlinks, ignore,
+                                     copy_function)
+                        else:
+                            copy_function(srcname, dstname)
+                elif os.path.isdir(srcname):
+                    slcopytree(srcname, dstname, symlinks, ignore, copy_function)
+                else:
+                    # Will raise a SpecialFileError for unsupported file types
+                    copy_function(srcname, dstname)
+            # catch the Error from the recursive copytree so that we can
+            # continue with other files
+            except Error as err:
+                errors.extend(err.args[0])
+            except OSError as why:
+                errors.append((srcname, dstname, str(why)))
+    else:
+        copy_function(src, dst)
+
+    if errors:
+        raise Error(errors)
+    return dst
+
+def check_and_copy(src_path, tgt_path):
+    if os.path.exists(tgt_path):
+        return None
+
+    return slcopytree(src_path, tgt_path)
+
+
+def remove(srcpath):
+    if os.path.isdir(srcpath):
+        return shutil.rmtree(srcpath)
+    else:
+        return os.remove(srcpath)
+
+
+def preparing_dataset(pathdict, image_set, args):
+    start_time = time.time()
+    dataset_file = args.dataset_file
+    data_static_info = SLConfig.fromfile('util/static_data_path.py')
+    static_dict = data_static_info[dataset_file][image_set]
+
+    copyfilelist = []
+    for k,tgt_v in pathdict.items():
+        if os.path.exists(tgt_v):
+            if args.local_rank == 0:
+                print("path <{}> exist. remove it!".format(tgt_v))
+                remove(tgt_v)
+            # continue
+
+        if args.local_rank == 0:
+            src_v = static_dict[k]
+            assert isinstance(src_v, str)
+            if src_v.endswith('.zip'):
+                # copy
+                cp_tgt_dir = os.path.dirname(tgt_v)
+                filename = os.path.basename(src_v)
+                cp_tgt_path = os.path.join(cp_tgt_dir, filename)
+                print('Copy from <{}> to <{}>.'.format(src_v, cp_tgt_path))
+                os.makedirs(cp_tgt_dir, exist_ok=True)
+                check_and_copy(src_v, cp_tgt_path)
+
+                # unzip
+                import zipfile
+                print("Starting unzip <{}>".format(cp_tgt_path))
+                with zipfile.ZipFile(cp_tgt_path, 'r') as zip_ref:
+                    zip_ref.extractall(os.path.dirname(cp_tgt_path))
+
+                copyfilelist.append(cp_tgt_path)
+                copyfilelist.append(tgt_v)
+            else:
+                print('Copy from <{}> to <{}>.'.format(src_v, tgt_v))
+                os.makedirs(os.path.dirname(tgt_v), exist_ok=True)
+                check_and_copy(src_v, tgt_v)
+                copyfilelist.append(tgt_v)
+
+    if len(copyfilelist) == 0:
+        copyfilelist = None
+    args.copyfilelist = copyfilelist
+
+    if args.distributed:
+        torch.distributed.barrier()
+    total_time = time.time() - start_time
+    if copyfilelist:
+        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+        print('Data copy time {}'.format(total_time_str))
+    return copyfilelist
groundingdino/datasets/dataset.py
ADDED
@@ -0,0 +1,44 @@
+from __future__ import print_function
+
+import torch
+import torchvision.datasets as datasets
+from torch.utils.data import Dataset
+from PIL import Image
+from .tsv_io import TSVFile
+import numpy as np
+import base64
+import io
+
+
+class TSVDataset(Dataset):
+    """ TSV dataset for ImageNet 1K training
+    """
+    def __init__(self, tsv_file, transform=None, target_transform=None):
+        self.tsv = TSVFile(tsv_file)
+        self.transform = transform
+        self.target_transform = target_transform
+
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: (image, target) where target is class_index of the target class.
+        """
+        row = self.tsv.seek(index)
+        image_data = base64.b64decode(row[-1])
+        image = Image.open(io.BytesIO(image_data))
+        image = image.convert('RGB')
+        target = int(row[1])
+
+        if self.transform is not None:
+            img = self.transform(image)
+        else:
+            img = image
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return img, target
+
+    def __len__(self):
+        return self.tsv.num_rows()
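A brief sketch of the TSV row layout that TSVDataset.__getitem__ implies (the file name is a placeholder and the exact column order is inferred from the code, not documented in the commit): the second column is the integer class label and the last column is a base64-encoded image.

    import base64

    # Build one row the way __getitem__ decodes it: key \t label \t base64(image bytes)
    with open("dog.jpg", "rb") as f:                     # placeholder image file
        b64 = base64.b64encode(f.read()).decode("utf-8")
    row = "\t".join(["img_0001", "3", b64])
    # TSVFile.seek(index) returns the split row; the dataset then does roughly:
    #   image = Image.open(io.BytesIO(base64.b64decode(row[-1]))).convert('RGB')
    #   target = int(row[1])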
groundingdino/datasets/odvg.py
ADDED
@@ -0,0 +1,258 @@
+from torchvision.datasets.vision import VisionDataset
+import os.path
+from typing import Callable, Optional
+import json
+from PIL import Image
+import torch
+import random
+import os, sys
+sys.path.append(os.path.dirname(sys.path[0]))
+
+import datasets.transforms as T
+
+class ODVGDataset(VisionDataset):
+    """
+    Args:
+        root (string): Root directory where images are downloaded to.
+        anno (string): Path to json annotation file.
+        label_map_anno (string): Path to json label mapping file. Only for Object Detection
+        transform (callable, optional): A function/transform that takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.PILToTensor``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        transforms (callable, optional): A function/transform that takes input sample and its target as entry
+            and returns a transformed version.
+    """
+
+    def __init__(
+        self,
+        root: str,
+        anno: str,
+        label_map_anno: str = None,
+        max_labels: int = 80,
+        transform: Optional[Callable] = None,
+        target_transform: Optional[Callable] = None,
+        transforms: Optional[Callable] = None,
+    ) -> None:
+        super().__init__(root, transforms, transform, target_transform)
+        self.root = root
+        self.dataset_mode = "OD" if label_map_anno else "VG"
+        self.max_labels = max_labels
+        if self.dataset_mode == "OD":
+            self.load_label_map(label_map_anno)
+        self._load_metas(anno)
+        self.get_dataset_info()
+
+    def load_label_map(self, label_map_anno):
+        with open(label_map_anno, 'r') as file:
+            self.label_map = json.load(file)
+        self.label_index = set(self.label_map.keys())
+
+    def _load_metas(self, anno):
+        with open(anno, 'r') as f:
+            self.metas = json.load(f)
+
+    def get_dataset_info(self):
+        print(f"  == total images: {len(self)}")
+        if self.dataset_mode == "OD":
+            print(f"  == total labels: {len(self.label_map)}")
+
+    def __getitem__(self, index: int):
+        meta = self.metas[index]
+        rel_path = meta["filename"]
+        abs_path = os.path.join(self.root, rel_path)
+        if not os.path.exists(abs_path):
+            raise FileNotFoundError(f"{abs_path} not found.")
+        image = Image.open(abs_path).convert('RGB')
+        w, h = image.size
+        if self.dataset_mode == "OD":
+            anno = meta["detection"]
+            instances = [obj for obj in anno["instances"]]
+            boxes = [obj["bbox"] for obj in instances]
+            # generate vg_labels
+            # pos bbox labels
+            ori_classes = [str(obj["label"]) for obj in instances]
+            pos_labels = set(ori_classes)
+            # neg bbox labels
+            neg_labels = self.label_index.difference(pos_labels)
+
+            vg_labels = list(pos_labels)
+            num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
+            if num_to_add > 0:
+                vg_labels.extend(random.sample(neg_labels, num_to_add))
+
+            # shuffle
+            for i in range(len(vg_labels)-1, 0, -1):
+                j = random.randint(0, i)
+                vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]
+
+            caption_list = [self.label_map[lb] for lb in vg_labels]
+            caption_dict = {item:index for index, item in enumerate(caption_list)}
+
+            caption = ' . '.join(caption_list) + ' .'
+            classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
+            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+            classes = torch.tensor(classes, dtype=torch.int64)
+        elif self.dataset_mode == "VG":
+            anno = meta["Grounding"]
+            instances = [obj for obj in anno["regions"]]
+            boxes = [obj["bbox"] for obj in instances]
+            caption_list = [obj["phrase"] for obj in instances]
+            c = list(zip(boxes, caption_list))
+            random.shuffle(c)
+            boxes[:], caption_list[:] = zip(*c)
+            uni_caption_list = list(set(caption_list))
+            label_map = {}
+            for idx in range(len(uni_caption_list)):
+                label_map[uni_caption_list[idx]] = idx
+            classes = [label_map[cap] for cap in caption_list]
+            caption = ' . '.join(uni_caption_list) + ' .'
+            boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
+            classes = torch.tensor(classes, dtype=torch.int64)
+            caption_list = uni_caption_list
+        # print("caption_list" , caption_list)
+        # print("caption" , caption)
+        # print("boxes" , boxes)
+        target = {}
+        target["image_id"] = rel_path.strip(".jpg")
+        target["size"] = torch.as_tensor([int(h), int(w)])
+        target["cap_list"] = caption_list
+        target["caption"] = caption
+        target["boxes"] = boxes
+        target["labels"] = classes
+        # print(" image_id " , target["image_id"])
+        # size, cap_list, caption, bboxes, labels
+
+        if self.transforms is not None:
+            image, target = self.transforms(image, target)
+
+        return image, target
+
+    def __len__(self) -> int:
+        return len(self.metas)
+
+
+def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
+
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    # config the params for data aug
+    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+    max_size = 1333
+    scales2_resize = [400, 500, 600]
+    scales2_crop = [384, 600]
+
+    # update args from config files
+    scales = getattr(args, 'data_aug_scales', scales)
+    max_size = getattr(args, 'data_aug_max_size', max_size)
+    scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
+    scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
+
+    # resize them
+    data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
+    if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
+        data_aug_scale_overlap = float(data_aug_scale_overlap)
+        scales = [int(i*data_aug_scale_overlap) for i in scales]
+        max_size = int(max_size*data_aug_scale_overlap)
+        scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
+        scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
+
+    # datadict_for_print = {
+    #     'scales': scales,
+    #     'max_size': max_size,
+    #     'scales2_resize': scales2_resize,
+    #     'scales2_crop': scales2_crop
+    # }
+    # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
+
+    if image_set == 'train':
+        if fix_size:
+            return T.Compose([
+                T.RandomHorizontalFlip(),
+                T.RandomResize([(max_size, max(scales))]),
+                normalize,
+            ])
+
+        if strong_aug:
+            import datasets.sltransform as SLT
+
+            return T.Compose([
+                T.RandomHorizontalFlip(),
+                T.RandomSelect(
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Compose([
+                        T.RandomResize(scales2_resize),
+                        T.RandomSizeCrop(*scales2_crop),
+                        T.RandomResize(scales, max_size=max_size),
+                    ])
+                ),
+                SLT.RandomSelectMulti([
+                    SLT.RandomCrop(),
+                    SLT.LightingNoise(),
+                    SLT.AdjustBrightness(2),
+                    SLT.AdjustContrast(2),
+                ]),
+                normalize,
+            ])
+
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.RandomSelect(
+                T.RandomResize(scales, max_size=max_size),
+                T.Compose([
+                    T.RandomResize(scales2_resize),
+                    T.RandomSizeCrop(*scales2_crop),
+                    T.RandomResize(scales, max_size=max_size),
+                ])
+            ),
+            normalize,
+        ])
+
+    if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
+
+        if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
+            print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
+            return T.Compose([
+                T.ResizeDebug((1280, 800)),
+                normalize,
+            ])
+
+        return T.Compose([
+            T.RandomResize([max(scales)], max_size=max_size),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+def build_odvg(image_set, args, datasetinfo):
+    img_folder = datasetinfo["root"]
+    ann_file = datasetinfo["anno"]
+    label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
+    try:
+        strong_aug = args.strong_aug
+    except:
+        strong_aug = False  # False originally
+    print(img_folder, ann_file, label_map)
+    dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
+            transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
+    )
+    return dataset
+
+
+if __name__=="__main__":
+    dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
+    print(len(dataset_vg))
+    data = dataset_vg[random.randint(0, 100)]
+    print(data)
+    dataset_od = ODVGDataset("pathl/V3Det/",
+        "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
+        "path/V3Det/annotations/v3det_label_map.json",
+    )
+    print(len(dataset_od))
+    data = dataset_od[random.randint(0, 100)]
+    print(data)
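To clarify how the OD branch of ODVGDataset turns a label map into a grounding caption, here is a toy walk-through (the label map and instances are invented, and the negative-label sampling and shuffling done in the real __getitem__ are omitted for brevity):

    import torch

    label_map = {"1": "person", "2": "dog", "3": "kite"}
    instances = [{"bbox": [10, 10, 50, 80], "label": 1},
                 {"bbox": [60, 20, 90, 40], "label": 3}]

    # Unique phrases, in order of first appearance, become the caption.
    phrases = [label_map[str(obj["label"])] for obj in instances]
    caption_dict = {item: idx for idx, item in enumerate(dict.fromkeys(phrases))}
    caption = " . ".join(caption_dict) + " ."          # "person . kite ."
    classes = torch.tensor([caption_dict[label_map[str(obj["label"])]] for obj in instances])
    print(caption, classes)                            # "person . kite ." tensor([0, 1])

The per-box class index therefore points at the position of that box's phrase inside the caption string, which is what the grounding head consumes.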
groundingdino/datasets/panoptic_eval.py
ADDED
@@ -0,0 +1,44 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import json
import os

import util.misc as utils

try:
    from panopticapi.evaluation import pq_compute
except ImportError:
    pass


class PanopticEvaluator(object):
    def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
        self.gt_json = ann_file
        self.gt_folder = ann_folder
        if utils.is_main_process():
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
        self.output_dir = output_dir
        self.predictions = []

    def update(self, predictions):
        for p in predictions:
            with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        self.predictions += predictions

    def synchronize_between_processes(self):
        all_predictions = utils.all_gather(self.predictions)
        merged_predictions = []
        for p in all_predictions:
            merged_predictions += p
        self.predictions = merged_predictions

    def summarize(self):
        if utils.is_main_process():
            json_data = {"annotations": self.predictions}
            predictions_json = os.path.join(self.output_dir, "predictions.json")
            with open(predictions_json, "w") as f:
                f.write(json.dumps(json_data))
            return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
        return None
groundingdino/datasets/random_crop.py
ADDED
@@ -0,0 +1,135 @@
import PIL  # version 1.2.0
import torch
import os
import torchvision.transforms.functional as F
import numpy as np
import random


def intersect(boxes1, boxes2):
    '''
    Find intersection of every box combination between two sets of boxes
    boxes1: bounding boxes 1, a tensor of dimensions (n1, 4)
    boxes2: bounding boxes 2, a tensor of dimensions (n2, 4)

    Out: Intersection of each of boxes1 with respect to each of boxes2,
         a tensor of dimensions (n1, n2)
    '''
    n1 = boxes1.size(0)
    n2 = boxes2.size(0)
    max_xy = torch.min(boxes1[:, 2:].unsqueeze(1).expand(n1, n2, 2),
                       boxes2[:, 2:].unsqueeze(0).expand(n1, n2, 2))

    min_xy = torch.max(boxes1[:, :2].unsqueeze(1).expand(n1, n2, 2),
                       boxes2[:, :2].unsqueeze(0).expand(n1, n2, 2))
    inter = torch.clamp(max_xy - min_xy, min=0)  # (n1, n2, 2)
    return inter[:, :, 0] * inter[:, :, 1]  # (n1, n2)

def find_IoU(boxes1, boxes2):
    '''
    Find IoU between every pair of boxes from two sets of boxes
    boxes1: a tensor of dimensions (n1, 4) (left, top, right, bottom)
    boxes2: a tensor of dimensions (n2, 4)

    Out: IoU of each of boxes1 with respect to each of boxes2, a tensor of
         dimensions (n1, n2)

    Formula:
    (box1 ∩ box2) / (box1 ∪ box2) = (box1 ∩ box2) / (area(box1) + area(box2) - (box1 ∩ box2))
    '''
    inter = intersect(boxes1, boxes2)
    area_boxes1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area_boxes2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    area_boxes1 = area_boxes1.unsqueeze(1).expand_as(inter)  # (n1, n2)
    area_boxes2 = area_boxes2.unsqueeze(0).expand_as(inter)  # (n1, n2)
    union = (area_boxes1 + area_boxes2 - inter)
    return inter / union


def random_crop(image, boxes, labels, difficulties=None):
    '''
    image: A PIL image
    boxes: Bounding boxes, a tensor of dimensions (#objects, 4)
    labels: labels of objects, a tensor of dimensions (#objects)
    difficulties: difficulties of detected objects, a tensor of dimensions (#objects)

    Out: cropped image, new boxes, new labels, new difficulties
    '''
    if type(image) == PIL.Image.Image:
        image = F.to_tensor(image)
    original_h = image.size(1)
    original_w = image.size(2)

    while True:
        mode = random.choice([0.1, 0.3, 0.5, 0.9, None])

        if mode is None:
            return F.to_pil_image(image), boxes, labels, difficulties

        new_image = image
        new_boxes = boxes
        new_difficulties = difficulties
        new_labels = labels
        for _ in range(50):
            # Crop dimensions: [0.3, 1] of original dimensions
            new_h = random.uniform(0.3 * original_h, original_h)
            new_w = random.uniform(0.3 * original_w, original_w)

            # Aspect ratio constraint b/t .5 & 2
            if new_h / new_w < 0.5 or new_h / new_w > 2:
                continue

            # Crop coordinates
            left = random.uniform(0, original_w - new_w)
            right = left + new_w
            top = random.uniform(0, original_h - new_h)
            bottom = top + new_h
            crop = torch.FloatTensor([int(left), int(top), int(right), int(bottom)])

            # Calculate IoU between the crop and the bounding boxes
            overlap = find_IoU(crop.unsqueeze(0), boxes)  # (1, #objects)
            overlap = overlap.squeeze(0)

            # If not a single bounding box has an IoU greater than the minimum, try again
            if overlap.shape[0] == 0:
                continue
            if overlap.max().item() < mode:
                continue

            # Crop
            new_image = image[:, int(top):int(bottom), int(left):int(right)]  # (3, new_h, new_w)

            # Center of bounding boxes
            center_bb = (boxes[:, :2] + boxes[:, 2:]) / 2.0

            # Find bounding boxes whose center lies inside the crop
            center_in_crop = (center_bb[:, 0] > left) * (center_bb[:, 0] < right
                             ) * (center_bb[:, 1] > top) * (center_bb[:, 1] < bottom)  # (#objects)

            if not center_in_crop.any():
                continue

            # Take matching bounding boxes
            new_boxes = boxes[center_in_crop, :]

            # Take matching labels
            new_labels = labels[center_in_crop]

            # Take matching difficulties
            if difficulties is not None:
                new_difficulties = difficulties[center_in_crop]
            else:
                new_difficulties = None

            # Use the box left and top corner or the crop's
            new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])

            # Adjust to crop
            new_boxes[:, :2] -= crop[:2]

            new_boxes[:, 2:] = torch.min(new_boxes[:, 2:], crop[2:])

            # Adjust to crop
            new_boxes[:, 2:] -= crop[:2]

            return F.to_pil_image(new_image), new_boxes, new_labels, new_difficulties
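A quick illustrative check of the helpers above (not part of the commit); the IoU value follows directly from the formula in the docstring, and the import path is assumed from this repo layout.

import torch
from PIL import Image

from groundingdino.datasets.random_crop import find_IoU, random_crop  # import path assumed

boxes1 = torch.FloatTensor([[0., 0., 10., 10.]])
boxes2 = torch.FloatTensor([[5., 5., 15., 15.], [20., 20., 30., 30.]])
print(find_IoU(boxes1, boxes2))  # tensor([[0.1429, 0.0000]]): 25 / (100 + 100 - 25), then no overlap

img = Image.new("RGB", (100, 100))
boxes = torch.FloatTensor([[10., 10., 40., 40.], [50., 50., 90., 90.]])
labels = torch.tensor([0, 1])
new_img, new_boxes, new_labels, _ = random_crop(img, boxes, labels)  # boxes re-expressed in crop coords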
groundingdino/datasets/sltransform.py
ADDED
@@ -0,0 +1,247 @@
# modified from https://github.com/anhtuan85/Data-Augmentation-for-Object-Detection/blob/master/augmentation.ipynb

import PIL  # version 1.2.0
from PIL import Image  # version 6.1.0
import torch
import os
import torchvision.transforms.functional as F
import numpy as np
import random

from .random_crop import random_crop
from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh

class AdjustContrast:
    def __init__(self, contrast_factor):
        self.contrast_factor = contrast_factor

    def __call__(self, img, target):
        """
        img (PIL Image or Tensor): Image to be adjusted.
        """
        _contrast_factor = ((random.random() + 1.0) / 2.0) * self.contrast_factor
        img = F.adjust_contrast(img, _contrast_factor)
        return img, target

class AdjustBrightness:
    def __init__(self, brightness_factor):
        self.brightness_factor = brightness_factor

    def __call__(self, img, target):
        """
        img (PIL Image or Tensor): Image to be adjusted.
        """
        _brightness_factor = ((random.random() + 1.0) / 2.0) * self.brightness_factor
        img = F.adjust_brightness(img, _brightness_factor)
        return img, target

def lighting_noise(image):
    '''
    color channel swap in image
    image: A PIL image
    '''
    new_image = image
    perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2),
             (1, 2, 0), (2, 0, 1), (2, 1, 0))
    swap = perms[random.randint(0, len(perms) - 1)]
    new_image = F.to_tensor(new_image)
    new_image = new_image[swap, :, :]
    new_image = F.to_pil_image(new_image)
    return new_image

class LightingNoise:
    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        return lighting_noise(img), target


def rotate(image, boxes, angle):
    '''
    Rotate image and bounding boxes
    image: A PIL image (w, h)
    boxes: A tensor of dimensions (#objects, 4)

    Out: rotated image (w, h), rotated boxes
    '''
    new_image = image.copy()
    new_boxes = boxes.clone()

    # Rotate image, expand = True
    w = image.width
    h = image.height
    cx = w / 2
    cy = h / 2
    new_image = new_image.rotate(angle, expand=True)
    angle = np.radians(angle)
    alpha = np.cos(angle)
    beta = np.sin(angle)
    # Get affine matrix
    AffineMatrix = torch.tensor([[alpha, beta, (1 - alpha) * cx - beta * cy],
                                 [-beta, alpha, beta * cx + (1 - alpha) * cy]])

    # Rotate boxes
    box_width = (boxes[:, 2] - boxes[:, 0]).reshape(-1, 1)
    box_height = (boxes[:, 3] - boxes[:, 1]).reshape(-1, 1)

    # Get corners for boxes
    x1 = boxes[:, 0].reshape(-1, 1)
    y1 = boxes[:, 1].reshape(-1, 1)

    x2 = x1 + box_width
    y2 = y1

    x3 = x1
    y3 = y1 + box_height

    x4 = boxes[:, 2].reshape(-1, 1)
    y4 = boxes[:, 3].reshape(-1, 1)

    corners = torch.stack((x1, y1, x2, y2, x3, y3, x4, y4), dim=1)
    # corners.reshape(-1, 8)  # Tensors of dimensions (#objects, 8)
    corners = corners.reshape(-1, 2)  # Tensors of dimension (4 * #objects, 2)
    corners = torch.cat((corners, torch.ones(corners.shape[0], 1)), dim=1)  # Tensors of dimension (4 * #objects, 3)

    cos = np.abs(AffineMatrix[0, 0])
    sin = np.abs(AffineMatrix[0, 1])

    nW = int((h * sin) + (w * cos))
    nH = int((h * cos) + (w * sin))
    AffineMatrix[0, 2] += (nW / 2) - cx
    AffineMatrix[1, 2] += (nH / 2) - cy

    # Apply affine transform
    rotate_corners = torch.mm(AffineMatrix, corners.t().to(torch.float64)).t()
    rotate_corners = rotate_corners.reshape(-1, 8)

    x_corners = rotate_corners[:, [0, 2, 4, 6]]
    y_corners = rotate_corners[:, [1, 3, 5, 7]]

    # Get (x_min, y_min, x_max, y_max)
    x_min, _ = torch.min(x_corners, dim=1)
    x_min = x_min.reshape(-1, 1)
    y_min, _ = torch.min(y_corners, dim=1)
    y_min = y_min.reshape(-1, 1)
    x_max, _ = torch.max(x_corners, dim=1)
    x_max = x_max.reshape(-1, 1)
    y_max, _ = torch.max(y_corners, dim=1)
    y_max = y_max.reshape(-1, 1)

    new_boxes = torch.cat((x_min, y_min, x_max, y_max), dim=1)

    scale_x = new_image.width / w
    scale_y = new_image.height / h

    # Resize new image to (w, h)
    new_image = new_image.resize((w, h))

    # Resize boxes
    new_boxes /= torch.Tensor([scale_x, scale_y, scale_x, scale_y])
    new_boxes[:, 0] = torch.clamp(new_boxes[:, 0], 0, w)
    new_boxes[:, 1] = torch.clamp(new_boxes[:, 1], 0, h)
    new_boxes[:, 2] = torch.clamp(new_boxes[:, 2], 0, w)
    new_boxes[:, 3] = torch.clamp(new_boxes[:, 3], 0, h)
    return new_image, new_boxes

# def convert_xywh_to_xyxy(boxes: torch.Tensor):
#     _boxes = boxes.clone()
#     box_xy = _boxes[:, :2]
#     box_wh = _boxes[:, 2:]
#     box_x1y1 = box_xy - box_wh/2
#     box_x2y2 = box_xy + box_wh/2
#     box_xyxy = torch.cat((box_x1y1, box_x2y2), dim=-1)
#     return box_xyxy

class Rotate:
    def __init__(self, angle=10) -> None:
        self.angle = angle

    def __call__(self, img, target):
        w, h = img.size
        whwh = torch.Tensor([w, h, w, h])
        boxes_xyxy = box_cxcywh_to_xyxy(target['boxes']) * whwh
        img, boxes_new = rotate(img, boxes_xyxy, self.angle)
        target['boxes'] = box_xyxy_to_cxcywh(boxes_new).to(boxes_xyxy.dtype) / (whwh + 1e-3)
        return img, target


class RandomCrop:
    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        w, h = img.size
        try:
            boxes_xyxy = target['boxes']
            labels = target['labels']
            img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
            target['boxes'] = new_boxes
            target['labels'] = new_labels
        except Exception as e:
            pass
        return img, target


class RandomCropDebug:
    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        boxes_xyxy = target['boxes'].clone()
        labels = target['labels'].clone()
        img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
        target['boxes'] = new_boxes
        target['labels'] = new_labels

        return img, target

class RandomSelectMulti(object):
    """
    Randomly selects one transform from transformslist.
    """
    def __init__(self, transformslist, p=-1):
        self.transformslist = transformslist
        self.p = p
        assert p == -1

    def __call__(self, img, target):
        if self.p == -1:
            return random.choice(self.transformslist)(img, target)


class Albumentations:
    def __init__(self):
        import albumentations as A
        self.transform = A.Compose([
            A.Blur(p=0.01),
            A.MedianBlur(p=0.01),
            A.ToGray(p=0.01),
            A.CLAHE(p=0.01),
            A.RandomBrightnessContrast(p=0.005),
            A.RandomGamma(p=0.005),
            A.ImageCompression(quality_lower=75, p=0.005)],
            bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))

    def __call__(self, img, target, p=1.0):
        """
        Input:
            target['boxes']: xyxy, unnormalized data.
        """
        boxes_raw = target['boxes']
        labels_raw = target['labels']
        img_np = np.array(img)
        img_new = img  # fall back to the original image if the transform is not applied
        if self.transform and random.random() < p:
            new_res = self.transform(image=img_np, bboxes=boxes_raw, class_labels=labels_raw)  # transformed
            boxes_new = torch.Tensor(new_res['bboxes']).to(boxes_raw.dtype).reshape_as(boxes_raw)
            img_np = new_res['image']
            labels_new = torch.Tensor(new_res['class_labels']).to(labels_raw.dtype)
            img_new = Image.fromarray(img_np)
            target['boxes'] = boxes_new
            target['labels'] = labels_new

        return img_new, target
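Not part of the commit: a small sketch showing how these ops compose, mirroring the strong_aug branch of make_coco_transforms earlier in this diff. Each op takes and returns an (img, target) pair; the import path and target layout are assumptions based on the code above.

import torch
from PIL import Image

import groundingdino.datasets.sltransform as SLT  # import path assumed from this repo layout

aug = SLT.RandomSelectMulti([
    SLT.RandomCrop(),
    SLT.LightingNoise(),
    SLT.AdjustBrightness(2),
    SLT.AdjustContrast(2),
])

img = Image.new("RGB", (640, 480), color=(128, 128, 128))
target = {"boxes": torch.FloatTensor([[100., 100., 200., 200.]]),  # xyxy pixels at this stage
          "labels": torch.tensor([3])}
img, target = aug(img, target)  # exactly one of the four ops is applied per call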
groundingdino/datasets/transforms.py
CHANGED
@@ -2,7 +2,6 @@
 """
 Transforms and data augmentation for both image + bbox.
 """
-import os
 import random
 
 import PIL
@@ -10,8 +9,8 @@ import torch
 import torchvision.transforms as T
 import torchvision.transforms.functional as F
 
-from …
-from …
+from util.box_ops import box_xyxy_to_cxcywh
+from util.misc import interpolate
 
 
 def crop(image, target, region):
@@ -23,7 +22,7 @@ def crop(image, target, region):
     # should we do something wrt the original size?
     target["size"] = torch.tensor([h, w])
 
-    fields = ["labels", "area" …
+    fields = ["labels", "area"]
 
     if "boxes" in target:
         boxes = target["boxes"]
@@ -38,29 +37,22 @@ def crop(image, target, region):
 
     if "masks" in target:
         # FIXME should we update the area here if there are no boxes?
-        target[ …
+        target['masks'] = target['masks'][:, i:i + h, j:j + w]
         fields.append("masks")
 
+
     # remove elements for which the boxes or masks that have zero area
     if "boxes" in target or "masks" in target:
         # favor boxes selection when defining which elements to keep
         # this is compatible with previous implementation
         if "boxes" in target:
-            cropped_boxes = target[ …
+            cropped_boxes = target['boxes'].reshape(-1, 2, 2)
             keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
         else:
-            keep = target[ …
+            keep = target['masks'].flatten(1).any(1)
 
         for field in fields:
-            …
-            target[field] = target[field][keep]
-
-        if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO":
-            # for debug and visualization only.
-            if "strings_positive" in target:
-                target["strings_positive"] = [
-                    _i for _i, _j in zip(target["strings_positive"], keep) if _j
-                ]
+            target[field] = target[field][keep]
 
     return cropped_image, target
 
@@ -73,13 +65,11 @@ def hflip(image, target):
     target = target.copy()
     if "boxes" in target:
         boxes = target["boxes"]
-        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor(
-            [w, 0, w, 0]
-        )
+        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
         target["boxes"] = boxes
 
     if "masks" in target:
-        target[ …
+        target['masks'] = target['masks'].flip(-1)
 
     return flipped_image, target
 
@@ -125,9 +115,7 @@ def resize(image, target, size, max_size=None):
     target = target.copy()
     if "boxes" in target:
         boxes = target["boxes"]
-        scaled_boxes = boxes * torch.as_tensor(
-            [ratio_width, ratio_height, ratio_width, ratio_height]
-        )
+        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
         target["boxes"] = scaled_boxes
 
     if "area" in target:
@@ -139,9 +127,8 @@ def resize(image, target, size, max_size=None):
     target["size"] = torch.tensor([h, w])
 
     if "masks" in target:
-        target[ …
-        …
-        )
+        target['masks'] = interpolate(
+            target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
 
     return rescaled_image, target
 
@@ -155,7 +142,7 @@ def pad(image, target, padding):
     # should we do something wrt the original size?
     target["size"] = torch.tensor(padded_image.size[::-1])
     if "masks" in target:
-        target[ …
+        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
     return padded_image, target
 
 
@@ -177,28 +164,15 @@ class RandomCrop(object):
 
 
 class RandomSizeCrop(object):
-    def __init__(self, min_size: int, max_size: int …
-        # respect_boxes: True to keep all boxes
-        #                False to tolerence box filter
+    def __init__(self, min_size: int, max_size: int):
         self.min_size = min_size
         self.max_size = max_size
-        self.respect_boxes = respect_boxes
 
     def __call__(self, img: PIL.Image.Image, target: dict):
-        …
-        h = random.randint(self.min_size, min(img.height, self.max_size))
-        region = T.RandomCrop.get_params(img, [h, w])
-        result_img, result_target = crop(img, target, region)
-        if (
-            not self.respect_boxes
-            or len(result_target["boxes"]) == init_boxes
-            or i == max_patience - 1
-        ):
-            return result_img, result_target
-        return result_img, result_target
+        w = random.randint(self.min_size, min(img.width, self.max_size))
+        h = random.randint(self.min_size, min(img.height, self.max_size))
+        region = T.RandomCrop.get_params(img, [h, w])
+        return crop(img, target, region)
 
 
 class CenterCrop(object):
@@ -208,8 +182,8 @@ class CenterCrop(object):
     def __call__(self, img, target):
         image_width, image_height = img.size
         crop_height, crop_width = self.size
-        crop_top = int(round((image_height - crop_height) / 2. …
-        crop_left = int(round((image_width - crop_width) / 2. …
+        crop_top = int(round((image_height - crop_height) / 2.))
+        crop_left = int(round((image_width - crop_width) / 2.))
         return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
 
 
@@ -249,7 +223,6 @@ class RandomSelect(object):
     Randomly selects between transforms1 and transforms2,
     with probability p for transforms1 and (1 - p) for transforms2
     """
-
     def __init__(self, transforms1, transforms2, p=0.5):
         self.transforms1 = transforms1
         self.transforms2 = transforms2
@@ -267,6 +240,7 @@ class ToTensor(object):
 
 
 class RandomErasing(object):
+
     def __init__(self, *args, **kwargs):
         self.eraser = T.RandomErasing(*args, **kwargs)
 
groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py
DELETED
@@ -1,18 +0,0 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .GroundingDINO import build_groundingdino


def build_model(args):
    # we use register to maintain models from catdet6 on.
    from .registry import MODULE_BUILD_FUNCS

    assert args.modelname in MODULE_BUILD_FUNCS._module_dict
    build_func = MODULE_BUILD_FUNCS.get(args.modelname)
    model = build_func(args)
    return model
groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py
DELETED
@@ -1,66 +0,0 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Author: Yihao Chen
# @Date: 2021-08-16 16:03:17
# @Last Modified by: Shilong Liu
# @Last Modified time: 2022-01-23 15:26
# modified from mmcv

import inspect
from functools import partial


class Registry(object):
    def __init__(self, name):
        self._name = name
        self._module_dict = dict()

    def __repr__(self):
        format_str = self.__class__.__name__ + "(name={}, items={})".format(
            self._name, list(self._module_dict.keys())
        )
        return format_str

    def __len__(self):
        return len(self._module_dict)

    @property
    def name(self):
        return self._name

    @property
    def module_dict(self):
        return self._module_dict

    def get(self, key):
        return self._module_dict.get(key, None)

    def registe_with_name(self, module_name=None, force=False):
        return partial(self.register, module_name=module_name, force=force)

    def register(self, module_build_function, module_name=None, force=False):
        """Register a module build function.
        Args:
            module (:obj:`nn.Module`): Module to be registered.
        """
        if not inspect.isfunction(module_build_function):
            raise TypeError(
                "module_build_function must be a function, but got {}".format(
                    type(module_build_function)
                )
            )
        if module_name is None:
            module_name = module_build_function.__name__
        if not force and module_name in self._module_dict:
            raise KeyError("{} is already registered in {}".format(module_name, self.name))
        self._module_dict[module_name] = module_build_function

        return module_build_function


MODULE_BUILD_FUNCS = Registry("model build functions")
groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py
ADDED
@@ -0,0 +1,273 @@
# ------------------------------------------------------------------------
# Grounding DINO
# url: https://github.com/IDEA-Research/GroundingDINO
# Copyright (c) 2023 IDEA. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

import torch
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import Tensor, nn
from torchvision.ops.boxes import nms
from transformers import BertConfig, BertModel, BertPreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions


class BertModelWarper(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        # self.bert = bert_modelc

        self.config = bert_model.config
        self.embeddings = bert_model.embeddings
        self.encoder = bert_model.encoder
        self.pooler = bert_model.pooler

        self.get_extended_attention_mask = bert_model.get_extended_attention_mask
        self.invert_attention_mask = bert_model.invert_attention_mask
        self.get_head_mask = bert_model.get_head_mask

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = (
            output_attentions if output_attentions is not None else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = (
            past_key_values[0][0].shape[2] if past_key_values is not None else 0
        )

        if attention_mask is None:
            attention_mask = torch.ones(
                ((batch_size, seq_length + past_key_values_length)), device=device
            )
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, input_shape, device
        )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None
        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
        #     import ipdb; ipdb.set_trace()

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class TextEncoderShell(nn.Module):
    def __init__(self, text_encoder):
        super().__init__()
        self.text_encoder = text_encoder
        self.config = self.text_encoder.config

    def forward(self, **kw):
        # feed into text encoder
        return self.text_encoder(**kw)


def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer):
    """Generate attention mask between each pair of special tokens
    Args:
        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
        special_tokens_mask (list): special tokens mask.
    Returns:
        torch.Tensor: attention mask between each special tokens.
    """
    input_ids = tokenized["input_ids"]
    bs, num_token = input_ids.shape
    # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
    special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
    for special_token in special_tokens_list:
        special_tokens_mask |= input_ids == special_token

    # idxs: each row is a list of indices of special tokens
    idxs = torch.nonzero(special_tokens_mask)

    # generate attention mask and positional ids
    attention_mask = (
        torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
    )
    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
    previous_col = 0
    for i in range(idxs.shape[0]):
        row, col = idxs[i]
        if (col == 0) or (col == num_token - 1):
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
                0, col - previous_col, device=input_ids.device
            )

        previous_col = col

    # # padding mask
    # padding_mask = tokenized['attention_mask']
    # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()

    return attention_mask, position_ids.to(torch.long)


def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer):
    """Generate attention mask between each pair of special tokens
    Args:
        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
        special_tokens_mask (list): special tokens mask.
    Returns:
        torch.Tensor: attention mask between each special tokens.
    """
    input_ids = tokenized["input_ids"]
    bs, num_token = input_ids.shape
    # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
    special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
    for special_token in special_tokens_list:
        special_tokens_mask |= input_ids == special_token

    # idxs: each row is a list of indices of special tokens
    idxs = torch.nonzero(special_tokens_mask)

    # generate attention mask and positional ids
    attention_mask = (
        torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
    )
    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
    cate_to_token_mask_list = [[] for _ in range(bs)]
    previous_col = 0
    for i in range(idxs.shape[0]):
        row, col = idxs[i]
        if (col == 0) or (col == num_token - 1):
            attention_mask[row, col, col] = True
            position_ids[row, col] = 0
        else:
            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
                0, col - previous_col, device=input_ids.device
            )
            c2t_maski = torch.zeros((num_token), device=input_ids.device).bool()
            c2t_maski[previous_col + 1 : col] = True
            cate_to_token_mask_list[row].append(c2t_maski)
        previous_col = col

    cate_to_token_mask_list = [
        torch.stack(cate_to_token_mask_listi, dim=0)
        for cate_to_token_mask_listi in cate_to_token_mask_list
    ]

    # # padding mask
    # padding_mask = tokenized['attention_mask']
    # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()

    return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list
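Not part of the commit: a sketch of how the mask generator above is typically called. The bert-base-uncased tokenizer and the choice of special tokens ([CLS], [SEP], ".") are assumptions for illustration, and the non-checkpoint import path is assumed from this repo layout.

from transformers import BertTokenizer

from groundingdino.models.GroundingDINO.bertwarper import (
    generate_masks_with_special_tokens_and_transfer_map,  # import path assumed
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed text backbone
tokenized = tokenizer(["stadium . baseball field ."], padding="longest", return_tensors="pt")
special_ids = [tokenizer.cls_token_id, tokenizer.sep_token_id,
               tokenizer.convert_tokens_to_ids(".")]            # assumed phrase separators

attn, pos_ids, cate_to_token = generate_masks_with_special_tokens_and_transfer_map(
    tokenized, special_ids, tokenizer)
# attn is [1, num_token, num_token] and block-diagonal over the two phrases;
# pos_ids restart from 0 inside each phrase; cate_to_token[0] maps each phrase to its tokens.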
groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py
CHANGED
@@ -20,9 +20,9 @@ class FeatureResizer(nn.Module):
     def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
         super().__init__()
         self.do_ln = do_ln
+        r = 12
         # Object feature encoding
-        r = …
-        self.fc = lora.Linear(input_feat_size, output_feat_size,r=r , bias=True)
+        self.fc = lora.Linear(input_feat_size, output_feat_size,r=r, bias=True)
         self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
         self.dropout = nn.Dropout(dropout)
 
@@ -112,14 +112,14 @@ class BiMultiHeadAttention(nn.Module):
         ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
         self.scale = self.head_dim ** (-0.5)
         self.dropout = dropout
-        r = …
+        r = 12
         self.v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
-        self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r)
-        self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
-        self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r)
+        self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
+        self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r )
+        self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
 
-        self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r)
-        self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r)
+        self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r )
+        self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r )
 
         self.stable_softmax_2d = True
         self.clamp_min_for_underflow = True
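Not part of the commit: the swap from nn.Linear to lora.Linear with r = 12 in the fusion projections above is the PEFT hook for this Space. Assuming the lora module used here is Microsoft's loralib package, training then usually freezes everything except the low-rank adapter matrices, roughly as in this sketch; build_groundingdino(args) stands in for however the Space actually constructs the model.

import torch
import loralib as lora  # assumption: the `lora` used above is https://github.com/microsoft/LoRA

model = build_groundingdino(args)            # hypothetical: however this Space builds GroundingDINO
lora.mark_only_lora_as_trainable(model)      # freeze base weights; leave lora_A / lora_B trainable
print(sum(p.numel() for p in model.parameters() if p.requires_grad), "trainable parameters")
torch.save(lora.lora_state_dict(model), "lora_rsvg.pt")  # checkpoint only the adapter weights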