Hasanmog committed
Commit e33160d · 1 Parent(s): 2ebac4f
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .ipynb_checkpoints/README-checkpoint.md +11 -0
  2. .ipynb_checkpoints/README-checkpoint.txt +0 -1
  3. .ipynb_checkpoints/requirements-checkpoint.txt +0 -17
  4. .ipynb_checkpoints/test-checkpoint.ipynb +0 -113
  5. README.md +10 -0
  6. README.txt +0 -1
  7. app.py +0 -125
  8. groundingdino.egg-info/PKG-INFO +0 -213
  9. groundingdino.egg-info/SOURCES.txt +0 -46
  10. groundingdino.egg-info/requires.txt +0 -10
  11. groundingdino.egg-info/top_level.txt +0 -1
  12. groundingdino/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
  13. groundingdino/.ipynb_checkpoints/version-checkpoint.py +0 -1
  14. groundingdino/__init__.py +0 -0
  15. groundingdino/__pycache__/__init__.cpython-310.pyc +0 -0
  16. groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py +0 -43
  17. groundingdino/config/GroundingDINO_SwinB_cfg.py +0 -43
  18. groundingdino/config/GroundingDINO_SwinT_OGC.py +0 -43
  19. groundingdino/config/__init__.py +0 -0
  20. groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py +23 -0
  21. groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py +649 -0
  22. groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py +44 -0
  23. groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py +258 -0
  24. groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py +285 -0
  25. groundingdino/datasets/__init__.py +23 -0
  26. groundingdino/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  27. groundingdino/datasets/__pycache__/coco.cpython-310.pyc +0 -0
  28. groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc +0 -0
  29. groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc +0 -0
  30. groundingdino/datasets/__pycache__/data_util.cpython-310.pyc +0 -0
  31. groundingdino/datasets/__pycache__/odvg.cpython-310.pyc +0 -0
  32. groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc +0 -0
  33. groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc +0 -0
  34. groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc +0 -0
  35. groundingdino/datasets/__pycache__/transforms.cpython-310.pyc +0 -0
  36. groundingdino/datasets/coco.py +649 -0
  37. groundingdino/datasets/coco_eval.py +266 -0
  38. groundingdino/datasets/coco_panoptic.py +99 -0
  39. groundingdino/datasets/cocogrounding_eval.py +3 -1
  40. groundingdino/datasets/data_util.py +170 -0
  41. groundingdino/datasets/dataset.py +44 -0
  42. groundingdino/datasets/odvg.py +258 -0
  43. groundingdino/datasets/panoptic_eval.py +44 -0
  44. groundingdino/datasets/random_crop.py +135 -0
  45. groundingdino/datasets/sltransform.py +247 -0
  46. groundingdino/datasets/transforms.py +22 -48
  47. groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py +0 -18
  48. groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py +0 -66
  49. groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py +273 -0
  50. groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py +8 -8
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: My Awesome Space
+ emoji: 🚀
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.9.0
+ app_file: app.py
+ pinned: false
+ ---
+
.ipynb_checkpoints/README-checkpoint.txt DELETED
@@ -1 +0,0 @@
- Peft-ed Grounding DINO on RSVG dataset
.ipynb_checkpoints/requirements-checkpoint.txt DELETED
@@ -1,17 +0,0 @@
- cython
- submitit
- scipy
- termcolor
- addict
- yapf==0.40.1
- timm
- torch
- torchvision
- transformers
- numpy
- opencv-python
- supervision==0.6.0
- pycocotools
- pyyaml>3.10
- colorlog
- loralib
.ipynb_checkpoints/test-checkpoint.ipynb DELETED
@@ -1,113 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "final text_encoder_type: bert-base-uncased\n"
- ]
- },
- {
- "data": {
- "application/json": {
- "ascii": false,
- "bar_format": null,
- "colour": null,
- "elapsed": 0.014210224151611328,
- "initial": 0,
- "n": 0,
- "ncols": null,
- "nrows": null,
- "postfix": null,
- "prefix": "Downloading model.safetensors",
- "rate": null,
- "total": 440449768,
- "unit": "B",
- "unit_divisor": 1000,
- "unit_scale": true
- },
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "5922f34578364d36afa13de9f01254bd",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Downloading model.safetensors: 0%| | 0.00/440M [00:00<?, ?B/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/root/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:881: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
- " warnings.warn(\n",
- "/root/miniconda3/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
- " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from groundingdino.util.inference import load_model, load_image, predict, annotate\n",
- "import cv2\n",
- "\n",
- "model = load_model(\"groundingdino/config/GroundingDINO_SwinT_OGC.py\", \"../04-06-segment-anything/weights/groundingdino_swint_ogc.pth\")\n",
- "IMAGE_PATH = \".asset/cat_dog.jpeg\"\n",
- "TEXT_PROMPT = \"chair . person . dog .\"\n",
- "BOX_TRESHOLD = 0.35\n",
- "TEXT_TRESHOLD = 0.25\n",
- "\n",
- "image_source, image = load_image(IMAGE_PATH)\n",
- "\n",
- "boxes, logits, phrases = predict(\n",
- " model=model,\n",
- " image=image,\n",
- " caption=TEXT_PROMPT,\n",
- " box_threshold=BOX_TRESHOLD,\n",
- " text_threshold=TEXT_TRESHOLD\n",
- ")\n",
- "\n",
- "annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)\n",
- "cv2.imwrite(\"annotated_image.jpg\", annotated_frame)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "base",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: My Awesome Space
+ emoji: 🚀
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.36.1
+ app_file: app.py
+ pinned: false
+ ---
README.txt DELETED
@@ -1 +0,0 @@
- Peft-ed Grounding DINO on RSVG dataset
app.py DELETED
@@ -1,125 +0,0 @@
- import argparse
- from functools import partial
- import cv2
- import requests
- import os
- from io import BytesIO
- from PIL import Image
- import numpy as np
- from pathlib import Path
-
-
- import warnings
-
- import torch
-
- # prepare the environment
- os.system("python setup.py build develop --user")
- os.system("pip install packaging==21.3")
- os.system("pip install gradio")
-
-
- warnings.filterwarnings("ignore")
-
- import gradio as gr
-
- from groundingdino.models import build_model
- from groundingdino.util.slconfig import SLConfig
- from groundingdino.util.utils import clean_state_dict
- from groundingdino.util.inference import annotate, load_image, predict
- import groundingdino.datasets.transforms as T
-
- from huggingface_hub import hf_hub_download
-
-
-
- # Use this command for evaluate the Grounding DINO model
- config_file = "groundingdino/config/GroundingDINO_SwinB_OGC.py"
- ckpt_repo_id = "Hasanmog/Peft-GroundingDINO"
- ckpt_filenmae = "Best.pth"
-
-
- def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
-     args = SLConfig.fromfile(model_config_path)
-     model = build_model(args)
-     args.device = device
-
-     cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
-     checkpoint = torch.load(cache_file, map_location='cpu')
-     log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
-     print("Model loaded from {} \n => {}".format(cache_file, log))
-     _ = model.eval()
-     return model
-
- def image_transform_grounding(init_image):
-     transform = T.Compose([
-         T.RandomResize([800], max_size=1333),
-         T.ToTensor(),
-         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-     ])
-     image, _ = transform(init_image, None) # 3, h, w
-     return init_image, image
-
- def image_transform_grounding_for_vis(init_image):
-     transform = T.Compose([
-         T.RandomResize([800], max_size=1333),
-     ])
-     image, _ = transform(init_image, None) # 3, h, w
-     return image
-
- model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
-
- def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
-     init_image = input_image.convert("RGB")
-     original_size = init_image.size
-
-     _, image_tensor = image_transform_grounding(init_image)
-     image_pil: Image = image_transform_grounding_for_vis(init_image)
-
-     # run grounidng
-     boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
-     annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
-     image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))
-
-
-     return image_with_box
-
- if __name__ == "__main__":
-
-     parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
-     parser.add_argument("--debug", action="store_true", help="using debug mode")
-     parser.add_argument("--share", action="store_true", help="share the app")
-     args = parser.parse_args()
-
-     block = gr.Blocks().queue()
-     with block:
-         gr.Markdown("# [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)")
-         gr.Markdown("### Open-World Detection with Grounding DINO")
-
-         with gr.Row():
-             with gr.Column():
-                 input_image = gr.Image(source='upload', type="pil")
-                 grounding_caption = gr.Textbox(label="Detection Prompt")
-                 run_button = gr.Button(label="Run")
-                 with gr.Accordion("Advanced options", open=False):
-                     box_threshold = gr.Slider(
-                         label="Box Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
-                     )
-                     text_threshold = gr.Slider(
-                         label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
-                     )
-
-             with gr.Column():
-                 gallery = gr.outputs.Image(
-                     type="pil",
-                     # label="grounding results"
-                 ).style(full_width=True, full_height=True)
-                 # gallery = gr.Gallery(label="Generated images", show_label=False).style(
-                 #     grid=[1], height="auto", container=True, full_width=True, full_height=True)
-
-         run_button.click(fn=run_grounding, inputs=[
-             input_image, grounding_caption, box_threshold, text_threshold], outputs=[gallery])
-
-
-     block.launch(server_name='0.0.0.0', server_port=7579, debug=args.debug, share=args.share)
-
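For readers of the deleted app.py above: a minimal, hedged sketch of how its prediction path could be exercised without the Gradio UI, assuming the module's globals (config_file, ckpt_repo_id, ckpt_filenmae) and helpers are in scope. The image path, caption, and thresholds below are illustrative placeholders, not values from this commit.

# Illustrative sketch only -- reuses the helpers defined in app.py above.
from PIL import Image

model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)   # downloads Best.pth from the HF Hub
image = Image.open("example.jpg")                                  # placeholder input image
annotated = run_grounding(image, "dog . person .", 0.25, 0.25)     # caption and thresholds are examples
annotated.save("example_annotated.jpg")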
groundingdino.egg-info/PKG-INFO DELETED
@@ -1,213 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: groundingdino
3
- Version: 0.1.0
4
- Summary: open-set object detector
5
- Home-page: https://github.com/IDEA-Research/GroundingDINO
6
- Author: International Digital Economy Academy, Shilong Liu
7
- License: Apache License
8
- Version 2.0, January 2004
9
- http://www.apache.org/licenses/
10
-
11
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
12
-
13
- 1. Definitions.
14
-
15
- "License" shall mean the terms and conditions for use, reproduction,
16
- and distribution as defined by Sections 1 through 9 of this document.
17
-
18
- "Licensor" shall mean the copyright owner or entity authorized by
19
- the copyright owner that is granting the License.
20
-
21
- "Legal Entity" shall mean the union of the acting entity and all
22
- other entities that control, are controlled by, or are under common
23
- control with that entity. For the purposes of this definition,
24
- "control" means (i) the power, direct or indirect, to cause the
25
- direction or management of such entity, whether by contract or
26
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
27
- outstanding shares, or (iii) beneficial ownership of such entity.
28
-
29
- "You" (or "Your") shall mean an individual or Legal Entity
30
- exercising permissions granted by this License.
31
-
32
- "Source" form shall mean the preferred form for making modifications,
33
- including but not limited to software source code, documentation
34
- source, and configuration files.
35
-
36
- "Object" form shall mean any form resulting from mechanical
37
- transformation or translation of a Source form, including but
38
- not limited to compiled object code, generated documentation,
39
- and conversions to other media types.
40
-
41
- "Work" shall mean the work of authorship, whether in Source or
42
- Object form, made available under the License, as indicated by a
43
- copyright notice that is included in or attached to the work
44
- (an example is provided in the Appendix below).
45
-
46
- "Derivative Works" shall mean any work, whether in Source or Object
47
- form, that is based on (or derived from) the Work and for which the
48
- editorial revisions, annotations, elaborations, or other modifications
49
- represent, as a whole, an original work of authorship. For the purposes
50
- of this License, Derivative Works shall not include works that remain
51
- separable from, or merely link (or bind by name) to the interfaces of,
52
- the Work and Derivative Works thereof.
53
-
54
- "Contribution" shall mean any work of authorship, including
55
- the original version of the Work and any modifications or additions
56
- to that Work or Derivative Works thereof, that is intentionally
57
- submitted to Licensor for inclusion in the Work by the copyright owner
58
- or by an individual or Legal Entity authorized to submit on behalf of
59
- the copyright owner. For the purposes of this definition, "submitted"
60
- means any form of electronic, verbal, or written communication sent
61
- to the Licensor or its representatives, including but not limited to
62
- communication on electronic mailing lists, source code control systems,
63
- and issue tracking systems that are managed by, or on behalf of, the
64
- Licensor for the purpose of discussing and improving the Work, but
65
- excluding communication that is conspicuously marked or otherwise
66
- designated in writing by the copyright owner as "Not a Contribution."
67
-
68
- "Contributor" shall mean Licensor and any individual or Legal Entity
69
- on behalf of whom a Contribution has been received by Licensor and
70
- subsequently incorporated within the Work.
71
-
72
- 2. Grant of Copyright License. Subject to the terms and conditions of
73
- this License, each Contributor hereby grants to You a perpetual,
74
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
75
- copyright license to reproduce, prepare Derivative Works of,
76
- publicly display, publicly perform, sublicense, and distribute the
77
- Work and such Derivative Works in Source or Object form.
78
-
79
- 3. Grant of Patent License. Subject to the terms and conditions of
80
- this License, each Contributor hereby grants to You a perpetual,
81
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
82
- (except as stated in this section) patent license to make, have made,
83
- use, offer to sell, sell, import, and otherwise transfer the Work,
84
- where such license applies only to those patent claims licensable
85
- by such Contributor that are necessarily infringed by their
86
- Contribution(s) alone or by combination of their Contribution(s)
87
- with the Work to which such Contribution(s) was submitted. If You
88
- institute patent litigation against any entity (including a
89
- cross-claim or counterclaim in a lawsuit) alleging that the Work
90
- or a Contribution incorporated within the Work constitutes direct
91
- or contributory patent infringement, then any patent licenses
92
- granted to You under this License for that Work shall terminate
93
- as of the date such litigation is filed.
94
-
95
- 4. Redistribution. You may reproduce and distribute copies of the
96
- Work or Derivative Works thereof in any medium, with or without
97
- modifications, and in Source or Object form, provided that You
98
- meet the following conditions:
99
-
100
- (a) You must give any other recipients of the Work or
101
- Derivative Works a copy of this License; and
102
-
103
- (b) You must cause any modified files to carry prominent notices
104
- stating that You changed the files; and
105
-
106
- (c) You must retain, in the Source form of any Derivative Works
107
- that You distribute, all copyright, patent, trademark, and
108
- attribution notices from the Source form of the Work,
109
- excluding those notices that do not pertain to any part of
110
- the Derivative Works; and
111
-
112
- (d) If the Work includes a "NOTICE" text file as part of its
113
- distribution, then any Derivative Works that You distribute must
114
- include a readable copy of the attribution notices contained
115
- within such NOTICE file, excluding those notices that do not
116
- pertain to any part of the Derivative Works, in at least one
117
- of the following places: within a NOTICE text file distributed
118
- as part of the Derivative Works; within the Source form or
119
- documentation, if provided along with the Derivative Works; or,
120
- within a display generated by the Derivative Works, if and
121
- wherever such third-party notices normally appear. The contents
122
- of the NOTICE file are for informational purposes only and
123
- do not modify the License. You may add Your own attribution
124
- notices within Derivative Works that You distribute, alongside
125
- or as an addendum to the NOTICE text from the Work, provided
126
- that such additional attribution notices cannot be construed
127
- as modifying the License.
128
-
129
- You may add Your own copyright statement to Your modifications and
130
- may provide additional or different license terms and conditions
131
- for use, reproduction, or distribution of Your modifications, or
132
- for any such Derivative Works as a whole, provided Your use,
133
- reproduction, and distribution of the Work otherwise complies with
134
- the conditions stated in this License.
135
-
136
- 5. Submission of Contributions. Unless You explicitly state otherwise,
137
- any Contribution intentionally submitted for inclusion in the Work
138
- by You to the Licensor shall be under the terms and conditions of
139
- this License, without any additional terms or conditions.
140
- Notwithstanding the above, nothing herein shall supersede or modify
141
- the terms of any separate license agreement you may have executed
142
- with Licensor regarding such Contributions.
143
-
144
- 6. Trademarks. This License does not grant permission to use the trade
145
- names, trademarks, service marks, or product names of the Licensor,
146
- except as required for reasonable and customary use in describing the
147
- origin of the Work and reproducing the content of the NOTICE file.
148
-
149
- 7. Disclaimer of Warranty. Unless required by applicable law or
150
- agreed to in writing, Licensor provides the Work (and each
151
- Contributor provides its Contributions) on an "AS IS" BASIS,
152
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
153
- implied, including, without limitation, any warranties or conditions
154
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
155
- PARTICULAR PURPOSE. You are solely responsible for determining the
156
- appropriateness of using or redistributing the Work and assume any
157
- risks associated with Your exercise of permissions under this License.
158
-
159
- 8. Limitation of Liability. In no event and under no legal theory,
160
- whether in tort (including negligence), contract, or otherwise,
161
- unless required by applicable law (such as deliberate and grossly
162
- negligent acts) or agreed to in writing, shall any Contributor be
163
- liable to You for damages, including any direct, indirect, special,
164
- incidental, or consequential damages of any character arising as a
165
- result of this License or out of the use or inability to use the
166
- Work (including but not limited to damages for loss of goodwill,
167
- work stoppage, computer failure or malfunction, or any and all
168
- other commercial damages or losses), even if such Contributor
169
- has been advised of the possibility of such damages.
170
-
171
- 9. Accepting Warranty or Additional Liability. While redistributing
172
- the Work or Derivative Works thereof, You may choose to offer,
173
- and charge a fee for, acceptance of support, warranty, indemnity,
174
- or other liability obligations and/or rights consistent with this
175
- License. However, in accepting such obligations, You may act only
176
- on Your own behalf and on Your sole responsibility, not on behalf
177
- of any other Contributor, and only if You agree to indemnify,
178
- defend, and hold each Contributor harmless for any liability
179
- incurred by, or claims asserted against, such Contributor by reason
180
- of your accepting any such warranty or additional liability.
181
-
182
- END OF TERMS AND CONDITIONS
183
-
184
- APPENDIX: How to apply the Apache License to your work.
185
-
186
- To apply the Apache License to your work, attach the following
187
- boilerplate notice, with the fields enclosed by brackets "[]"
188
- replaced with your own identifying information. (Don't include
189
- the brackets!) The text should be enclosed in the appropriate
190
- comment syntax for the file format. We also recommend that a
191
- file or class name and description of purpose be included on the
192
- same "printed page" as the copyright notice for easier
193
- identification within third-party archives.
194
-
195
- Copyright 2023 - present, IDEA Research.
196
-
197
- Licensed under the Apache License, Version 2.0 (the "License");
198
- you may not use this file except in compliance with the License.
199
- You may obtain a copy of the License at
200
-
201
- http://www.apache.org/licenses/LICENSE-2.0
202
-
203
- Unless required by applicable law or agreed to in writing, software
204
- distributed under the License is distributed on an "AS IS" BASIS,
205
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
206
- See the License for the specific language governing permissions and
207
- limitations under the License.
208
-
209
- Platform: UNKNOWN
210
- License-File: LICENSE
211
-
212
- UNKNOWN
213
-
groundingdino.egg-info/SOURCES.txt DELETED
@@ -1,46 +0,0 @@
- LICENSE
- README.md
- setup.py
- /home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/cuda_version.cu
- /home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/vision.cpp
- /home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp
- /home/jamada/jupyterlab/projects/gdino-peft/gdino-official/GroundingDINO/groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.cu
- groundingdino/__init__.py
- groundingdino/version.py
- groundingdino.egg-info/PKG-INFO
- groundingdino.egg-info/SOURCES.txt
- groundingdino.egg-info/dependency_links.txt
- groundingdino.egg-info/requires.txt
- groundingdino.egg-info/top_level.txt
- groundingdino/config/GroundingDINO_SwinB_cfg.py
- groundingdino/config/GroundingDINO_SwinT_OGC.py
- groundingdino/config/__init__.py
- groundingdino/datasets/__init__.py
- groundingdino/datasets/cocogrounding_eval.py
- groundingdino/datasets/transforms.py
- groundingdino/models/__init__.py
- groundingdino/models/registry.py
- groundingdino/models/GroundingDINO/__init__.py
- groundingdino/models/GroundingDINO/bertwarper.py
- groundingdino/models/GroundingDINO/fuse_modules.py
- groundingdino/models/GroundingDINO/groundingdino.py
- groundingdino/models/GroundingDINO/ms_deform_attn.py
- groundingdino/models/GroundingDINO/transformer.py
- groundingdino/models/GroundingDINO/transformer_vanilla.py
- groundingdino/models/GroundingDINO/utils.py
- groundingdino/models/GroundingDINO/backbone/__init__.py
- groundingdino/models/GroundingDINO/backbone/backbone.py
- groundingdino/models/GroundingDINO/backbone/position_encoding.py
- groundingdino/models/GroundingDINO/backbone/swin_transformer.py
- groundingdino/util/__init__.py
- groundingdino/util/box_ops.py
- groundingdino/util/get_tokenlizer.py
- groundingdino/util/inference.py
- groundingdino/util/logger.py
- groundingdino/util/misc.py
- groundingdino/util/slconfig.py
- groundingdino/util/slio.py
- groundingdino/util/time_counter.py
- groundingdino/util/utils.py
- groundingdino/util/visualizer.py
- groundingdino/util/vl_utils.py
groundingdino.egg-info/requires.txt DELETED
@@ -1,10 +0,0 @@
- addict
- numpy
- opencv-python
- pycocotools
- supervision
- timm
- torch
- torchvision
- transformers
- yapf
groundingdino.egg-info/top_level.txt DELETED
@@ -1 +0,0 @@
- groundingdino
groundingdino/.ipynb_checkpoints/__init__-checkpoint.py DELETED
File without changes
groundingdino/.ipynb_checkpoints/version-checkpoint.py DELETED
@@ -1 +0,0 @@
- __version__ = '0.1.0'
groundingdino/__init__.py DELETED
File without changes
groundingdino/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (182 Bytes)
 
groundingdino/config/.ipynb_checkpoints/GroundingDINO_SwinB_cfg-checkpoint.py DELETED
@@ -1,43 +0,0 @@
- batch_size = 1
- modelname = "groundingdino"
- backbone = "swin_B_384_22k"
- position_embedding = "sine"
- pe_temperatureH = 20
- pe_temperatureW = 20
- return_interm_indices = [1, 2, 3]
- backbone_freeze_keywords = None
- enc_layers = 6
- dec_layers = 6
- pre_norm = False
- dim_feedforward = 2048
- hidden_dim = 256
- dropout = 0.0
- nheads = 8
- num_queries = 900
- query_dim = 4
- num_patterns = 0
- num_feature_levels = 4
- enc_n_points = 4
- dec_n_points = 4
- two_stage_type = "standard"
- two_stage_bbox_embed_share = False
- two_stage_class_embed_share = False
- transformer_activation = "relu"
- dec_pred_bbox_embed_share = True
- dn_box_noise_scale = 1.0
- dn_label_noise_ratio = 0.5
- dn_label_coef = 1.0
- dn_bbox_coef = 1.0
- embed_init_tgt = True
- dn_labelbook_size = 2000
- max_text_len = 256
- text_encoder_type = "bert-base-uncased"
- use_text_enhancer = True
- use_fusion_layer = True
- use_checkpoint = True
- use_transformer_ckpt = True
- use_text_cross_attention = True
- text_dropout = 0.0
- fusion_dropout = 0.0
- fusion_droppath = 0.1
- sub_sentence_present = True
groundingdino/config/GroundingDINO_SwinB_cfg.py DELETED
@@ -1,43 +0,0 @@
- batch_size = 1
- modelname = "groundingdino"
- backbone = "swin_B_384_22k"
- position_embedding = "sine"
- pe_temperatureH = 20
- pe_temperatureW = 20
- return_interm_indices = [1, 2, 3]
- backbone_freeze_keywords = None
- enc_layers = 6
- dec_layers = 6
- pre_norm = False
- dim_feedforward = 2048
- hidden_dim = 256
- dropout = 0.0
- nheads = 8
- num_queries = 900
- query_dim = 4
- num_patterns = 0
- num_feature_levels = 4
- enc_n_points = 4
- dec_n_points = 4
- two_stage_type = "standard"
- two_stage_bbox_embed_share = False
- two_stage_class_embed_share = False
- transformer_activation = "relu"
- dec_pred_bbox_embed_share = True
- dn_box_noise_scale = 1.0
- dn_label_noise_ratio = 0.5
- dn_label_coef = 1.0
- dn_bbox_coef = 1.0
- embed_init_tgt = True
- dn_labelbook_size = 2000
- max_text_len = 256
- text_encoder_type = "bert-base-uncased"
- use_text_enhancer = True
- use_fusion_layer = True
- use_checkpoint = True
- use_transformer_ckpt = True
- use_text_cross_attention = True
- text_dropout = 0.0
- fusion_dropout = 0.0
- fusion_droppath = 0.1
- sub_sentence_present = True
groundingdino/config/GroundingDINO_SwinT_OGC.py DELETED
@@ -1,43 +0,0 @@
- batch_size = 1
- modelname = "groundingdino"
- backbone = "swin_T_224_1k"
- position_embedding = "sine"
- pe_temperatureH = 20
- pe_temperatureW = 20
- return_interm_indices = [1, 2, 3]
- backbone_freeze_keywords = None
- enc_layers = 6
- dec_layers = 6
- pre_norm = False
- dim_feedforward = 2048
- hidden_dim = 256
- dropout = 0.0
- nheads = 8
- num_queries = 900
- query_dim = 4
- num_patterns = 0
- num_feature_levels = 4
- enc_n_points = 4
- dec_n_points = 4
- two_stage_type = "standard"
- two_stage_bbox_embed_share = False
- two_stage_class_embed_share = False
- transformer_activation = "relu"
- dec_pred_bbox_embed_share = True
- dn_box_noise_scale = 1.0
- dn_label_noise_ratio = 0.5
- dn_label_coef = 1.0
- dn_bbox_coef = 1.0
- embed_init_tgt = True
- dn_labelbook_size = 2000
- max_text_len = 256
- text_encoder_type = "bert-base-uncased"
- use_text_enhancer = True
- use_fusion_layer = True
- use_checkpoint = True
- use_transformer_ckpt = True
- use_text_cross_attention = True
- text_dropout = 0.0
- fusion_dropout = 0.0
- fusion_droppath = 0.1
- sub_sentence_present = True
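These config modules are flat Python assignments rather than YAML; elsewhere in this repo (see app.py above) they are read with SLConfig and fed to build_model. Below is a minimal sketch under that assumption; the checkpoint path is a placeholder, not part of this commit.

# Sketch only: mirrors load_model_hf in app.py; the weights path is hypothetical.
import torch
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

cfg = SLConfig.fromfile("groundingdino/config/GroundingDINO_SwinT_OGC.py")      # the config file shown above
cfg.device = "cpu"
model = build_model(cfg)                                                         # consumes modelname, backbone, ...
state = torch.load("weights/groundingdino_swint_ogc.pth", map_location="cpu")    # placeholder checkpoint path
model.load_state_dict(clean_state_dict(state["model"]), strict=False)
model.eval()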
groundingdino/config/__init__.py DELETED
File without changes
groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py ADDED
@@ -0,0 +1,23 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ import torch.utils.data
+ import torchvision
+ from .coco import build as build_coco
+
+
+ def get_coco_api_from_dataset(dataset):
+     for _ in range(10):
+         # if isinstance(dataset, torchvision.datasets.CocoDetection):
+         #     break
+         if isinstance(dataset, torch.utils.data.Subset):
+             dataset = dataset.dataset
+     if isinstance(dataset, torchvision.datasets.CocoDetection):
+         return dataset.coco
+
+
+ def build_dataset(image_set, args, datasetinfo):
+     if datasetinfo["dataset_mode"] == 'coco':
+         return build_coco(image_set, args, datasetinfo)
+     if datasetinfo["dataset_mode"] == 'odvg':
+         from .odvg import build_odvg
+         return build_odvg(image_set, args, datasetinfo)
+     raise ValueError(f'dataset {args.dataset_file} not supported')
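build_dataset above dispatches on datasetinfo["dataset_mode"]; as a hedged illustration, a call site might look like the sketch below. The paths and the args fields are assumptions inferred from how build_coco consumes them, not values taken from this commit.

# Hypothetical call site for build_dataset; all paths and args values are placeholders.
from argparse import Namespace

datasetinfo = {
    "dataset_mode": "coco",                                    # or "odvg" for grounding-style annotations
    "root": "/data/coco/train2017",                            # placeholder image folder
    "anno": "/data/coco/annotations/instances_train2017.json"  # placeholder annotation file
}
args = Namespace(fix_size=False, strong_aug=False, masks=False, dataset_file="coco")
train_set = build_dataset("train", args, datasetinfo)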
groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py ADDED
@@ -0,0 +1,649 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ COCO dataset which returns image_id for evaluation.
4
+
5
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
6
+ """
7
+ if __name__=="__main__":
8
+ # for debug only
9
+ import os, sys
10
+ sys.path.append(os.path.dirname(sys.path[0]))
11
+ from torchvision.datasets.vision import VisionDataset
12
+
13
+ import json
14
+ from pathlib import Path
15
+ import random
16
+ import os
17
+ from typing import Any, Callable, List, Optional, Tuple
18
+
19
+ from PIL import Image
20
+
21
+ import torch
22
+ import torch.utils.data
23
+ import torchvision
24
+ from pycocotools import mask as coco_mask
25
+
26
+ from datasets.data_util import preparing_dataset
27
+ import datasets.transforms as T
28
+ from util.box_ops import box_cxcywh_to_xyxy, box_iou
29
+
30
+ __all__ = ['build']
31
+
32
+
33
+ class label2compat():
34
+ def __init__(self) -> None:
35
+ self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
36
+ self.category_map = {int(k):v for k,v in self.category_map_str.items()}
37
+
38
+ def __call__(self, target, img=None):
39
+ labels = target['labels']
40
+ res = torch.zeros(labels.shape, dtype=labels.dtype)
41
+ for idx, item in enumerate(labels):
42
+ res[idx] = self.category_map[item.item()] - 1
43
+ target['label_compat'] = res
44
+ if img is not None:
45
+ return target, img
46
+ else:
47
+ return target
48
+
49
+
50
+ class label_compat2onehot():
51
+ def __init__(self, num_class=80, num_output_objs=1):
52
+ self.num_class = num_class
53
+ self.num_output_objs = num_output_objs
54
+ if num_output_objs != 1:
55
+ raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")
56
+
57
+ def __call__(self, target, img=None):
58
+ labels = target['label_compat']
59
+ place_dict = {k:0 for k in range(self.num_class)}
60
+ if self.num_output_objs == 1:
61
+ res = torch.zeros(self.num_class)
62
+ for i in labels:
63
+ itm = i.item()
64
+ res[itm] = 1.0
65
+ else:
66
+ # compat with baseline
67
+ res = torch.zeros(self.num_class, self.num_output_objs)
68
+ for i in labels:
69
+ itm = i.item()
70
+ res[itm][place_dict[itm]] = 1.0
71
+ place_dict[itm] += 1
72
+ target['label_compat_onehot'] = res
73
+ if img is not None:
74
+ return target, img
75
+ else:
76
+ return target
77
+
78
+
79
+ class box_label_catter():
80
+ def __init__(self):
81
+ pass
82
+
83
+ def __call__(self, target, img=None):
84
+ labels = target['label_compat']
85
+ boxes = target['boxes']
86
+ box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
87
+ target['box_label'] = box_label
88
+ if img is not None:
89
+ return target, img
90
+ else:
91
+ return target
92
+
93
+
94
+ class RandomSelectBoxlabels():
95
+ def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
96
+ prob_first_item = 0.0,
97
+ prob_random_item = 0.0,
98
+ prob_last_item = 0.8,
99
+ prob_stop_sign = 0.2
100
+ ) -> None:
101
+ self.num_classes = num_classes
102
+ self.leave_one_out = leave_one_out
103
+ self.blank_prob = blank_prob
104
+
105
+ self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)
106
+
107
+
108
+ def get_state(self):
109
+ return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]
110
+
111
+ def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
112
+ sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
113
+ assert sum_prob - 1 < 1e-6, \
114
+ f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
115
+ + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
116
+ + f"prob_stop_sign:{prob_stop_sign}"
117
+
118
+ self.prob_first_item = prob_first_item
119
+ self.prob_random_item = prob_random_item
120
+ self.prob_last_item = prob_last_item
121
+ self.prob_stop_sign = prob_stop_sign
122
+
123
+
124
+ def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
125
+ box_label_known = torch.Tensor(0,5)
126
+ box_label_unknown = box_label
127
+ return box_label_known, box_label_unknown
128
+
129
+ def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
130
+ n_select = int(random.random() * box_label.shape[0])
131
+ box_label = box_label[torch.randperm(box_label.shape[0])]
132
+ box_label_known = box_label[:n_select]
133
+ box_label_unknown = box_label[n_select:]
134
+ return box_label_known, box_label_unknown
135
+
136
+ def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
137
+ box_label_perm = box_label[torch.randperm(box_label.shape[0])]
138
+ known_label_list = []
139
+ box_label_known = []
140
+ box_label_unknown = []
141
+ for item in box_label_perm:
142
+ label_i = item[4].item()
143
+ if label_i in known_label_list:
144
+ box_label_known.append(item)
145
+ else:
146
+ # first item
147
+ box_label_unknown.append(item)
148
+ known_label_list.append(label_i)
149
+ box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
150
+ box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
151
+ return box_label_known, box_label_unknown
152
+
153
+ def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
154
+ box_label_unknown = torch.Tensor(0,5)
155
+ box_label_known = box_label
156
+ return box_label_known, box_label_unknown
157
+
158
+ def __call__(self, target, img=None):
159
+ box_label = target['box_label'] # K, 5
160
+
161
+ dice_number = random.random()
162
+
163
+ if dice_number < self.prob_first_item:
164
+ box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
165
+ elif dice_number < self.prob_first_item + self.prob_random_item:
166
+ box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
167
+ elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
168
+ box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
169
+ else:
170
+ box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)
171
+
172
+ target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
173
+ target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
174
+ target['box_label_known'] = box_label_known
175
+ target['box_label_unknown'] = box_label_unknown
176
+
177
+ return target, img
178
+
179
+
180
+ class RandomDrop():
181
+ def __init__(self, p=0.2) -> None:
182
+ self.p = p
183
+
184
+ def __call__(self, target, img=None):
185
+ known_box = target['box_label_known']
186
+ num_known_box = known_box.size(0)
187
+ idxs = torch.rand(num_known_box)
188
+ # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
189
+ target['box_label_known'] = known_box[idxs > self.p]
190
+ return target, img
191
+
192
+
193
+ class BboxPertuber():
194
+ def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
195
+ self.max_ratio = max_ratio
196
+ self.generate_samples = generate_samples
197
+ self.samples = self.generate_pertube_samples()
198
+ self.idx = 0
199
+
200
+ def generate_pertube_samples(self):
201
+ import torch
202
+ samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
203
+ return samples
204
+
205
+ def __call__(self, target, img):
206
+ known_box = target['box_label_known'] # Tensor(K,5), K known bbox
207
+ K = known_box.shape[0]
208
+ known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
209
+ if K == 0:
210
+ pass
211
+ else:
212
+ if self.idx + K > self.generate_samples:
213
+ self.idx = 0
214
+ delta = self.samples[self.idx: self.idx + K, :]
215
+ known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
216
+ iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
217
+ known_box_pertube[:, 4].copy_(iou)
218
+ known_box_pertube[:, -1].copy_(known_box[:, -1])
219
+
220
+ target['box_label_known_pertube'] = known_box_pertube
221
+ return target, img
222
+
223
+
224
+ class RandomCutout():
225
+ def __init__(self, factor=0.5) -> None:
226
+ self.factor = factor
227
+
228
+ def __call__(self, target, img=None):
229
+ unknown_box = target['box_label_unknown'] # Ku, 5
230
+ known_box = target['box_label_known_pertube'] # Kk, 6
231
+ Ku = unknown_box.size(0)
232
+
233
+ known_box_add = torch.zeros(Ku, 6) # Ku, 6
234
+ known_box_add[:, :5] = unknown_box
235
+ known_box_add[:, 5].uniform_(0.5, 1)
236
+
237
+
238
+ known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
239
+ known_box_add[:, 2:4] /= 2
240
+
241
+ target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
242
+ return target, img
243
+
244
+
245
+ class RandomSelectBoxes():
246
+ def __init__(self, num_class=80) -> None:
247
+ Warning("This is such a slow function and will be deprecated soon!!!")
248
+ self.num_class = num_class
249
+
250
+ def __call__(self, target, img=None):
251
+ boxes = target['boxes']
252
+ labels = target['label_compat']
253
+
254
+ # transform to list of tensors
255
+ boxs_list = [[] for i in range(self.num_class)]
256
+ for idx, item in enumerate(boxes):
257
+ label = labels[idx].item()
258
+ boxs_list[label].append(item)
259
+ boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]
260
+
261
+ # random selection
262
+ box_known = []
263
+ box_unknown = []
264
+ for idx, item in enumerate(boxs_list_tensor):
265
+ ncnt = item.shape[0]
266
+ nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint
267
+
268
+ item = item[torch.randperm(ncnt)]
269
+ # random.shuffle(item)
270
+ box_known.append(item[:nselect])
271
+ box_unknown.append(item[nselect:])
272
+
273
+ # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
274
+ # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
275
+ # print('box_unknown_tensor:', box_unknown_tensor)
276
+ target['known_box'] = box_known
277
+ target['unknown_box'] = box_unknown
278
+ return target, img
279
+
280
+
281
+ def label2onehot(label, num_classes):
282
+ """
283
+ label: Tensor(K)
284
+ """
285
+ res = torch.zeros(num_classes)
286
+ for i in label:
287
+ itm = int(i.item())
288
+ res[itm] = 1.0
289
+ return res
290
+
291
+
292
+ class MaskCrop():
293
+ def __init__(self) -> None:
294
+ pass
295
+
296
+ def __call__(self, target, img):
297
+ known_box = target['known_box']
298
+ h,w = img.shape[1:] # h,w
299
+ # imgsize = target['orig_size'] # h,w
300
+
301
+ scale = torch.Tensor([w, h, w, h])
302
+
303
+ # _cnt = 0
304
+ for boxes in known_box:
305
+ if boxes.shape[0] == 0:
306
+ continue
307
+ box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
308
+ for box in box_xyxy:
309
+ x1, y1, x2, y2 = [int(i) for i in box.tolist()]
310
+ img[:, y1:y2, x1:x2] = 0
311
+ # _cnt += 1
312
+ # print("_cnt:", _cnt)
313
+ return target, img
314
+
315
+
316
+ dataset_hook_register = {
317
+ 'label2compat': label2compat,
318
+ 'label_compat2onehot': label_compat2onehot,
319
+ 'box_label_catter': box_label_catter,
320
+ 'RandomSelectBoxlabels': RandomSelectBoxlabels,
321
+ 'RandomSelectBoxes': RandomSelectBoxes,
322
+ 'MaskCrop': MaskCrop,
323
+ 'BboxPertuber': BboxPertuber,
324
+ }
325
+
326
+
327
+ class CocoDetection(torchvision.datasets.CocoDetection):
328
+ def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
329
+ super(CocoDetection, self).__init__(img_folder, ann_file)
330
+ self._transforms = transforms
331
+ self.prepare = ConvertCocoPolysToMask(return_masks)
332
+ self.aux_target_hacks = aux_target_hacks
333
+
334
+ def change_hack_attr(self, hackclassname, attrkv_dict):
335
+ target_class = dataset_hook_register[hackclassname]
336
+ for item in self.aux_target_hacks:
337
+ if isinstance(item, target_class):
338
+ for k,v in attrkv_dict.items():
339
+ setattr(item, k, v)
340
+
341
+ def get_hack(self, hackclassname):
342
+ target_class = dataset_hook_register[hackclassname]
343
+ for item in self.aux_target_hacks:
344
+ if isinstance(item, target_class):
345
+ return item
346
+
347
+ def _load_image(self, id: int) -> Image.Image:
348
+ path = self.coco.loadImgs(id)[0]["file_name"]
349
+ abs_path = os.path.join(self.root, path)
350
+ return Image.open(abs_path).convert("RGB")
351
+
352
+ def __getitem__(self, idx):
353
+ """
354
+ Output:
355
+ - target: dict of multiple items
356
+ - boxes: Tensor[num_box, 4]. \
357
+ Init type: x0,y0,x1,y1. unnormalized data.
358
+ Final type: cx,cy,w,h. normalized data.
359
+ """
360
+ try:
361
+ img, target = super(CocoDetection, self).__getitem__(idx)
362
+ except:
363
+ print("Error idx: {}".format(idx))
364
+ idx += 1
365
+ img, target = super(CocoDetection, self).__getitem__(idx)
366
+ image_id = self.ids[idx]
367
+ target = {'image_id': image_id, 'annotations': target}
368
+ img, target = self.prepare(img, target)
369
+
370
+ if self._transforms is not None:
371
+ img, target = self._transforms(img, target)
372
+
373
+ # convert to needed format
374
+ if self.aux_target_hacks is not None:
375
+ for hack_runner in self.aux_target_hacks:
376
+ target, img = hack_runner(target, img=img)
377
+
378
+ return img, target
379
+
380
+
381
+ def convert_coco_poly_to_mask(segmentations, height, width):
382
+ masks = []
383
+ for polygons in segmentations:
384
+ rles = coco_mask.frPyObjects(polygons, height, width)
385
+ mask = coco_mask.decode(rles)
386
+ if len(mask.shape) < 3:
387
+ mask = mask[..., None]
388
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
389
+ mask = mask.any(dim=2)
390
+ masks.append(mask)
391
+ if masks:
392
+ masks = torch.stack(masks, dim=0)
393
+ else:
394
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
395
+ return masks
396
+
397
+
398
+ class ConvertCocoPolysToMask(object):
399
+ def __init__(self, return_masks=False):
400
+ self.return_masks = return_masks
401
+
402
+ def __call__(self, image, target):
403
+ w, h = image.size
404
+
405
+ image_id = target["image_id"]
406
+ image_id = torch.tensor([image_id])
407
+
408
+ anno = target["annotations"]
409
+
410
+ anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
411
+
412
+ boxes = [obj["bbox"] for obj in anno]
413
+ # guard against no boxes via resizing
414
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
415
+ boxes[:, 2:] += boxes[:, :2]
416
+ boxes[:, 0::2].clamp_(min=0, max=w)
417
+ boxes[:, 1::2].clamp_(min=0, max=h)
418
+
419
+ classes = [obj["category_id"] for obj in anno]
420
+ classes = torch.tensor(classes, dtype=torch.int64)
421
+
422
+ if self.return_masks:
423
+ segmentations = [obj["segmentation"] for obj in anno]
424
+ masks = convert_coco_poly_to_mask(segmentations, h, w)
425
+
426
+ keypoints = None
427
+ if anno and "keypoints" in anno[0]:
428
+ keypoints = [obj["keypoints"] for obj in anno]
429
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
430
+ num_keypoints = keypoints.shape[0]
431
+ if num_keypoints:
432
+ keypoints = keypoints.view(num_keypoints, -1, 3)
433
+
434
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
435
+ boxes = boxes[keep]
436
+ classes = classes[keep]
437
+ if self.return_masks:
438
+ masks = masks[keep]
439
+ if keypoints is not None:
440
+ keypoints = keypoints[keep]
441
+
442
+ target = {}
443
+ target["boxes"] = boxes
444
+ target["labels"] = classes
445
+ if self.return_masks:
446
+ target["masks"] = masks
447
+ target["image_id"] = image_id
448
+ if keypoints is not None:
449
+ target["keypoints"] = keypoints
450
+
451
+ # for conversion to coco api
452
+ area = torch.tensor([obj["area"] for obj in anno])
453
+ iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
454
+ target["area"] = area[keep]
455
+ target["iscrowd"] = iscrowd[keep]
456
+
457
+ target["orig_size"] = torch.as_tensor([int(h), int(w)])
458
+ target["size"] = torch.as_tensor([int(h), int(w)])
459
+
460
+ return image, target
461
+
462
+
463
+ def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
464
+
465
+ normalize = T.Compose([
466
+ T.ToTensor(),
467
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
468
+ ])
469
+
470
+ # config the params for data aug
471
+ scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
472
+ max_size = 1333
473
+ scales2_resize = [400, 500, 600]
474
+ scales2_crop = [384, 600]
475
+
476
+ # update args from config files
477
+ scales = getattr(args, 'data_aug_scales', scales)
478
+ max_size = getattr(args, 'data_aug_max_size', max_size)
479
+ scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
480
+ scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
481
+
482
+ # resize them
483
+ data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
484
+ if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
485
+ data_aug_scale_overlap = float(data_aug_scale_overlap)
486
+ scales = [int(i*data_aug_scale_overlap) for i in scales]
487
+ max_size = int(max_size*data_aug_scale_overlap)
488
+ scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
489
+ scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
490
+
491
+ datadict_for_print = {
492
+ 'scales': scales,
493
+ 'max_size': max_size,
494
+ 'scales2_resize': scales2_resize,
495
+ 'scales2_crop': scales2_crop
496
+ }
497
+ # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
498
+
499
+ if image_set == 'train':
500
+ if fix_size:
501
+ return T.Compose([
502
+ T.RandomHorizontalFlip(),
503
+ T.RandomResize([(max_size, max(scales))]),
504
+ # T.RandomResize([(512, 512)]),
505
+ normalize,
506
+ ])
507
+
508
+ if strong_aug:
509
+ import datasets.sltransform as SLT
510
+
511
+ return T.Compose([
512
+ T.RandomHorizontalFlip(),
513
+ T.RandomSelect(
514
+ T.RandomResize(scales, max_size=max_size),
515
+ T.Compose([
516
+ T.RandomResize(scales2_resize),
517
+ T.RandomSizeCrop(*scales2_crop),
518
+ T.RandomResize(scales, max_size=max_size),
519
+ ])
520
+ ),
521
+ SLT.RandomSelectMulti([
522
+ SLT.RandomCrop(),
523
+ SLT.LightingNoise(),
524
+ SLT.AdjustBrightness(2),
525
+ SLT.AdjustContrast(2),
526
+ ]),
527
+ normalize,
528
+ ])
529
+
530
+ return T.Compose([
531
+ T.RandomHorizontalFlip(),
532
+ T.RandomSelect(
533
+ T.RandomResize(scales, max_size=max_size),
534
+ T.Compose([
535
+ T.RandomResize(scales2_resize),
536
+ T.RandomSizeCrop(*scales2_crop),
537
+ T.RandomResize(scales, max_size=max_size),
538
+ ])
539
+ ),
540
+ normalize,
541
+ ])
542
+
543
+ if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
544
+
545
+ if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
546
+ print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
547
+ return T.Compose([
548
+ T.ResizeDebug((1280, 800)),
549
+ normalize,
550
+ ])
551
+
552
+ return T.Compose([
553
+ T.RandomResize([max(scales)], max_size=max_size),
554
+ normalize,
555
+ ])
556
+
557
+
558
+
559
+ raise ValueError(f'unknown {image_set}')
560
+
561
+
562
+ def get_aux_target_hacks_list(image_set, args):
563
+ if args.modelname in ['q2bs_mask', 'q2bs']:
564
+ aux_target_hacks_list = [
565
+ label2compat(),
566
+ label_compat2onehot(),
567
+ RandomSelectBoxes(num_class=args.num_classes)
568
+ ]
569
+ if args.masked_data and image_set == 'train':
570
+ # aux_target_hacks_list.append()
571
+ aux_target_hacks_list.append(MaskCrop())
572
+ elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
573
+ aux_target_hacks_list = [
574
+ label2compat(),
575
+ label_compat2onehot(),
576
+ box_label_catter(),
577
+ RandomSelectBoxlabels(num_classes=args.num_classes,
578
+ prob_first_item=args.prob_first_item,
579
+ prob_random_item=args.prob_random_item,
580
+ prob_last_item=args.prob_last_item,
581
+ prob_stop_sign=args.prob_stop_sign,
582
+ ),
583
+ BboxPertuber(max_ratio=0.02, generate_samples=1000),
584
+ ]
585
+ elif args.modelname in ['q2omask', 'q2osa']:
586
+ if args.coco_aug:
587
+ aux_target_hacks_list = [
588
+ label2compat(),
589
+ label_compat2onehot(),
590
+ box_label_catter(),
591
+ RandomSelectBoxlabels(num_classes=args.num_classes,
592
+ prob_first_item=args.prob_first_item,
593
+ prob_random_item=args.prob_random_item,
594
+ prob_last_item=args.prob_last_item,
595
+ prob_stop_sign=args.prob_stop_sign,
596
+ ),
597
+ RandomDrop(p=0.2),
598
+ BboxPertuber(max_ratio=0.02, generate_samples=1000),
599
+ RandomCutout(factor=0.5)
600
+ ]
601
+ else:
602
+ aux_target_hacks_list = [
603
+ label2compat(),
604
+ label_compat2onehot(),
605
+ box_label_catter(),
606
+ RandomSelectBoxlabels(num_classes=args.num_classes,
607
+ prob_first_item=args.prob_first_item,
608
+ prob_random_item=args.prob_random_item,
609
+ prob_last_item=args.prob_last_item,
610
+ prob_stop_sign=args.prob_stop_sign,
611
+ ),
612
+ BboxPertuber(max_ratio=0.02, generate_samples=1000),
613
+ ]
614
+ else:
615
+ aux_target_hacks_list = None
616
+
617
+ return aux_target_hacks_list
618
+
619
+
620
+ def build(image_set, args, datasetinfo):
621
+ img_folder = datasetinfo["root"]
622
+ ann_file = datasetinfo["anno"]
623
+
624
+ # copy to local path
625
+ if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
626
+ preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
627
+
628
+ try:
629
+ strong_aug = args.strong_aug
630
+ except:
631
+ strong_aug = False
632
+ print(img_folder, ann_file)
633
+ dataset = CocoDetection(img_folder, ann_file,
634
+ transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
635
+ return_masks=args.masks,
636
+ aux_target_hacks=None,
637
+ )
638
+ return dataset
639
+
640
+
641
+ if __name__ == "__main__":
642
+ # Objects365 Val example
643
+ dataset_o365 = CocoDetection(
644
+ '/path/Objects365/train/',
645
+ "/path/Objects365/slannos/anno_preprocess_train_v2.json",
646
+ transforms=None,
647
+ return_masks=False,
648
+ )
649
+ print('len(dataset_o365):', len(dataset_o365))
groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py ADDED
@@ -0,0 +1,44 @@
1
+ from __future__ import print_function
2
+
3
+ import torch
4
+ import torchvision.datasets as datasets
5
+ from torch.utils.data import Dataset
6
+ from PIL import Image
7
+ from .tsv_io import TSVFile
8
+ import numpy as np
9
+ import base64
10
+ import io
11
+
12
+
13
+ class TSVDataset(Dataset):
14
+ """ TSV dataset for ImageNet 1K training
15
+ """
16
+ def __init__(self, tsv_file, transform=None, target_transform=None):
17
+ self.tsv = TSVFile(tsv_file)
18
+ self.transform = transform
19
+ self.target_transform = target_transform
20
+
21
+ def __getitem__(self, index):
22
+ """
23
+ Args:
24
+ index (int): Index
25
+ Returns:
26
+ tuple: (image, target) where target is class_index of the target class.
27
+ """
28
+ row = self.tsv.seek(index)
29
+ image_data = base64.b64decode(row[-1])
30
+ image = Image.open(io.BytesIO(image_data))
31
+ image = image.convert('RGB')
32
+ target = int(row[1])
33
+
34
+ if self.transform is not None:
35
+ img = self.transform(image)
36
+ else:
37
+ img = image
38
+ if self.target_transform is not None:
39
+ target = self.target_transform(target)
40
+
41
+ return img, target
42
+
43
+ def __len__(self):
44
+ return self.tsv.num_rows()
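
A minimal usage sketch for the TSV dataset added above. The file name `imagenet_train.tsv` and the import path `groundingdino.datasets.dataset` are assumptions for illustration; the class only requires rows whose second column is the class index and whose last column is a base64-encoded image, which is what `__getitem__` decodes.

    import torchvision.transforms as T
    from torch.utils.data import DataLoader
    from groundingdino.datasets.dataset import TSVDataset  # assumed module path (mirrors this checkpoint)

    transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
    dataset = TSVDataset("imagenet_train.tsv", transform=transform)  # placeholder TSV file
    loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
    for images, labels in loader:
        pass  # standard classification training step goes here
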
groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py ADDED
@@ -0,0 +1,258 @@
1
+ from torchvision.datasets.vision import VisionDataset
2
+ import os.path
3
+ from typing import Callable, Optional
4
+ import json
5
+ from PIL import Image
6
+ import torch
7
+ import random
8
+ import os, sys
9
+ sys.path.append(os.path.dirname(sys.path[0]))
10
+
11
+ import datasets.transforms as T
12
+
13
+ class ODVGDataset(VisionDataset):
14
+ """
15
+ Args:
16
+ root (string): Root directory where images are downloaded to.
17
+ anno (string): Path to json annotation file.
18
+ label_map_anno (string): Path to json label mapping file. Only for Object Detection
19
+ transform (callable, optional): A function/transform that takes in an PIL image
20
+ and returns a transformed version. E.g., ``transforms.PILToTensor``
21
+ target_transform (callable, optional): A function/transform that takes in the
22
+ target and transforms it.
23
+ transforms (callable, optional): A function/transform that takes input sample and its target as entry
24
+ and returns a transformed version.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ root: str,
30
+ anno: str,
31
+ label_map_anno: str = None,
32
+ max_labels: int = 80,
33
+ transform: Optional[Callable] = None,
34
+ target_transform: Optional[Callable] = None,
35
+ transforms: Optional[Callable] = None,
36
+ ) -> None:
37
+ super().__init__(root, transforms, transform, target_transform)
38
+ self.root = root
39
+ self.dataset_mode = "OD" if label_map_anno else "VG"
40
+ self.max_labels = max_labels
41
+ if self.dataset_mode == "OD":
42
+ self.load_label_map(label_map_anno)
43
+ self._load_metas(anno)
44
+ self.get_dataset_info()
45
+
46
+ def load_label_map(self, label_map_anno):
47
+ with open(label_map_anno, 'r') as file:
48
+ self.label_map = json.load(file)
49
+ self.label_index = set(self.label_map.keys())
50
+
51
+ def _load_metas(self, anno):
52
+ with open(anno, 'r') as f:
53
+ self.metas = json.load(f)
54
+
55
+
56
+ def get_dataset_info(self):
57
+ print(f" == total images: {len(self)}")
58
+ if self.dataset_mode == "OD":
59
+ print(f" == total labels: {len(self.label_map)}")
60
+
61
+ def __getitem__(self, index: int):
62
+ meta = self.metas[index]
63
+ rel_path = meta["filename"]
64
+ abs_path = os.path.join(self.root, rel_path)
65
+ if not os.path.exists(abs_path):
66
+ raise FileNotFoundError(f"{abs_path} not found.")
67
+ image = Image.open(abs_path).convert('RGB')
68
+ w, h = image.size
69
+ if self.dataset_mode == "OD":
70
+ anno = meta["detection"]
71
+ instances = [obj for obj in anno["instances"]]
72
+ boxes = [obj["bbox"] for obj in instances]
73
+ # generate vg_labels
74
+ # pos bbox labels
75
+ ori_classes = [str(obj["label"]) for obj in instances]
76
+ pos_labels = set(ori_classes)
77
+ # neg bbox labels
78
+ neg_labels = self.label_index.difference(pos_labels)
79
+
80
+ vg_labels = list(pos_labels)
81
+ num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
82
+ if num_to_add > 0:
83
+ vg_labels.extend(random.sample(list(neg_labels), num_to_add))  # random.sample needs a sequence (neg_labels is a set)
84
+
85
+ # shuffle
86
+ for i in range(len(vg_labels)-1, 0, -1):
87
+ j = random.randint(0, i)
88
+ vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]
89
+
90
+ caption_list = [self.label_map[lb] for lb in vg_labels]
91
+ caption_dict = {item:index for index, item in enumerate(caption_list)}
92
+
93
+ caption = ' . '.join(caption_list) + ' .'
94
+ classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
95
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
96
+ classes = torch.tensor(classes, dtype=torch.int64)
97
+ elif self.dataset_mode == "VG":
98
+ anno = meta["Grounding"]
99
+ instances = [obj for obj in anno["regions"]]
100
+ boxes = [obj["bbox"] for obj in instances]
101
+ caption_list = [obj["phrase"] for obj in instances]
102
+ c = list(zip(boxes, caption_list))
103
+ random.shuffle(c)
104
+ boxes[:], caption_list[:] = zip(*c)
105
+ uni_caption_list = list(set(caption_list))
106
+ label_map = {}
107
+ for idx in range(len(uni_caption_list)):
108
+ label_map[uni_caption_list[idx]] = idx
109
+ classes = [label_map[cap] for cap in caption_list]
110
+ caption = ' . '.join(uni_caption_list) + ' .'
111
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
112
+ classes = torch.tensor(classes, dtype=torch.int64)
113
+ caption_list = uni_caption_list
114
+ # print("caption_list" , caption_list)
115
+ # print("caption" , caption)
116
+ # print("boxes" , boxes)
117
+ target = {}
118
+ target["image_id"] = os.path.splitext(rel_path)[0]  # str.strip(".jpg") would trim characters, not the suffix
119
+ target["size"] = torch.as_tensor([int(h), int(w)])
120
+ target["cap_list"] = caption_list
121
+ target["caption"] = caption
122
+ target["boxes"] = boxes
123
+ target["labels"] = classes
124
+ # print(" image_id " , target["image_id"])
125
+ # size, cap_list, caption, bboxes, labels
126
+
127
+ if self.transforms is not None:
128
+ image, target = self.transforms(image, target)
129
+
130
+ return image, target
131
+
132
+
133
+ def __len__(self) -> int:
134
+ return len(self.metas)
135
+
136
+
137
+ def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
138
+
139
+ normalize = T.Compose([
140
+ T.ToTensor(),
141
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
142
+ ])
143
+
144
+ # config the params for data aug
145
+ scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
146
+ max_size = 1333
147
+ scales2_resize = [400, 500, 600]
148
+ scales2_crop = [384, 600]
149
+
150
+ # update args from config files
151
+ scales = getattr(args, 'data_aug_scales', scales)
152
+ max_size = getattr(args, 'data_aug_max_size', max_size)
153
+ scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
154
+ scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
155
+
156
+ # resize them
157
+ data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
158
+ if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
159
+ data_aug_scale_overlap = float(data_aug_scale_overlap)
160
+ scales = [int(i*data_aug_scale_overlap) for i in scales]
161
+ max_size = int(max_size*data_aug_scale_overlap)
162
+ scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
163
+ scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
164
+
165
+ # datadict_for_print = {
166
+ # 'scales': scales,
167
+ # 'max_size': max_size,
168
+ # 'scales2_resize': scales2_resize,
169
+ # 'scales2_crop': scales2_crop
170
+ # }
171
+ # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
172
+
173
+ if image_set == 'train':
174
+ if fix_size:
175
+ return T.Compose([
176
+ T.RandomHorizontalFlip(),
177
+ T.RandomResize([(max_size, max(scales))]),
178
+ normalize,
179
+ ])
180
+
181
+ if strong_aug:
182
+ import datasets.sltransform as SLT
183
+
184
+ return T.Compose([
185
+ T.RandomHorizontalFlip(),
186
+ T.RandomSelect(
187
+ T.RandomResize(scales, max_size=max_size),
188
+ T.Compose([
189
+ T.RandomResize(scales2_resize),
190
+ T.RandomSizeCrop(*scales2_crop),
191
+ T.RandomResize(scales, max_size=max_size),
192
+ ])
193
+ ),
194
+ SLT.RandomSelectMulti([
195
+ SLT.RandomCrop(),
196
+ SLT.LightingNoise(),
197
+ SLT.AdjustBrightness(2),
198
+ SLT.AdjustContrast(2),
199
+ ]),
200
+ normalize,
201
+ ])
202
+
203
+ return T.Compose([
204
+ T.RandomHorizontalFlip(),
205
+ T.RandomSelect(
206
+ T.RandomResize(scales, max_size=max_size),
207
+ T.Compose([
208
+ T.RandomResize(scales2_resize),
209
+ T.RandomSizeCrop(*scales2_crop),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ ])
212
+ ),
213
+ normalize,
214
+ ])
215
+
216
+ if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
217
+
218
+ if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
219
+ print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
220
+ return T.Compose([
221
+ T.ResizeDebug((1280, 800)),
222
+ normalize,
223
+ ])
224
+
225
+ return T.Compose([
226
+ T.RandomResize([max(scales)], max_size=max_size),
227
+ normalize,
228
+ ])
229
+
230
+ raise ValueError(f'unknown {image_set}')
231
+
232
+ def build_odvg(image_set, args, datasetinfo):
233
+ img_folder = datasetinfo["root"]
234
+ ann_file = datasetinfo["anno"]
235
+ label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
236
+ try:
237
+ strong_aug = args.strong_aug
238
+ except:
239
+ strong_aug = False # False originally
240
+ print(img_folder, ann_file, label_map)
241
+ dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
242
+ transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
243
+ )
244
+ return dataset
245
+
246
+
247
+ if __name__=="__main__":
248
+ dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
249
+ print(len(dataset_vg))
250
+ data = dataset_vg[random.randint(0, 100)]
251
+ print(data)
252
+ dataset_od = ODVGDataset("pathl/V3Det/",
253
+ "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
254
+ "path/V3Det/annotations/v3det_label_map.json",
255
+ )
256
+ print(len(dataset_od))
257
+ data = dataset_od[random.randint(0, 100)]
258
+ print(data)
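
A rough sketch of building the ODVG dataset above outside the training script. The paths and the flat `args` object are placeholders; in practice `args` comes from the config files, and only `fix_size`, `strong_aug`, `max_labels` and the optional `data_aug_*` keys are read here.

    from types import SimpleNamespace
    from groundingdino.datasets.odvg import build_odvg  # assumed module path (mirrors this checkpoint)

    args = SimpleNamespace(fix_size=False, strong_aug=False, max_labels=80)
    datasetinfo = {
        "root": "/data/v3det/images/",                          # placeholder paths
        "anno": "/data/v3det/v3det_2023_v1_train_odvg.jsonl",
        "label_map": "/data/v3det/v3det_label_map.json",        # omit for VG-style grounding data
    }
    dataset = build_odvg("train", args, datasetinfo)
    image, target = dataset[0]  # target carries "caption", "cap_list", "boxes", "labels", "size"
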
groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py ADDED
@@ -0,0 +1,285 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Transforms and data augmentation for both image + bbox.
4
+ """
5
+ import random
6
+
7
+ import PIL
8
+ import torch
9
+ import torchvision.transforms as T
10
+ import torchvision.transforms.functional as F
11
+
12
+ from util.box_ops import box_xyxy_to_cxcywh
13
+ from util.misc import interpolate
14
+
15
+
16
+ def crop(image, target, region):
17
+ cropped_image = F.crop(image, *region)
18
+
19
+ target = target.copy()
20
+ i, j, h, w = region
21
+
22
+ # should we do something wrt the original size?
23
+ target["size"] = torch.tensor([h, w])
24
+
25
+ fields = ["labels", "area"]
26
+
27
+ if "boxes" in target:
28
+ boxes = target["boxes"]
29
+ max_size = torch.as_tensor([w, h], dtype=torch.float32)
30
+ cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
31
+ cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
32
+ cropped_boxes = cropped_boxes.clamp(min=0)
33
+ area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
34
+ target["boxes"] = cropped_boxes.reshape(-1, 4)
35
+ target["area"] = area
36
+ fields.append("boxes")
37
+
38
+ if "masks" in target:
39
+ # FIXME should we update the area here if there are no boxes?
40
+ target['masks'] = target['masks'][:, i:i + h, j:j + w]
41
+ fields.append("masks")
42
+
43
+
44
+ # remove elements for which the boxes or masks that have zero area
45
+ if "boxes" in target or "masks" in target:
46
+ # favor boxes selection when defining which elements to keep
47
+ # this is compatible with previous implementation
48
+ if "boxes" in target:
49
+ cropped_boxes = target['boxes'].reshape(-1, 2, 2)
50
+ keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
51
+ else:
52
+ keep = target['masks'].flatten(1).any(1)
53
+
54
+ for field in fields:
55
+ target[field] = target[field][keep]
56
+
57
+ return cropped_image, target
58
+
59
+
60
+ def hflip(image, target):
61
+ flipped_image = F.hflip(image)
62
+
63
+ w, h = image.size
64
+
65
+ target = target.copy()
66
+ if "boxes" in target:
67
+ boxes = target["boxes"]
68
+ boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
69
+ target["boxes"] = boxes
70
+
71
+ if "masks" in target:
72
+ target['masks'] = target['masks'].flip(-1)
73
+
74
+ return flipped_image, target
75
+
76
+
77
+ def resize(image, target, size, max_size=None):
78
+ # size can be min_size (scalar) or (w, h) tuple
79
+
80
+ def get_size_with_aspect_ratio(image_size, size, max_size=None):
81
+ w, h = image_size
82
+ if max_size is not None:
83
+ min_original_size = float(min((w, h)))
84
+ max_original_size = float(max((w, h)))
85
+ if max_original_size / min_original_size * size > max_size:
86
+ size = int(round(max_size * min_original_size / max_original_size))
87
+
88
+ if (w <= h and w == size) or (h <= w and h == size):
89
+ return (h, w)
90
+
91
+ if w < h:
92
+ ow = size
93
+ oh = int(size * h / w)
94
+ else:
95
+ oh = size
96
+ ow = int(size * w / h)
97
+
98
+ return (oh, ow)
99
+
100
+ def get_size(image_size, size, max_size=None):
101
+ if isinstance(size, (list, tuple)):
102
+ return size[::-1]
103
+ else:
104
+ return get_size_with_aspect_ratio(image_size, size, max_size)
105
+
106
+ size = get_size(image.size, size, max_size)
107
+ rescaled_image = F.resize(image, size)
108
+
109
+ if target is None:
110
+ return rescaled_image, None
111
+
112
+ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
113
+ ratio_width, ratio_height = ratios
114
+
115
+ target = target.copy()
116
+ if "boxes" in target:
117
+ boxes = target["boxes"]
118
+ scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
119
+ target["boxes"] = scaled_boxes
120
+
121
+ if "area" in target:
122
+ area = target["area"]
123
+ scaled_area = area * (ratio_width * ratio_height)
124
+ target["area"] = scaled_area
125
+
126
+ h, w = size
127
+ target["size"] = torch.tensor([h, w])
128
+
129
+ if "masks" in target:
130
+ target['masks'] = interpolate(
131
+ target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
132
+
133
+ return rescaled_image, target
134
+
135
+
136
+ def pad(image, target, padding):
137
+ # assumes that we only pad on the bottom right corners
138
+ padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
139
+ if target is None:
140
+ return padded_image, None
141
+ target = target.copy()
142
+ # should we do something wrt the original size?
143
+ target["size"] = torch.tensor(padded_image.size[::-1])
144
+ if "masks" in target:
145
+ target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
146
+ return padded_image, target
147
+
148
+
149
+ class ResizeDebug(object):
150
+ def __init__(self, size):
151
+ self.size = size
152
+
153
+ def __call__(self, img, target):
154
+ return resize(img, target, self.size)
155
+
156
+
157
+ class RandomCrop(object):
158
+ def __init__(self, size):
159
+ self.size = size
160
+
161
+ def __call__(self, img, target):
162
+ region = T.RandomCrop.get_params(img, self.size)
163
+ return crop(img, target, region)
164
+
165
+
166
+ class RandomSizeCrop(object):
167
+ def __init__(self, min_size: int, max_size: int):
168
+ self.min_size = min_size
169
+ self.max_size = max_size
170
+
171
+ def __call__(self, img: PIL.Image.Image, target: dict):
172
+ w = random.randint(self.min_size, min(img.width, self.max_size))
173
+ h = random.randint(self.min_size, min(img.height, self.max_size))
174
+ region = T.RandomCrop.get_params(img, [h, w])
175
+ return crop(img, target, region)
176
+
177
+
178
+ class CenterCrop(object):
179
+ def __init__(self, size):
180
+ self.size = size
181
+
182
+ def __call__(self, img, target):
183
+ image_width, image_height = img.size
184
+ crop_height, crop_width = self.size
185
+ crop_top = int(round((image_height - crop_height) / 2.))
186
+ crop_left = int(round((image_width - crop_width) / 2.))
187
+ return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
188
+
189
+
190
+ class RandomHorizontalFlip(object):
191
+ def __init__(self, p=0.5):
192
+ self.p = p
193
+
194
+ def __call__(self, img, target):
195
+ if random.random() < self.p:
196
+ return hflip(img, target)
197
+ return img, target
198
+
199
+
200
+ class RandomResize(object):
201
+ def __init__(self, sizes, max_size=None):
202
+ assert isinstance(sizes, (list, tuple))
203
+ self.sizes = sizes
204
+ self.max_size = max_size
205
+
206
+ def __call__(self, img, target=None):
207
+ size = random.choice(self.sizes)
208
+ return resize(img, target, size, self.max_size)
209
+
210
+
211
+ class RandomPad(object):
212
+ def __init__(self, max_pad):
213
+ self.max_pad = max_pad
214
+
215
+ def __call__(self, img, target):
216
+ pad_x = random.randint(0, self.max_pad)
217
+ pad_y = random.randint(0, self.max_pad)
218
+ return pad(img, target, (pad_x, pad_y))
219
+
220
+
221
+ class RandomSelect(object):
222
+ """
223
+ Randomly selects between transforms1 and transforms2,
224
+ with probability p for transforms1 and (1 - p) for transforms2
225
+ """
226
+ def __init__(self, transforms1, transforms2, p=0.5):
227
+ self.transforms1 = transforms1
228
+ self.transforms2 = transforms2
229
+ self.p = p
230
+
231
+ def __call__(self, img, target):
232
+ if random.random() < self.p:
233
+ return self.transforms1(img, target)
234
+ return self.transforms2(img, target)
235
+
236
+
237
+ class ToTensor(object):
238
+ def __call__(self, img, target):
239
+ return F.to_tensor(img), target
240
+
241
+
242
+ class RandomErasing(object):
243
+
244
+ def __init__(self, *args, **kwargs):
245
+ self.eraser = T.RandomErasing(*args, **kwargs)
246
+
247
+ def __call__(self, img, target):
248
+ return self.eraser(img), target
249
+
250
+
251
+ class Normalize(object):
252
+ def __init__(self, mean, std):
253
+ self.mean = mean
254
+ self.std = std
255
+
256
+ def __call__(self, image, target=None):
257
+ image = F.normalize(image, mean=self.mean, std=self.std)
258
+ if target is None:
259
+ return image, None
260
+ target = target.copy()
261
+ h, w = image.shape[-2:]
262
+ if "boxes" in target:
263
+ boxes = target["boxes"]
264
+ boxes = box_xyxy_to_cxcywh(boxes)
265
+ boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
266
+ target["boxes"] = boxes
267
+ return image, target
268
+
269
+
270
+ class Compose(object):
271
+ def __init__(self, transforms):
272
+ self.transforms = transforms
273
+
274
+ def __call__(self, image, target):
275
+ for t in self.transforms:
276
+ image, target = t(image, target)
277
+ return image, target
278
+
279
+ def __repr__(self):
280
+ format_string = self.__class__.__name__ + "("
281
+ for t in self.transforms:
282
+ format_string += "\n"
283
+ format_string += " {0}".format(t)
284
+ format_string += "\n)"
285
+ return format_string
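
These transforms operate on `(PIL image, target dict)` pairs rather than on bare images, so boxes, masks and areas stay consistent with the pixels. A small sketch of chaining them by hand (the image path and box values are made up; after `Normalize` the boxes come out as normalized `cx, cy, w, h`):

    import torch
    from PIL import Image
    import groundingdino.datasets.transforms as T  # assumed module path (mirrors this checkpoint)

    pipeline = T.Compose([
        T.RandomHorizontalFlip(p=0.5),
        T.RandomResize([800], max_size=1333),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    img = Image.open("example.jpg").convert("RGB")               # placeholder image
    target = {
        "boxes": torch.tensor([[10.0, 20.0, 200.0, 180.0]]),     # x0, y0, x1, y1 in pixels
        "labels": torch.tensor([3]),
        "area": torch.tensor([190.0 * 160.0]),
    }
    img_t, target_t = pipeline(img, target)
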
groundingdino/datasets/__init__.py CHANGED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import torch.utils.data
3
+ import torchvision
4
+ from .coco import build as build_coco
5
+
6
+
7
+ def get_coco_api_from_dataset(dataset):
8
+ for _ in range(10):
9
+ # if isinstance(dataset, torchvision.datasets.CocoDetection):
10
+ # break
11
+ if isinstance(dataset, torch.utils.data.Subset):
12
+ dataset = dataset.dataset
13
+ if isinstance(dataset, torchvision.datasets.CocoDetection):
14
+ return dataset.coco
15
+
16
+
17
+ def build_dataset(image_set, args, datasetinfo):
18
+ if datasetinfo["dataset_mode"] == 'coco':
19
+ return build_coco(image_set, args, datasetinfo)
20
+ if datasetinfo["dataset_mode"] == 'odvg':
21
+ from .odvg import build_odvg
22
+ return build_odvg(image_set, args, datasetinfo)
23
+ raise ValueError(f'dataset {args.dataset_file} not supported')
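
A hedged sketch of calling `build_dataset` above; the paths and the minimal `args` namespace are placeholders (the real training script passes its full config object), and `get_coco_api_from_dataset` is what evaluation code uses to recover the pycocotools handle.

    from types import SimpleNamespace
    from groundingdino.datasets import build_dataset, get_coco_api_from_dataset  # assumed import path

    args = SimpleNamespace(fix_size=False, strong_aug=False, masks=False, max_labels=80)
    datasetinfo = {
        "dataset_mode": "coco",                                  # or "odvg"
        "root": "/data/coco/val2017",                            # placeholder paths
        "anno": "/data/coco/annotations/instances_val2017.json",
    }
    dataset_val = build_dataset("val", args, datasetinfo)
    coco_api = get_coco_api_from_dataset(dataset_val)            # pycocotools COCO object for evaluation
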
groundingdino/datasets/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc and b/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc differ
 
groundingdino/datasets/__pycache__/coco.cpython-310.pyc ADDED
Binary file (20.2 kB).
 
groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc ADDED
Binary file (7.42 kB).
 
groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc ADDED
Binary file (7.44 kB).
 
groundingdino/datasets/__pycache__/data_util.cpython-310.pyc ADDED
Binary file (4.55 kB).
 
groundingdino/datasets/__pycache__/odvg.cpython-310.pyc ADDED
Binary file (8.21 kB).
 
groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc ADDED
Binary file (1.87 kB).
 
groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc ADDED
Binary file (3.69 kB).
 
groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc ADDED
Binary file (7.68 kB).
 
groundingdino/datasets/__pycache__/transforms.cpython-310.pyc CHANGED
Binary files a/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc and b/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc differ
 
groundingdino/datasets/coco.py ADDED
@@ -0,0 +1,649 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ COCO dataset which returns image_id for evaluation.
4
+
5
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
6
+ """
7
+ if __name__=="__main__":
8
+ # for debug only
9
+ import os, sys
10
+ sys.path.append(os.path.dirname(sys.path[0]))
11
+ from torchvision.datasets.vision import VisionDataset
12
+
13
+ import json
14
+ from pathlib import Path
15
+ import random
16
+ import os
17
+ from typing import Any, Callable, List, Optional, Tuple
18
+
19
+ from PIL import Image
20
+
21
+ import torch
22
+ import torch.utils.data
23
+ import torchvision
24
+ from pycocotools import mask as coco_mask
25
+
26
+ from datasets.data_util import preparing_dataset
27
+ import datasets.transforms as T
28
+ from util.box_ops import box_cxcywh_to_xyxy, box_iou
29
+
30
+ __all__ = ['build']
31
+
32
+
33
+ class label2compat():
34
+ def __init__(self) -> None:
35
+ self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
36
+ self.category_map = {int(k):v for k,v in self.category_map_str.items()}
37
+
38
+ def __call__(self, target, img=None):
39
+ labels = target['labels']
40
+ res = torch.zeros(labels.shape, dtype=labels.dtype)
41
+ for idx, item in enumerate(labels):
42
+ res[idx] = self.category_map[item.item()] - 1
43
+ target['label_compat'] = res
44
+ if img is not None:
45
+ return target, img
46
+ else:
47
+ return target
48
+
49
+
50
+ class label_compat2onehot():
51
+ def __init__(self, num_class=80, num_output_objs=1):
52
+ self.num_class = num_class
53
+ self.num_output_objs = num_output_objs
54
+ if num_output_objs != 1:
55
+ raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")
56
+
57
+ def __call__(self, target, img=None):
58
+ labels = target['label_compat']
59
+ place_dict = {k:0 for k in range(self.num_class)}
60
+ if self.num_output_objs == 1:
61
+ res = torch.zeros(self.num_class)
62
+ for i in labels:
63
+ itm = i.item()
64
+ res[itm] = 1.0
65
+ else:
66
+ # compat with baseline
67
+ res = torch.zeros(self.num_class, self.num_output_objs)
68
+ for i in labels:
69
+ itm = i.item()
70
+ res[itm][place_dict[itm]] = 1.0
71
+ place_dict[itm] += 1
72
+ target['label_compat_onehot'] = res
73
+ if img is not None:
74
+ return target, img
75
+ else:
76
+ return target
77
+
78
+
79
+ class box_label_catter():
80
+ def __init__(self):
81
+ pass
82
+
83
+ def __call__(self, target, img=None):
84
+ labels = target['label_compat']
85
+ boxes = target['boxes']
86
+ box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
87
+ target['box_label'] = box_label
88
+ if img is not None:
89
+ return target, img
90
+ else:
91
+ return target
92
+
93
+
94
+ class RandomSelectBoxlabels():
95
+ def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
96
+ prob_first_item = 0.0,
97
+ prob_random_item = 0.0,
98
+ prob_last_item = 0.8,
99
+ prob_stop_sign = 0.2
100
+ ) -> None:
101
+ self.num_classes = num_classes
102
+ self.leave_one_out = leave_one_out
103
+ self.blank_prob = blank_prob
104
+
105
+ self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)
106
+
107
+
108
+ def get_state(self):
109
+ return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]
110
+
111
+ def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
112
+ sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
113
+ assert abs(sum_prob - 1) < 1e-6, \
114
+ f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
115
+ + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
116
+ + f"prob_stop_sign:{prob_stop_sign}"
117
+
118
+ self.prob_first_item = prob_first_item
119
+ self.prob_random_item = prob_random_item
120
+ self.prob_last_item = prob_last_item
121
+ self.prob_stop_sign = prob_stop_sign
122
+
123
+
124
+ def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
125
+ box_label_known = torch.Tensor(0,5)
126
+ box_label_unknown = box_label
127
+ return box_label_known, box_label_unknown
128
+
129
+ def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
130
+ n_select = int(random.random() * box_label.shape[0])
131
+ box_label = box_label[torch.randperm(box_label.shape[0])]
132
+ box_label_known = box_label[:n_select]
133
+ box_label_unknown = box_label[n_select:]
134
+ return box_label_known, box_label_unknown
135
+
136
+ def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
137
+ box_label_perm = box_label[torch.randperm(box_label.shape[0])]
138
+ known_label_list = []
139
+ box_label_known = []
140
+ box_label_unknown = []
141
+ for item in box_label_perm:
142
+ label_i = item[4].item()
143
+ if label_i in known_label_list:
144
+ box_label_known.append(item)
145
+ else:
146
+ # first item
147
+ box_label_unknown.append(item)
148
+ known_label_list.append(label_i)
149
+ box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
150
+ box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
151
+ return box_label_known, box_label_unknown
152
+
153
+ def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
154
+ box_label_unknown = torch.Tensor(0,5)
155
+ box_label_known = box_label
156
+ return box_label_known, box_label_unknown
157
+
158
+ def __call__(self, target, img=None):
159
+ box_label = target['box_label'] # K, 5
160
+
161
+ dice_number = random.random()
162
+
163
+ if dice_number < self.prob_first_item:
164
+ box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
165
+ elif dice_number < self.prob_first_item + self.prob_random_item:
166
+ box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
167
+ elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
168
+ box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
169
+ else:
170
+ box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)
171
+
172
+ target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
173
+ target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
174
+ target['box_label_known'] = box_label_known
175
+ target['box_label_unknown'] = box_label_unknown
176
+
177
+ return target, img
178
+
179
+
180
+ class RandomDrop():
181
+ def __init__(self, p=0.2) -> None:
182
+ self.p = p
183
+
184
+ def __call__(self, target, img=None):
185
+ known_box = target['box_label_known']
186
+ num_known_box = known_box.size(0)
187
+ idxs = torch.rand(num_known_box)
188
+ # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
189
+ target['box_label_known'] = known_box[idxs > self.p]
190
+ return target, img
191
+
192
+
193
+ class BboxPertuber():
194
+ def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
195
+ self.max_ratio = max_ratio
196
+ self.generate_samples = generate_samples
197
+ self.samples = self.generate_pertube_samples()
198
+ self.idx = 0
199
+
200
+ def generate_pertube_samples(self):
201
+ import torch
202
+ samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
203
+ return samples
204
+
205
+ def __call__(self, target, img):
206
+ known_box = target['box_label_known'] # Tensor(K,5), K known bbox
207
+ K = known_box.shape[0]
208
+ known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
209
+ if K == 0:
210
+ pass
211
+ else:
212
+ if self.idx + K > self.generate_samples:
213
+ self.idx = 0
214
+ delta = self.samples[self.idx: self.idx + K, :]
215
+ known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
216
+ iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
217
+ known_box_pertube[:, 4].copy_(iou)
218
+ known_box_pertube[:, -1].copy_(known_box[:, -1])
219
+
220
+ target['box_label_known_pertube'] = known_box_pertube
221
+ return target, img
222
+
223
+
224
+ class RandomCutout():
225
+ def __init__(self, factor=0.5) -> None:
226
+ self.factor = factor
227
+
228
+ def __call__(self, target, img=None):
229
+ unknown_box = target['box_label_unknown'] # Ku, 5
230
+ known_box = target['box_label_known_pertube'] # Kk, 6
231
+ Ku = unknown_box.size(0)
232
+
233
+ known_box_add = torch.zeros(Ku, 6) # Ku, 6
234
+ known_box_add[:, :5] = unknown_box
235
+ known_box_add[:, 5].uniform_(0.5, 1)
236
+
237
+
238
+ known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
239
+ known_box_add[:, 2:4] /= 2
240
+
241
+ target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
242
+ return target, img
243
+
244
+
245
+ class RandomSelectBoxes():
246
+ def __init__(self, num_class=80) -> None:
247
+ Warning("This is such a slow function and will be deprecated soon!!!")
248
+ self.num_class = num_class
249
+
250
+ def __call__(self, target, img=None):
251
+ boxes = target['boxes']
252
+ labels = target['label_compat']
253
+
254
+ # transform to list of tensors
255
+ boxs_list = [[] for i in range(self.num_class)]
256
+ for idx, item in enumerate(boxes):
257
+ label = labels[idx].item()
258
+ boxs_list[label].append(item)
259
+ boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]
260
+
261
+ # random selection
262
+ box_known = []
263
+ box_unknown = []
264
+ for idx, item in enumerate(boxs_list_tensor):
265
+ ncnt = item.shape[0]
266
+ nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint
267
+
268
+ item = item[torch.randperm(ncnt)]
269
+ # random.shuffle(item)
270
+ box_known.append(item[:nselect])
271
+ box_unknown.append(item[nselect:])
272
+
273
+ # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
274
+ # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
275
+ # print('box_unknown_tensor:', box_unknown_tensor)
276
+ target['known_box'] = box_known
277
+ target['unknown_box'] = box_unknown
278
+ return target, img
279
+
280
+
281
+ def label2onehot(label, num_classes):
282
+ """
283
+ label: Tensor(K)
284
+ """
285
+ res = torch.zeros(num_classes)
286
+ for i in label:
287
+ itm = int(i.item())
288
+ res[itm] = 1.0
289
+ return res
290
+
291
+
292
+ class MaskCrop():
293
+ def __init__(self) -> None:
294
+ pass
295
+
296
+ def __call__(self, target, img):
297
+ known_box = target['known_box']
298
+ h,w = img.shape[1:] # h,w
299
+ # imgsize = target['orig_size'] # h,w
300
+
301
+ scale = torch.Tensor([w, h, w, h])
302
+
303
+ # _cnt = 0
304
+ for boxes in known_box:
305
+ if boxes.shape[0] == 0:
306
+ continue
307
+ box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
308
+ for box in box_xyxy:
309
+ x1, y1, x2, y2 = [int(i) for i in box.tolist()]
310
+ img[:, y1:y2, x1:x2] = 0
311
+ # _cnt += 1
312
+ # print("_cnt:", _cnt)
313
+ return target, img
314
+
315
+
316
+ dataset_hook_register = {
317
+ 'label2compat': label2compat,
318
+ 'label_compat2onehot': label_compat2onehot,
319
+ 'box_label_catter': box_label_catter,
320
+ 'RandomSelectBoxlabels': RandomSelectBoxlabels,
321
+ 'RandomSelectBoxes': RandomSelectBoxes,
322
+ 'MaskCrop': MaskCrop,
323
+ 'BboxPertuber': BboxPertuber,
324
+ }
325
+
326
+
327
+ class CocoDetection(torchvision.datasets.CocoDetection):
328
+ def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
329
+ super(CocoDetection, self).__init__(img_folder, ann_file)
330
+ self._transforms = transforms
331
+ self.prepare = ConvertCocoPolysToMask(return_masks)
332
+ self.aux_target_hacks = aux_target_hacks
333
+
334
+ def change_hack_attr(self, hackclassname, attrkv_dict):
335
+ target_class = dataset_hook_register[hackclassname]
336
+ for item in self.aux_target_hacks:
337
+ if isinstance(item, target_class):
338
+ for k,v in attrkv_dict.items():
339
+ setattr(item, k, v)
340
+
341
+ def get_hack(self, hackclassname):
342
+ target_class = dataset_hook_register[hackclassname]
343
+ for item in self.aux_target_hacks:
344
+ if isinstance(item, target_class):
345
+ return item
346
+
347
+ def _load_image(self, id: int) -> Image.Image:
348
+ path = self.coco.loadImgs(id)[0]["file_name"]
349
+ abs_path = os.path.join(self.root, path)
350
+ return Image.open(abs_path).convert("RGB")
351
+
352
+ def __getitem__(self, idx):
353
+ """
354
+ Output:
355
+ - target: dict of multiple items
356
+ - boxes: Tensor[num_box, 4]. \
357
+ Init type: x0,y0,x1,y1. unnormalized data.
358
+ Final type: cx,cy,w,h. normalized data.
359
+ """
360
+ try:
361
+ img, target = super(CocoDetection, self).__getitem__(idx)
362
+ except:
363
+ print("Error idx: {}".format(idx))
364
+ idx += 1
365
+ img, target = super(CocoDetection, self).__getitem__(idx)
366
+ image_id = self.ids[idx]
367
+ target = {'image_id': image_id, 'annotations': target}
368
+ img, target = self.prepare(img, target)
369
+
370
+ if self._transforms is not None:
371
+ img, target = self._transforms(img, target)
372
+
373
+ # convert to needed format
374
+ if self.aux_target_hacks is not None:
375
+ for hack_runner in self.aux_target_hacks:
376
+ target, img = hack_runner(target, img=img)
377
+
378
+ return img, target
379
+
380
+
381
+ def convert_coco_poly_to_mask(segmentations, height, width):
382
+ masks = []
383
+ for polygons in segmentations:
384
+ rles = coco_mask.frPyObjects(polygons, height, width)
385
+ mask = coco_mask.decode(rles)
386
+ if len(mask.shape) < 3:
387
+ mask = mask[..., None]
388
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
389
+ mask = mask.any(dim=2)
390
+ masks.append(mask)
391
+ if masks:
392
+ masks = torch.stack(masks, dim=0)
393
+ else:
394
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
395
+ return masks
396
+
397
+
398
+ class ConvertCocoPolysToMask(object):
399
+ def __init__(self, return_masks=False):
400
+ self.return_masks = return_masks
401
+
402
+ def __call__(self, image, target):
403
+ w, h = image.size
404
+
405
+ image_id = target["image_id"]
406
+ image_id = torch.tensor([image_id])
407
+
408
+ anno = target["annotations"]
409
+
410
+ anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
411
+
412
+ boxes = [obj["bbox"] for obj in anno]
413
+ # guard against no boxes via resizing
414
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
415
+ boxes[:, 2:] += boxes[:, :2]
416
+ boxes[:, 0::2].clamp_(min=0, max=w)
417
+ boxes[:, 1::2].clamp_(min=0, max=h)
418
+
419
+ classes = [obj["category_id"] for obj in anno]
420
+ classes = torch.tensor(classes, dtype=torch.int64)
421
+
422
+ if self.return_masks:
423
+ segmentations = [obj["segmentation"] for obj in anno]
424
+ masks = convert_coco_poly_to_mask(segmentations, h, w)
425
+
426
+ keypoints = None
427
+ if anno and "keypoints" in anno[0]:
428
+ keypoints = [obj["keypoints"] for obj in anno]
429
+ keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
430
+ num_keypoints = keypoints.shape[0]
431
+ if num_keypoints:
432
+ keypoints = keypoints.view(num_keypoints, -1, 3)
433
+
434
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
435
+ boxes = boxes[keep]
436
+ classes = classes[keep]
437
+ if self.return_masks:
438
+ masks = masks[keep]
439
+ if keypoints is not None:
440
+ keypoints = keypoints[keep]
441
+
442
+ target = {}
443
+ target["boxes"] = boxes
444
+ target["labels"] = classes
445
+ if self.return_masks:
446
+ target["masks"] = masks
447
+ target["image_id"] = image_id
448
+ if keypoints is not None:
449
+ target["keypoints"] = keypoints
450
+
451
+ # for conversion to coco api
452
+ area = torch.tensor([obj["area"] for obj in anno])
453
+ iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
454
+ target["area"] = area[keep]
455
+ target["iscrowd"] = iscrowd[keep]
456
+
457
+ target["orig_size"] = torch.as_tensor([int(h), int(w)])
458
+ target["size"] = torch.as_tensor([int(h), int(w)])
459
+
460
+ return image, target
461
+
462
+
463
+ def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
464
+
465
+ normalize = T.Compose([
466
+ T.ToTensor(),
467
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
468
+ ])
469
+
470
+ # config the params for data aug
471
+ scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
472
+ max_size = 1333
473
+ scales2_resize = [400, 500, 600]
474
+ scales2_crop = [384, 600]
475
+
476
+ # update args from config files
477
+ scales = getattr(args, 'data_aug_scales', scales)
478
+ max_size = getattr(args, 'data_aug_max_size', max_size)
479
+ scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
480
+ scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
481
+
482
+ # resize them
483
+ data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
484
+ if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
485
+ data_aug_scale_overlap = float(data_aug_scale_overlap)
486
+ scales = [int(i*data_aug_scale_overlap) for i in scales]
487
+ max_size = int(max_size*data_aug_scale_overlap)
488
+ scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
489
+ scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
490
+
491
+ datadict_for_print = {
492
+ 'scales': scales,
493
+ 'max_size': max_size,
494
+ 'scales2_resize': scales2_resize,
495
+ 'scales2_crop': scales2_crop
496
+ }
497
+ # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
498
+
499
+ if image_set == 'train':
500
+ if fix_size:
501
+ return T.Compose([
502
+ T.RandomHorizontalFlip(),
503
+ T.RandomResize([(max_size, max(scales))]),
504
+ # T.RandomResize([(512, 512)]),
505
+ normalize,
506
+ ])
507
+
508
+ if strong_aug:
509
+ import datasets.sltransform as SLT
510
+
511
+ return T.Compose([
512
+ T.RandomHorizontalFlip(),
513
+ T.RandomSelect(
514
+ T.RandomResize(scales, max_size=max_size),
515
+ T.Compose([
516
+ T.RandomResize(scales2_resize),
517
+ T.RandomSizeCrop(*scales2_crop),
518
+ T.RandomResize(scales, max_size=max_size),
519
+ ])
520
+ ),
521
+ SLT.RandomSelectMulti([
522
+ SLT.RandomCrop(),
523
+ SLT.LightingNoise(),
524
+ SLT.AdjustBrightness(2),
525
+ SLT.AdjustContrast(2),
526
+ ]),
527
+ normalize,
528
+ ])
529
+
530
+ return T.Compose([
531
+ T.RandomHorizontalFlip(),
532
+ T.RandomSelect(
533
+ T.RandomResize(scales, max_size=max_size),
534
+ T.Compose([
535
+ T.RandomResize(scales2_resize),
536
+ T.RandomSizeCrop(*scales2_crop),
537
+ T.RandomResize(scales, max_size=max_size),
538
+ ])
539
+ ),
540
+ normalize,
541
+ ])
542
+
543
+ if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
544
+
545
+ if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
546
+ print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
547
+ return T.Compose([
548
+ T.ResizeDebug((1280, 800)),
549
+ normalize,
550
+ ])
551
+
552
+ return T.Compose([
553
+ T.RandomResize([max(scales)], max_size=max_size),
554
+ normalize,
555
+ ])
556
+
557
+
558
+
559
+ raise ValueError(f'unknown {image_set}')
560
+
561
+
562
+ def get_aux_target_hacks_list(image_set, args):
563
+ if args.modelname in ['q2bs_mask', 'q2bs']:
564
+ aux_target_hacks_list = [
565
+ label2compat(),
566
+ label_compat2onehot(),
567
+ RandomSelectBoxes(num_class=args.num_classes)
568
+ ]
569
+ if args.masked_data and image_set == 'train':
570
+ # aux_target_hacks_list.append()
571
+ aux_target_hacks_list.append(MaskCrop())
572
+ elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
573
+ aux_target_hacks_list = [
574
+ label2compat(),
575
+ label_compat2onehot(),
576
+ box_label_catter(),
577
+ RandomSelectBoxlabels(num_classes=args.num_classes,
578
+ prob_first_item=args.prob_first_item,
579
+ prob_random_item=args.prob_random_item,
580
+ prob_last_item=args.prob_last_item,
581
+ prob_stop_sign=args.prob_stop_sign,
582
+ ),
583
+ BboxPertuber(max_ratio=0.02, generate_samples=1000),
584
+ ]
585
+ elif args.modelname in ['q2omask', 'q2osa']:
586
+ if args.coco_aug:
587
+ aux_target_hacks_list = [
588
+ label2compat(),
589
+ label_compat2onehot(),
590
+ box_label_catter(),
591
+ RandomSelectBoxlabels(num_classes=args.num_classes,
592
+ prob_first_item=args.prob_first_item,
593
+ prob_random_item=args.prob_random_item,
594
+ prob_last_item=args.prob_last_item,
595
+ prob_stop_sign=args.prob_stop_sign,
596
+ ),
597
+ RandomDrop(p=0.2),
598
+ BboxPertuber(max_ratio=0.02, generate_samples=1000),
599
+ RandomCutout(factor=0.5)
600
+ ]
601
+ else:
602
+ aux_target_hacks_list = [
603
+ label2compat(),
604
+ label_compat2onehot(),
605
+ box_label_catter(),
606
+ RandomSelectBoxlabels(num_classes=args.num_classes,
607
+ prob_first_item=args.prob_first_item,
608
+ prob_random_item=args.prob_random_item,
609
+ prob_last_item=args.prob_last_item,
610
+ prob_stop_sign=args.prob_stop_sign,
611
+ ),
612
+ BboxPertuber(max_ratio=0.02, generate_samples=1000),
613
+ ]
614
+ else:
615
+ aux_target_hacks_list = None
616
+
617
+ return aux_target_hacks_list
618
+
619
+
620
+ def build(image_set, args, datasetinfo):
621
+ img_folder = datasetinfo["root"]
622
+ ann_file = datasetinfo["anno"]
623
+
624
+ # copy to local path
625
+ if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
626
+ preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
627
+
628
+ try:
629
+ strong_aug = args.strong_aug
630
+ except:
631
+ strong_aug = False
632
+ print(img_folder, ann_file)
633
+ dataset = CocoDetection(img_folder, ann_file,
634
+ transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
635
+ return_masks=args.masks,
636
+ aux_target_hacks=None,
637
+ )
638
+ return dataset
639
+
640
+
641
+ if __name__ == "__main__":
642
+ # Objects365 Val example
643
+ dataset_o365 = CocoDetection(
644
+ '/path/Objects365/train/',
645
+ "/path/Objects365/slannos/anno_preprocess_train_v2.json",
646
+ transforms=None,
647
+ return_masks=False,
648
+ )
649
+ print('len(dataset_o365):', len(dataset_o365))
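
For reference, a sketch of instantiating `CocoDetection` above directly with the eval-time transform and no aux target hacks; the paths are placeholders and the import path is an assumption (the module itself imports `datasets.transforms`, so it expects the repository root on `sys.path`).

    from groundingdino.datasets.coco import CocoDetection, make_coco_transforms  # assumed import path

    dataset = CocoDetection(
        "/data/coco/val2017",                                    # placeholder image folder
        "/data/coco/annotations/instances_val2017.json",         # placeholder annotation file
        transforms=make_coco_transforms("val"),                  # shorter side to 800, long side capped at 1333, then normalize
        return_masks=False,
        aux_target_hacks=None,
    )
    img, target = dataset[0]
    print(img.shape, target["boxes"].shape, target["labels"])    # boxes are normalized cx, cy, w, h
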
groundingdino/datasets/coco_eval.py ADDED
@@ -0,0 +1,266 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ COCO evaluator that works in distributed mode.
4
+
5
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
6
+ The difference is that there is less copy-pasting from pycocotools
7
+ in the end of the file, as python3 can suppress prints with contextlib
8
+ """
9
+ import os
10
+ import contextlib
11
+ import copy
12
+ import numpy as np
13
+ import torch
14
+
15
+ from pycocotools.cocoeval import COCOeval
16
+ from pycocotools.coco import COCO
17
+ import pycocotools.mask as mask_util
18
+
19
+ from util.misc import all_gather
20
+
21
+
22
+ class CocoEvaluator(object):
23
+ def __init__(self, coco_gt, iou_types, useCats=True):
24
+ assert isinstance(iou_types, (list, tuple))
25
+ coco_gt = copy.deepcopy(coco_gt)
26
+ self.coco_gt = coco_gt
27
+
28
+ self.iou_types = iou_types
29
+ self.coco_eval = {}
30
+ for iou_type in iou_types:
31
+ self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
32
+ self.coco_eval[iou_type].useCats = useCats
33
+
34
+ self.img_ids = []
35
+ self.eval_imgs = {k: [] for k in iou_types}
36
+ self.useCats = useCats
37
+
38
+ def update(self, predictions):
39
+ img_ids = list(np.unique(list(predictions.keys())))
40
+ self.img_ids.extend(img_ids)
41
+
42
+ for iou_type in self.iou_types:
43
+ results = self.prepare(predictions, iou_type)
44
+
45
+ # suppress pycocotools prints
46
+ with open(os.devnull, 'w') as devnull:
47
+ with contextlib.redirect_stdout(devnull):
48
+ coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
49
+ coco_eval = self.coco_eval[iou_type]
50
+
51
+ coco_eval.cocoDt = coco_dt
52
+ coco_eval.params.imgIds = list(img_ids)
53
+ coco_eval.params.useCats = self.useCats
54
+ img_ids, eval_imgs = evaluate(coco_eval)
55
+
56
+ self.eval_imgs[iou_type].append(eval_imgs)
57
+
58
+ def synchronize_between_processes(self):
59
+ for iou_type in self.iou_types:
60
+ self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
61
+ create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
62
+
63
+ def accumulate(self):
64
+ for coco_eval in self.coco_eval.values():
65
+ coco_eval.accumulate()
66
+
67
+ def summarize(self):
68
+ for iou_type, coco_eval in self.coco_eval.items():
69
+ print("IoU metric: {}".format(iou_type))
70
+ coco_eval.summarize()
71
+
72
+ def prepare(self, predictions, iou_type):
73
+ if iou_type == "bbox":
74
+ return self.prepare_for_coco_detection(predictions)
75
+ elif iou_type == "segm":
76
+ return self.prepare_for_coco_segmentation(predictions)
77
+ elif iou_type == "keypoints":
78
+ return self.prepare_for_coco_keypoint(predictions)
79
+ else:
80
+ raise ValueError("Unknown iou type {}".format(iou_type))
81
+
82
+ def prepare_for_coco_detection(self, predictions):
83
+ coco_results = []
84
+ for original_id, prediction in predictions.items():
85
+ if len(prediction) == 0:
86
+ continue
87
+
88
+ boxes = prediction["boxes"]
89
+ boxes = convert_to_xywh(boxes).tolist()
90
+ if not isinstance(prediction["scores"], list):
91
+ scores = prediction["scores"].tolist()
92
+ else:
93
+ scores = prediction["scores"]
94
+ if not isinstance(prediction["labels"], list):
95
+ labels = prediction["labels"].tolist()
96
+ else:
97
+ labels = prediction["labels"]
98
+
99
+
100
+ try:
101
+ coco_results.extend(
102
+ [
103
+ {
104
+ "image_id": original_id,
105
+ "category_id": labels[k],
106
+ "bbox": box,
107
+ "score": scores[k],
108
+ }
109
+ for k, box in enumerate(boxes)
110
+ ]
111
+ )
112
+ except:
113
+ import ipdb; ipdb.set_trace()
114
+ return coco_results
115
+
116
+ def prepare_for_coco_segmentation(self, predictions):
117
+ coco_results = []
118
+ for original_id, prediction in predictions.items():
119
+ if len(prediction) == 0:
120
+ continue
121
+
122
+ scores = prediction["scores"]
123
+ labels = prediction["labels"]
124
+ masks = prediction["masks"]
125
+
126
+ masks = masks > 0.5
127
+
128
+ scores = prediction["scores"].tolist()
129
+ labels = prediction["labels"].tolist()
130
+
131
+ rles = [
132
+ mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
133
+ for mask in masks
134
+ ]
135
+ for rle in rles:
136
+ rle["counts"] = rle["counts"].decode("utf-8")
137
+
138
+ coco_results.extend(
139
+ [
140
+ {
141
+ "image_id": original_id,
142
+ "category_id": labels[k],
143
+ "segmentation": rle,
144
+ "score": scores[k],
145
+ }
146
+ for k, rle in enumerate(rles)
147
+ ]
148
+ )
149
+ return coco_results
150
+
151
+ def prepare_for_coco_keypoint(self, predictions):
152
+ coco_results = []
153
+ for original_id, prediction in predictions.items():
154
+ if len(prediction) == 0:
155
+ continue
156
+
157
+ boxes = prediction["boxes"]
158
+ boxes = convert_to_xywh(boxes).tolist()
159
+ scores = prediction["scores"].tolist()
160
+ labels = prediction["labels"].tolist()
161
+ keypoints = prediction["keypoints"]
162
+ keypoints = keypoints.flatten(start_dim=1).tolist()
163
+
164
+ coco_results.extend(
165
+ [
166
+ {
167
+ "image_id": original_id,
168
+ "category_id": labels[k],
169
+ 'keypoints': keypoint,
170
+ "score": scores[k],
171
+ }
172
+ for k, keypoint in enumerate(keypoints)
173
+ ]
174
+ )
175
+ return coco_results
176
+
177
+
178
+ def convert_to_xywh(boxes):
179
+ xmin, ymin, xmax, ymax = boxes.unbind(1)
180
+ return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
181
+
182
+
183
+ def merge(img_ids, eval_imgs):
184
+ all_img_ids = all_gather(img_ids)
185
+ all_eval_imgs = all_gather(eval_imgs)
186
+
187
+ merged_img_ids = []
188
+ for p in all_img_ids:
189
+ merged_img_ids.extend(p)
190
+
191
+ merged_eval_imgs = []
192
+ for p in all_eval_imgs:
193
+ merged_eval_imgs.append(p)
194
+
195
+ merged_img_ids = np.array(merged_img_ids)
196
+ merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
197
+
198
+ # keep only unique (and in sorted order) images
199
+ merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
200
+ merged_eval_imgs = merged_eval_imgs[..., idx]
201
+
202
+ return merged_img_ids, merged_eval_imgs
203
+
204
+
205
+ def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
206
+ img_ids, eval_imgs = merge(img_ids, eval_imgs)
207
+ img_ids = list(img_ids)
208
+ eval_imgs = list(eval_imgs.flatten())
209
+
210
+ coco_eval.evalImgs = eval_imgs
211
+ coco_eval.params.imgIds = img_ids
212
+ coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
213
+
214
+
215
+ #################################################################
216
+ # From pycocotools, just removed the prints and fixed
217
+ # a Python3 bug about unicode not defined
218
+ #################################################################
219
+
220
+
221
+ def evaluate(self):
222
+ '''
223
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
224
+ :return: None
225
+ '''
226
+ p = self.params
227
+ # add backward compatibility if useSegm is specified in params
228
+ if p.useSegm is not None:
229
+ p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
230
+ print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
231
+ p.imgIds = list(np.unique(p.imgIds))
232
+ if p.useCats:
233
+ p.catIds = list(np.unique(p.catIds))
234
+ p.maxDets = sorted(p.maxDets)
235
+ self.params = p
236
+
237
+ self._prepare()
238
+ # loop through images, area range, max detection number
239
+ catIds = p.catIds if p.useCats else [-1]
240
+
241
+ if p.iouType == 'segm' or p.iouType == 'bbox':
242
+ computeIoU = self.computeIoU
243
+ elif p.iouType == 'keypoints':
244
+ computeIoU = self.computeOks
245
+ self.ious = {
246
+ (imgId, catId): computeIoU(imgId, catId)
247
+ for imgId in p.imgIds
248
+ for catId in catIds}
249
+
250
+ evaluateImg = self.evaluateImg
251
+ maxDet = p.maxDets[-1]
252
+ evalImgs = [
253
+ evaluateImg(imgId, catId, areaRng, maxDet)
254
+ for catId in catIds
255
+ for areaRng in p.areaRng
256
+ for imgId in p.imgIds
257
+ ]
258
+ # this is NOT in the pycocotools code, but could be done outside
259
+ evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
260
+ self._paramsEval = copy.deepcopy(self.params)
261
+
262
+ return p.imgIds, evalImgs
263
+
264
+ #################################################################
265
+ # end of straight copy from pycocotools, just removing the prints
266
+ #################################################################
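
A sketch of the loop that typically drives this evaluator. The model, the data loader (assumed to collate targets as a list of dicts) and the device handling are stand-ins; the `predictions` layout (image_id mapped to boxes in absolute xyxy, scores, labels) is what `prepare_for_coco_detection` above expects.

    import torch
    from groundingdino.datasets.coco_eval import CocoEvaluator   # assumed import path


    def evaluate_bbox(model, data_loader, coco_api, device="cuda"):
        """Run bbox evaluation; model and loader are placeholders for the real training code."""
        evaluator = CocoEvaluator(coco_api, iou_types=("bbox",))
        model.eval()
        with torch.no_grad():
            for images, targets in data_loader:
                outputs = model([img.to(device) for img in images])  # assumed: one prediction dict per image
                predictions = {
                    t["image_id"].item(): {
                        "boxes": o["boxes"].cpu(),    # xyxy, absolute pixel coordinates
                        "scores": o["scores"].cpu(),
                        "labels": o["labels"].cpu(),
                    }
                    for t, o in zip(targets, outputs)
                }
                evaluator.update(predictions)
        evaluator.synchronize_between_processes()
        evaluator.accumulate()
        evaluator.summarize()
        return evaluator
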
groundingdino/datasets/coco_panoptic.py ADDED
@@ -0,0 +1,99 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+
9
+ from panopticapi.utils import rgb2id
10
+ from util.box_ops import masks_to_boxes
11
+
12
+ from .coco import make_coco_transforms
13
+
14
+
15
+ class CocoPanoptic:
16
+ def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
17
+ with open(ann_file, 'r') as f:
18
+ self.coco = json.load(f)
19
+
20
+ # sort 'images' field so that they are aligned with 'annotations'
21
+ # i.e., in alphabetical order
22
+ self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
23
+ # sanity check
24
+ if "annotations" in self.coco:
25
+ for img, ann in zip(self.coco['images'], self.coco['annotations']):
26
+ assert img['file_name'][:-4] == ann['file_name'][:-4]
27
+
28
+ self.img_folder = img_folder
29
+ self.ann_folder = ann_folder
30
+ self.ann_file = ann_file
31
+ self.transforms = transforms
32
+ self.return_masks = return_masks
33
+
34
+ def __getitem__(self, idx):
35
+ ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
36
+ img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
37
+ ann_path = Path(self.ann_folder) / ann_info['file_name']
38
+
39
+ img = Image.open(img_path).convert('RGB')
40
+ w, h = img.size
41
+ if "segments_info" in ann_info:
42
+ masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
43
+ masks = rgb2id(masks)
44
+
45
+ ids = np.array([ann['id'] for ann in ann_info['segments_info']])
46
+ masks = masks == ids[:, None, None]
47
+
48
+ masks = torch.as_tensor(masks, dtype=torch.uint8)
49
+ labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
50
+
51
+ target = {}
52
+ target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
53
+ if self.return_masks:
54
+ target['masks'] = masks
55
+ target['labels'] = labels
56
+
57
+ target["boxes"] = masks_to_boxes(masks)
58
+
59
+ target['size'] = torch.as_tensor([int(h), int(w)])
60
+ target['orig_size'] = torch.as_tensor([int(h), int(w)])
61
+ if "segments_info" in ann_info:
62
+ for name in ['iscrowd', 'area']:
63
+ target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
64
+
65
+ if self.transforms is not None:
66
+ img, target = self.transforms(img, target)
67
+
68
+ return img, target
69
+
70
+ def __len__(self):
71
+ return len(self.coco['images'])
72
+
73
+ def get_height_and_width(self, idx):
74
+ img_info = self.coco['images'][idx]
75
+ height = img_info['height']
76
+ width = img_info['width']
77
+ return height, width
78
+
79
+
80
+ def build(image_set, args):
81
+ img_folder_root = Path(args.coco_path)
82
+ ann_folder_root = Path(args.coco_panoptic_path)
83
+ assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
84
+ assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
85
+ mode = 'panoptic'
86
+ PATHS = {
87
+ "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
88
+ "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
89
+ }
90
+
91
+ img_folder, ann_file = PATHS[image_set]
92
+ img_folder_path = img_folder_root / img_folder
93
+ ann_folder = ann_folder_root / f'{mode}_{img_folder}'
94
+ ann_file = ann_folder_root / ann_file
95
+
96
+ dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
97
+ transforms=make_coco_transforms(image_set), return_masks=args.masks)
98
+
99
+ return dataset
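Note: the trickiest line in `CocoPanoptic.__getitem__` above is the broadcast comparison that turns the single `rgb2id` map into per-segment binary masks. A toy sketch with made-up values:

```python
# An HxW id map is compared against N segment ids to yield N boolean masks.
import numpy as np
import torch

id_map = np.array([[7, 7, 3],
                   [3, 3, 9]], dtype=np.uint32)   # toy rgb2id output (H=2, W=3)
segment_ids = np.array([7, 3, 9])                 # ids listed in segments_info

masks = id_map == segment_ids[:, None, None]      # broadcasts to shape (3, 2, 3)
masks = torch.as_tensor(masks, dtype=torch.uint8)
print(masks.shape)   # torch.Size([3, 2, 3])
print(masks[0])      # binary mask for segment id 7
```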
groundingdino/datasets/cocogrounding_eval.py CHANGED
@@ -45,7 +45,7 @@ class CocoGroundingEvaluator(object):
45
  def update(self, predictions):
46
  img_ids = list(np.unique(list(predictions.keys())))
47
  self.img_ids.extend(img_ids)
48
-
49
  for iou_type in self.iou_types:
50
  results = self.prepare(predictions, iou_type)
51
 
@@ -223,6 +223,8 @@ def evaluate(self):
223
  """
224
  # tic = time.time()
225
  # print('Running per image evaluation...')
 
 
226
  p = self.params
227
  # add backward compatibility if useSegm is specified in params
228
  if p.useSegm is not None:
 
45
  def update(self, predictions):
46
  img_ids = list(np.unique(list(predictions.keys())))
47
  self.img_ids.extend(img_ids)
48
+ # import pdb;pdb.set_trace()
49
  for iou_type in self.iou_types:
50
  results = self.prepare(predictions, iou_type)
51
 
 
223
  """
224
  # tic = time.time()
225
  # print('Running per image evaluation...')
226
+
227
+ # import pdb;pdb.set_trace()
228
  p = self.params
229
  # add backward compatibility if useSegm is specified in params
230
  if p.useSegm is not None:
groundingdino/datasets/data_util.py ADDED
@@ -0,0 +1,170 @@
1
+ import os
2
+ import os.path as osp
3
+ import shutil
4
+ import time
5
+ import datetime
6
+
7
+ import torch
8
+
9
+ from util.slconfig import SLConfig
10
+
11
+ class Error(OSError):
12
+ pass
13
+
14
+ def slcopytree(src, dst, symlinks=False, ignore=None, copy_function=shutil.copyfile,
15
+ ignore_dangling_symlinks=False):
16
+ """
17
+ modified from shutil.copytree without copystat.
18
+
19
+ Recursively copy a directory tree.
20
+
21
+ The destination directory must not already exist.
22
+ If exception(s) occur, an Error is raised with a list of reasons.
23
+
24
+ If the optional symlinks flag is true, symbolic links in the
25
+ source tree result in symbolic links in the destination tree; if
26
+ it is false, the contents of the files pointed to by symbolic
27
+ links are copied. If the file pointed by the symlink doesn't
28
+ exist, an exception will be added in the list of errors raised in
29
+ an Error exception at the end of the copy process.
30
+
31
+ You can set the optional ignore_dangling_symlinks flag to true if you
32
+ want to silence this exception. Notice that this has no effect on
33
+ platforms that don't support os.symlink.
34
+
35
+ The optional ignore argument is a callable. If given, it
36
+ is called with the `src` parameter, which is the directory
37
+ being visited by copytree(), and `names` which is the list of
38
+ `src` contents, as returned by os.listdir():
39
+
40
+ callable(src, names) -> ignored_names
41
+
42
+ Since copytree() is called recursively, the callable will be
43
+ called once for each directory that is copied. It returns a
44
+ list of names relative to the `src` directory that should
45
+ not be copied.
46
+
47
+ The optional copy_function argument is a callable that will be used
48
+ to copy each file. It will be called with the source path and the
49
+ destination path as arguments. By default, copy2() is used, but any
50
+ function that supports the same signature (like copy()) can be used.
51
+
52
+ """
53
+ errors = []
54
+ if os.path.isdir(src):
55
+ names = os.listdir(src)
56
+ if ignore is not None:
57
+ ignored_names = ignore(src, names)
58
+ else:
59
+ ignored_names = set()
60
+
61
+ os.makedirs(dst)
62
+ for name in names:
63
+ if name in ignored_names:
64
+ continue
65
+ srcname = os.path.join(src, name)
66
+ dstname = os.path.join(dst, name)
67
+ try:
68
+ if os.path.islink(srcname):
69
+ linkto = os.readlink(srcname)
70
+ if symlinks:
71
+ # We can't just leave it to `copy_function` because legacy
72
+ # code with a custom `copy_function` may rely on copytree
73
+ # doing the right thing.
74
+ os.symlink(linkto, dstname)
75
+ else:
76
+ # ignore dangling symlink if the flag is on
77
+ if not os.path.exists(linkto) and ignore_dangling_symlinks:
78
+ continue
79
+ # otherwise let the copy occurs. copy2 will raise an error
80
+ if os.path.isdir(srcname):
81
+ slcopytree(srcname, dstname, symlinks, ignore,
82
+ copy_function)
83
+ else:
84
+ copy_function(srcname, dstname)
85
+ elif os.path.isdir(srcname):
86
+ slcopytree(srcname, dstname, symlinks, ignore, copy_function)
87
+ else:
88
+ # Will raise a SpecialFileError for unsupported file types
89
+ copy_function(srcname, dstname)
90
+ # catch the Error from the recursive copytree so that we can
91
+ # continue with other files
92
+ except Error as err:
93
+ errors.extend(err.args[0])
94
+ except OSError as why:
95
+ errors.append((srcname, dstname, str(why)))
96
+ else:
97
+ copy_function(src, dst)
98
+
99
+ if errors:
100
+ raise Error(errors)
101
+ return dst
102
+
103
+ def check_and_copy(src_path, tgt_path):
104
+ if os.path.exists(tgt_path):
105
+ return None
106
+
107
+ return slcopytree(src_path, tgt_path)
108
+
109
+
110
+ def remove(srcpath):
111
+ if os.path.isdir(srcpath):
112
+ return shutil.rmtree(srcpath)
113
+ else:
114
+ return os.remove(srcpath)
115
+
116
+
117
+ def preparing_dataset(pathdict, image_set, args):
118
+ start_time = time.time()
119
+ dataset_file = args.dataset_file
120
+ data_static_info = SLConfig.fromfile('util/static_data_path.py')
121
+ static_dict = data_static_info[dataset_file][image_set]
122
+
123
+ copyfilelist = []
124
+ for k,tgt_v in pathdict.items():
125
+ if os.path.exists(tgt_v):
126
+ if args.local_rank == 0:
127
+ print("path <{}> exist. remove it!".format(tgt_v))
128
+ remove(tgt_v)
129
+ # continue
130
+
131
+ if args.local_rank == 0:
132
+ src_v = static_dict[k]
133
+ assert isinstance(src_v, str)
134
+ if src_v.endswith('.zip'):
135
+ # copy
136
+ cp_tgt_dir = os.path.dirname(tgt_v)
137
+ filename = os.path.basename(src_v)
138
+ cp_tgt_path = os.path.join(cp_tgt_dir, filename)
139
+ print('Copy from <{}> to <{}>.'.format(src_v, cp_tgt_path))
140
+ os.makedirs(cp_tgt_dir, exist_ok=True)
141
+ check_and_copy(src_v, cp_tgt_path)
142
+
143
+ # unzip
144
+ import zipfile
145
+ print("Starting unzip <{}>".format(cp_tgt_path))
146
+ with zipfile.ZipFile(cp_tgt_path, 'r') as zip_ref:
147
+ zip_ref.extractall(os.path.dirname(cp_tgt_path))
148
+
149
+ copyfilelist.append(cp_tgt_path)
150
+ copyfilelist.append(tgt_v)
151
+ else:
152
+ print('Copy from <{}> to <{}>.'.format(src_v, tgt_v))
153
+ os.makedirs(os.path.dirname(tgt_v), exist_ok=True)
154
+ check_and_copy(src_v, tgt_v)
155
+ copyfilelist.append(tgt_v)
156
+
157
+ if len(copyfilelist) == 0:
158
+ copyfilelist = None
159
+ args.copyfilelist = copyfilelist
160
+
161
+ if args.distributed:
162
+ torch.distributed.barrier()
163
+ total_time = time.time() - start_time
164
+ if copyfilelist:
165
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
166
+ print('Data copy time {}'.format(total_time_str))
167
+ return copyfilelist
168
+
169
+
170
+
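Note: `preparing_dataset()` above boils down to a copy-then-unzip step gated by `check_and_copy()`. A minimal sketch of that flow without the `SLConfig` lookup or the distributed barrier (the paths and the helper name `stage_zip` are hypothetical):

```python
# Copy a dataset archive to a local target directory once, then extract it.
import os
import shutil
import zipfile

def stage_zip(src_zip: str, tgt_dir: str) -> str:
    os.makedirs(tgt_dir, exist_ok=True)
    staged = os.path.join(tgt_dir, os.path.basename(src_zip))
    if not os.path.exists(staged):              # mirrors check_and_copy()
        shutil.copyfile(src_zip, staged)
    with zipfile.ZipFile(staged, "r") as zf:    # mirrors the unzip branch
        zf.extractall(tgt_dir)
    return staged

# staged = stage_zip("/data/static/coco_train2017.zip", "/tmp/coco")  # hypothetical paths
```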
groundingdino/datasets/dataset.py ADDED
@@ -0,0 +1,44 @@
1
+ from __future__ import print_function
2
+
3
+ import torch
4
+ import torchvision.datasets as datasets
5
+ from torch.utils.data import Dataset
6
+ from PIL import Image
7
+ from .tsv_io import TSVFile
8
+ import numpy as np
9
+ import base64
10
+ import io
11
+
12
+
13
+ class TSVDataset(Dataset):
14
+ """ TSV dataset for ImageNet 1K training
15
+ """
16
+ def __init__(self, tsv_file, transform=None, target_transform=None):
17
+ self.tsv = TSVFile(tsv_file)
18
+ self.transform = transform
19
+ self.target_transform = target_transform
20
+
21
+ def __getitem__(self, index):
22
+ """
23
+ Args:
24
+ index (int): Index
25
+ Returns:
26
+ tuple: (image, target) where target is class_index of the target class.
27
+ """
28
+ row = self.tsv.seek(index)
29
+ image_data = base64.b64decode(row[-1])
30
+ image = Image.open(io.BytesIO(image_data))
31
+ image = image.convert('RGB')
32
+ target = int(row[1])
33
+
34
+ if self.transform is not None:
35
+ img = self.transform(image)
36
+ else:
37
+ img = image
38
+ if self.target_transform is not None:
39
+ target = self.target_transform(target)
40
+
41
+ return img, target
42
+
43
+ def __len__(self):
44
+ return self.tsv.num_rows()
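Note: each TSV row read by `TSVDataset` above carries a base64-encoded image in its last column and an integer class label in its second. A minimal sketch of that decode step (the sample row is hypothetical):

```python
# Decode one TSV row into a PIL image and its class index.
import base64
import io
from PIL import Image

def decode_row(row):
    image = Image.open(io.BytesIO(base64.b64decode(row[-1]))).convert("RGB")
    label = int(row[1])
    return image, label

# row = ["n01440764_18.JPEG", "0", "<base64-encoded JPEG bytes>"]  # hypothetical row
# img, target = decode_row(row)
```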
groundingdino/datasets/odvg.py ADDED
@@ -0,0 +1,258 @@
1
+ from torchvision.datasets.vision import VisionDataset
2
+ import os.path
3
+ from typing import Callable, Optional
4
+ import json
5
+ from PIL import Image
6
+ import torch
7
+ import random
8
+ import os, sys
9
+ sys.path.append(os.path.dirname(sys.path[0]))
10
+
11
+ import datasets.transforms as T
12
+
13
+ class ODVGDataset(VisionDataset):
14
+ """
15
+ Args:
16
+ root (string): Root directory where images are downloaded to.
17
+ anno (string): Path to json annotation file.
18
+ label_map_anno (string): Path to json label mapping file. Only for Object Detection
19
+ transform (callable, optional): A function/transform that takes in an PIL image
20
+ and returns a transformed version. E.g, ``transforms.PILToTensor``
21
+ target_transform (callable, optional): A function/transform that takes in the
22
+ target and transforms it.
23
+ transforms (callable, optional): A function/transform that takes input sample and its target as entry
24
+ and returns a transformed version.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ root: str,
30
+ anno: str,
31
+ label_map_anno: str = None,
32
+ max_labels: int = 80,
33
+ transform: Optional[Callable] = None,
34
+ target_transform: Optional[Callable] = None,
35
+ transforms: Optional[Callable] = None,
36
+ ) -> None:
37
+ super().__init__(root, transforms, transform, target_transform)
38
+ self.root = root
39
+ self.dataset_mode = "OD" if label_map_anno else "VG"
40
+ self.max_labels = max_labels
41
+ if self.dataset_mode == "OD":
42
+ self.load_label_map(label_map_anno)
43
+ self._load_metas(anno)
44
+ self.get_dataset_info()
45
+
46
+ def load_label_map(self, label_map_anno):
47
+ with open(label_map_anno, 'r') as file:
48
+ self.label_map = json.load(file)
49
+ self.label_index = set(self.label_map.keys())
50
+
51
+ def _load_metas(self, anno):
52
+ with open(anno, 'r') as f:
53
+ self.metas = json.load(f)
54
+
55
+
56
+ def get_dataset_info(self):
57
+ print(f" == total images: {len(self)}")
58
+ if self.dataset_mode == "OD":
59
+ print(f" == total labels: {len(self.label_map)}")
60
+
61
+ def __getitem__(self, index: int):
62
+ meta = self.metas[index]
63
+ rel_path = meta["filename"]
64
+ abs_path = os.path.join(self.root, rel_path)
65
+ if not os.path.exists(abs_path):
66
+ raise FileNotFoundError(f"{abs_path} not found.")
67
+ image = Image.open(abs_path).convert('RGB')
68
+ w, h = image.size
69
+ if self.dataset_mode == "OD":
70
+ anno = meta["detection"]
71
+ instances = [obj for obj in anno["instances"]]
72
+ boxes = [obj["bbox"] for obj in instances]
73
+ # generate vg_labels
74
+ # pos bbox labels
75
+ ori_classes = [str(obj["label"]) for obj in instances]
76
+ pos_labels = set(ori_classes)
77
+ # neg bbox labels
78
+ neg_labels = self.label_index.difference(pos_labels)
79
+
80
+ vg_labels = list(pos_labels)
81
+ num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
82
+ if num_to_add > 0:
83
+ vg_labels.extend(random.sample(neg_labels, num_to_add))
84
+
85
+ # shuffle
86
+ for i in range(len(vg_labels)-1, 0, -1):
87
+ j = random.randint(0, i)
88
+ vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]
89
+
90
+ caption_list = [self.label_map[lb] for lb in vg_labels]
91
+ caption_dict = {item:index for index, item in enumerate(caption_list)}
92
+
93
+ caption = ' . '.join(caption_list) + ' .'
94
+ classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
95
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
96
+ classes = torch.tensor(classes, dtype=torch.int64)
97
+ elif self.dataset_mode == "VG":
98
+ anno = meta["Grounding"]
99
+ instances = [obj for obj in anno["regions"]]
100
+ boxes = [obj["bbox"] for obj in instances]
101
+ caption_list = [obj["phrase"] for obj in instances]
102
+ c = list(zip(boxes, caption_list))
103
+ random.shuffle(c)
104
+ boxes[:], caption_list[:] = zip(*c)
105
+ uni_caption_list = list(set(caption_list))
106
+ label_map = {}
107
+ for idx in range(len(uni_caption_list)):
108
+ label_map[uni_caption_list[idx]] = idx
109
+ classes = [label_map[cap] for cap in caption_list]
110
+ caption = ' . '.join(uni_caption_list) + ' .'
111
+ boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
112
+ classes = torch.tensor(classes, dtype=torch.int64)
113
+ caption_list = uni_caption_list
114
+ # print("caption_list" , caption_list)
115
+ # print("caption" , caption)
116
+ # print("boxes" , boxes)
117
+ target = {}
118
+ target["image_id"] = os.path.splitext(rel_path)[0]  # str.strip(".jpg") strips characters, not the suffix
119
+ target["size"] = torch.as_tensor([int(h), int(w)])
120
+ target["cap_list"] = caption_list
121
+ target["caption"] = caption
122
+ target["boxes"] = boxes
123
+ target["labels"] = classes
124
+ # print(" image_id " , target["image_id"])
125
+ # size, cap_list, caption, bboxes, labels
126
+
127
+ if self.transforms is not None:
128
+ image, target = self.transforms(image, target)
129
+
130
+ return image, target
131
+
132
+
133
+ def __len__(self) -> int:
134
+ return len(self.metas)
135
+
136
+
137
+ def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
138
+
139
+ normalize = T.Compose([
140
+ T.ToTensor(),
141
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
142
+ ])
143
+
144
+ # config the params for data aug
145
+ scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
146
+ max_size = 1333
147
+ scales2_resize = [400, 500, 600]
148
+ scales2_crop = [384, 600]
149
+
150
+ # update args from config files
151
+ scales = getattr(args, 'data_aug_scales', scales)
152
+ max_size = getattr(args, 'data_aug_max_size', max_size)
153
+ scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
154
+ scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
155
+
156
+ # resize them
157
+ data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
158
+ if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
159
+ data_aug_scale_overlap = float(data_aug_scale_overlap)
160
+ scales = [int(i*data_aug_scale_overlap) for i in scales]
161
+ max_size = int(max_size*data_aug_scale_overlap)
162
+ scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
163
+ scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
164
+
165
+ # datadict_for_print = {
166
+ # 'scales': scales,
167
+ # 'max_size': max_size,
168
+ # 'scales2_resize': scales2_resize,
169
+ # 'scales2_crop': scales2_crop
170
+ # }
171
+ # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
172
+
173
+ if image_set == 'train':
174
+ if fix_size:
175
+ return T.Compose([
176
+ T.RandomHorizontalFlip(),
177
+ T.RandomResize([(max_size, max(scales))]),
178
+ normalize,
179
+ ])
180
+
181
+ if strong_aug:
182
+ import datasets.sltransform as SLT
183
+
184
+ return T.Compose([
185
+ T.RandomHorizontalFlip(),
186
+ T.RandomSelect(
187
+ T.RandomResize(scales, max_size=max_size),
188
+ T.Compose([
189
+ T.RandomResize(scales2_resize),
190
+ T.RandomSizeCrop(*scales2_crop),
191
+ T.RandomResize(scales, max_size=max_size),
192
+ ])
193
+ ),
194
+ SLT.RandomSelectMulti([
195
+ SLT.RandomCrop(),
196
+ SLT.LightingNoise(),
197
+ SLT.AdjustBrightness(2),
198
+ SLT.AdjustContrast(2),
199
+ ]),
200
+ normalize,
201
+ ])
202
+
203
+ return T.Compose([
204
+ T.RandomHorizontalFlip(),
205
+ T.RandomSelect(
206
+ T.RandomResize(scales, max_size=max_size),
207
+ T.Compose([
208
+ T.RandomResize(scales2_resize),
209
+ T.RandomSizeCrop(*scales2_crop),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ ])
212
+ ),
213
+ normalize,
214
+ ])
215
+
216
+ if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
217
+
218
+ if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
219
+ print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
220
+ return T.Compose([
221
+ T.ResizeDebug((1280, 800)),
222
+ normalize,
223
+ ])
224
+
225
+ return T.Compose([
226
+ T.RandomResize([max(scales)], max_size=max_size),
227
+ normalize,
228
+ ])
229
+
230
+ raise ValueError(f'unknown {image_set}')
231
+
232
+ def build_odvg(image_set, args, datasetinfo):
233
+ img_folder = datasetinfo["root"]
234
+ ann_file = datasetinfo["anno"]
235
+ label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
236
+ try:
237
+ strong_aug = args.strong_aug
238
+ except:
239
+ strong_aug = False # False originally
240
+ print(img_folder, ann_file, label_map)
241
+ dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
242
+ transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
243
+ )
244
+ return dataset
245
+
246
+
247
+ if __name__=="__main__":
248
+ dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
249
+ print(len(dataset_vg))
250
+ data = dataset_vg[random.randint(0, 100)]
251
+ print(data)
252
+ dataset_od = ODVGDataset("pathl/V3Det/",
253
+ "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
254
+ "path/V3Det/annotations/v3det_label_map.json",
255
+ )
256
+ print(len(dataset_od))
257
+ data = dataset_od[random.randint(0, 100)]
258
+ print(data)
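Note: in "OD" mode, `ODVGDataset.__getitem__` above builds its text prompt from the positive labels plus sampled negatives, then re-indexes each box against that prompt. A standalone sketch with toy data (the label map and boxes are made up):

```python
# Build the " . "-joined caption and per-box class indices the way the OD branch does.
import random
import torch

label_map = {"1": "airplane", "2": "storage tank", "3": "baseball field"}
instances = [{"bbox": [10, 10, 50, 50], "label": 1},
             {"bbox": [60, 20, 90, 70], "label": 1}]

pos = {str(obj["label"]) for obj in instances}                 # positive label ids
neg = list(set(label_map) - pos)                               # candidate negatives
vg_labels = list(pos) + random.sample(neg, min(len(neg), 80 - len(pos)))
random.shuffle(vg_labels)

caption_list = [label_map[lb] for lb in vg_labels]
caption = " . ".join(caption_list) + " ."
caption_dict = {phrase: i for i, phrase in enumerate(caption_list)}
classes = torch.tensor([caption_dict[label_map[str(obj["label"])]] for obj in instances])
boxes = torch.as_tensor([obj["bbox"] for obj in instances], dtype=torch.float32)
print(caption)   # e.g. "storage tank . airplane . baseball field ."
print(classes)   # both boxes point at the position of "airplane" in the caption
```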
groundingdino/datasets/panoptic_eval.py ADDED
@@ -0,0 +1,44 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import json
3
+ import os
4
+
5
+ import util.misc as utils
6
+
7
+ try:
8
+ from panopticapi.evaluation import pq_compute
9
+ except ImportError:
10
+ pass
11
+
12
+
13
+ class PanopticEvaluator(object):
14
+ def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
15
+ self.gt_json = ann_file
16
+ self.gt_folder = ann_folder
17
+ if utils.is_main_process():
18
+ if not os.path.exists(output_dir):
19
+ os.mkdir(output_dir)
20
+ self.output_dir = output_dir
21
+ self.predictions = []
22
+
23
+ def update(self, predictions):
24
+ for p in predictions:
25
+ with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
26
+ f.write(p.pop("png_string"))
27
+
28
+ self.predictions += predictions
29
+
30
+ def synchronize_between_processes(self):
31
+ all_predictions = utils.all_gather(self.predictions)
32
+ merged_predictions = []
33
+ for p in all_predictions:
34
+ merged_predictions += p
35
+ self.predictions = merged_predictions
36
+
37
+ def summarize(self):
38
+ if utils.is_main_process():
39
+ json_data = {"annotations": self.predictions}
40
+ predictions_json = os.path.join(self.output_dir, "predictions.json")
41
+ with open(predictions_json, "w") as f:
42
+ f.write(json.dumps(json_data))
43
+ return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
44
+ return None
groundingdino/datasets/random_crop.py ADDED
@@ -0,0 +1,135 @@
1
+ import PIL #version 1.2.0
2
+ import torch
3
+ import os
4
+ import torchvision.transforms.functional as F
5
+ import numpy as np
6
+ import random
7
+
8
+
9
+ def intersect(boxes1, boxes2):
10
+ '''
11
+ Find intersection of every box combination between two sets of boxes
12
+ boxes1: bounding boxes 1, a tensor of dimensions (n1, 4)
13
+ boxes2: bounding boxes 2, a tensor of dimensions (n2, 4)
14
+
15
+ Out: Intersection each of boxes1 with respect to each of boxes2,
16
+ a tensor of dimensions (n1, n2)
17
+ '''
18
+ n1 = boxes1.size(0)
19
+ n2 = boxes2.size(0)
20
+ max_xy = torch.min(boxes1[:, 2:].unsqueeze(1).expand(n1, n2, 2),
21
+ boxes2[:, 2:].unsqueeze(0).expand(n1, n2, 2))
22
+
23
+ min_xy = torch.max(boxes1[:, :2].unsqueeze(1).expand(n1, n2, 2),
24
+ boxes2[:, :2].unsqueeze(0).expand(n1, n2, 2))
25
+ inter = torch.clamp(max_xy - min_xy , min=0) # (n1, n2, 2)
26
+ return inter[:, :, 0] * inter[:, :, 1] #(n1, n2)
27
+ def find_IoU(boxes1, boxes2):
28
+ '''
29
+ Find IoU between every box in one set and every box in the other set
30
+ boxes1: a tensor of dimensions (n1, 4) (left, top, right , bottom)
31
+ boxes2: a tensor of dimensions (n2, 4)
32
+
33
+ Out: IoU each of boxes1 with respect to each of boxes2, a tensor of
34
+ dimensions (n1, n2)
35
+
36
+ Formula:
37
+ (box1 ∩ box2) / (box1 u box2) = (box1 ∩ box2) / (area(box1) + area(box2) - (box1 ∩ box2 ))
38
+ '''
39
+ inter = intersect(boxes1, boxes2)
40
+ area_boxes1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
41
+ area_boxes2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
42
+
43
+ area_boxes1 = area_boxes1.unsqueeze(1).expand_as(inter) #(n1, n2)
44
+ area_boxes2 = area_boxes2.unsqueeze(0).expand_as(inter) #(n1, n2)
45
+ union = (area_boxes1 + area_boxes2 - inter)
46
+ return inter / union
47
+
48
+
49
+ def random_crop(image, boxes, labels, difficulties=None):
50
+ '''
51
+ image: A PIL image
52
+ boxes: Bounding boxes, a tensor of dimensions (#objects, 4)
53
+ labels: labels of object, a tensor of dimensions (#objects)
54
+ difficulties: difficulties of detect object, a tensor of dimensions (#objects)
55
+
56
+ Out: cropped image , new boxes, new labels, new difficulties
57
+ '''
58
+ if type(image) == PIL.Image.Image:
59
+ image = F.to_tensor(image)
60
+ original_h = image.size(1)
61
+ original_w = image.size(2)
62
+
63
+ while True:
64
+ mode = random.choice([0.1, 0.3, 0.5, 0.9, None])
65
+
66
+ if mode is None:
67
+ return F.to_pil_image(image), boxes, labels, difficulties
68
+
69
+ new_image = image
70
+ new_boxes = boxes
71
+ new_difficulties = difficulties
72
+ new_labels = labels
73
+ for _ in range(50):
74
+ # Crop dimensions: [0.3, 1] of original dimensions
75
+ new_h = random.uniform(0.3*original_h, original_h)
76
+ new_w = random.uniform(0.3*original_w, original_w)
77
+
78
+ # Aspect ratio constraint b/t .5 & 2
79
+ if new_h/new_w < 0.5 or new_h/new_w > 2:
80
+ continue
81
+
82
+ #Crop coordinate
83
+ left = random.uniform(0, original_w - new_w)
84
+ right = left + new_w
85
+ top = random.uniform(0, original_h - new_h)
86
+ bottom = top + new_h
87
+ crop = torch.FloatTensor([int(left), int(top), int(right), int(bottom)])
88
+
89
+ # Calculate IoU between the crop and the bounding boxes
90
+ overlap = find_IoU(crop.unsqueeze(0), boxes) #(1, #objects)
91
+ overlap = overlap.squeeze(0)
92
+
93
+ # If not a single bounding box has an IoU greater than the minimum, try again
94
+ if overlap.shape[0] == 0:
95
+ continue
96
+ if overlap.max().item() < mode:
97
+ continue
98
+
99
+ #Crop
100
+ new_image = image[:, int(top):int(bottom), int(left):int(right)] #(3, new_h, new_w)
101
+
102
+ #Center of bounding boxes
103
+ center_bb = (boxes[:, :2] + boxes[:, 2:])/2.0
104
+
105
+ # Find bounding boxes whose center lies inside the crop
106
+ center_in_crop = (center_bb[:, 0] >left) * (center_bb[:, 0] < right
107
+ ) *(center_bb[:, 1] > top) * (center_bb[:, 1] < bottom) #( #objects)
108
+
109
+ if not center_in_crop.any():
110
+ continue
111
+
112
+ #take matching bounding box
113
+ new_boxes = boxes[center_in_crop, :]
114
+
115
+ #take matching labels
116
+ new_labels = labels[center_in_crop]
117
+
118
+ #take matching difficulities
119
+ if difficulties is not None:
120
+ new_difficulties = difficulties[center_in_crop]
121
+ else:
122
+ new_difficulties = None
123
+
124
+ #Use the box left and top corner or the crop's
125
+ new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])
126
+
127
+ #adjust to crop
128
+ new_boxes[:, :2] -= crop[:2]
129
+
130
+ new_boxes[:, 2:] = torch.min(new_boxes[:, 2:],crop[2:])
131
+
132
+ #adjust to crop
133
+ new_boxes[:, 2:] -= crop[:2]
134
+
135
+ return F.to_pil_image(new_image), new_boxes, new_labels, new_difficulties
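Note: a hand-checkable sanity test for the IoU math used by `random_crop()` above; the computation mirrors `intersect()` / `find_IoU()` for a single pair of xyxy boxes:

```python
# Two 2x2 boxes overlapping in a 1x1 square: IoU = 1 / (4 + 4 - 1) = 1/7.
import torch

boxes1 = torch.tensor([[0., 0., 2., 2.]])            # area 4
boxes2 = torch.tensor([[1., 1., 3., 3.]])            # area 4, overlap is 1x1

lt = torch.max(boxes1[:, :2], boxes2[:, :2])          # top-left of intersection
rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])          # bottom-right of intersection
wh = torch.clamp(rb - lt, min=0)
inter = wh[:, 0] * wh[:, 1]                           # tensor([1.])
union = 4.0 + 4.0 - inter
print(inter / union)                                  # tensor([0.1429]) ~ 1/7
```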
groundingdino/datasets/sltransform.py ADDED
@@ -0,0 +1,247 @@
1
+ # modified from https://github.com/anhtuan85/Data-Augmentation-for-Object-Detection/blob/master/augmentation.ipynb
2
+
3
+ import PIL #version 1.2.0
4
+ from PIL import Image #version 6.1.0
5
+ import torch
6
+ import os
7
+ import torchvision.transforms.functional as F
8
+ import numpy as np
9
+ import random
10
+
11
+ from .random_crop import random_crop
12
+ from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
13
+
14
+ class AdjustContrast:
15
+ def __init__(self, contrast_factor):
16
+ self.contrast_factor = contrast_factor
17
+
18
+ def __call__(self, img, target):
19
+ """
20
+ img (PIL Image or Tensor): Image to be adjusted.
21
+ """
22
+ _contrast_factor = ((random.random() + 1.0) / 2.0) * self.contrast_factor
23
+ img = F.adjust_contrast(img, _contrast_factor)
24
+ return img, target
25
+
26
+ class AdjustBrightness:
27
+ def __init__(self, brightness_factor):
28
+ self.brightness_factor = brightness_factor
29
+
30
+ def __call__(self, img, target):
31
+ """
32
+ img (PIL Image or Tensor): Image to be adjusted.
33
+ """
34
+ _brightness_factor = ((random.random() + 1.0) / 2.0) * self.brightness_factor
35
+ img = F.adjust_brightness(img, _brightness_factor)
36
+ return img, target
37
+
38
+ def lighting_noise(image):
39
+ '''
40
+ color channel swap in image
41
+ image: A PIL image
42
+ '''
43
+ new_image = image
44
+ perms = ((0, 1, 2), (0, 2, 1), (1, 0, 2),
45
+ (1, 2, 0), (2, 0, 1), (2, 1, 0))
46
+ swap = perms[random.randint(0, len(perms)- 1)]
47
+ new_image = F.to_tensor(new_image)
48
+ new_image = new_image[swap, :, :]
49
+ new_image = F.to_pil_image(new_image)
50
+ return new_image
51
+
52
+ class LightingNoise:
53
+ def __init__(self) -> None:
54
+ pass
55
+
56
+ def __call__(self, img, target):
57
+ return lighting_noise(img), target
58
+
59
+
60
+ def rotate(image, boxes, angle):
61
+ '''
62
+ Rotate image and bounding box
63
+ image: A Pil image (w, h)
64
+ boxes: A tensors of dimensions (#objects, 4)
65
+
66
+ Out: rotated image (w, h), rotated boxes
67
+ '''
68
+ new_image = image.copy()
69
+ new_boxes = boxes.clone()
70
+
71
+ #Rotate image, expand = True
72
+ w = image.width
73
+ h = image.height
74
+ cx = w/2
75
+ cy = h/2
76
+ new_image = new_image.rotate(angle, expand=True)
77
+ angle = np.radians(angle)
78
+ alpha = np.cos(angle)
79
+ beta = np.sin(angle)
80
+ #Get affine matrix
81
+ AffineMatrix = torch.tensor([[alpha, beta, (1-alpha)*cx - beta*cy],
82
+ [-beta, alpha, beta*cx + (1-alpha)*cy]])
83
+
84
+ #Rotation boxes
85
+ box_width = (boxes[:,2] - boxes[:,0]).reshape(-1,1)
86
+ box_height = (boxes[:,3] - boxes[:,1]).reshape(-1,1)
87
+
88
+ #Get corners for boxes
89
+ x1 = boxes[:,0].reshape(-1,1)
90
+ y1 = boxes[:,1].reshape(-1,1)
91
+
92
+ x2 = x1 + box_width
93
+ y2 = y1
94
+
95
+ x3 = x1
96
+ y3 = y1 + box_height
97
+
98
+ x4 = boxes[:,2].reshape(-1,1)
99
+ y4 = boxes[:,3].reshape(-1,1)
100
+
101
+ corners = torch.stack((x1,y1,x2,y2,x3,y3,x4,y4), dim= 1)
102
+ # corners.reshape(-1, 8) #Tensors of dimensions (#objects, 8)
103
+ corners = corners.reshape(-1,2) #Tensors of dimension (4* #objects, 2)
104
+ corners = torch.cat((corners, torch.ones(corners.shape[0], 1)), dim= 1) #(Tensors of dimension (4* #objects, 3))
105
+
106
+ cos = np.abs(AffineMatrix[0, 0])
107
+ sin = np.abs(AffineMatrix[0, 1])
108
+
109
+ nW = int((h * sin) + (w * cos))
110
+ nH = int((h * cos) + (w * sin))
111
+ AffineMatrix[0, 2] += (nW / 2) - cx
112
+ AffineMatrix[1, 2] += (nH / 2) - cy
113
+
114
+
115
+ #Apply affine transform
116
+ rotate_corners = torch.mm(AffineMatrix, corners.t().to(torch.float64)).t()
117
+ rotate_corners = rotate_corners.reshape(-1,8)
118
+
119
+ x_corners = rotate_corners[:,[0,2,4,6]]
120
+ y_corners = rotate_corners[:,[1,3,5,7]]
121
+
122
+ #Get (x_min, y_min, x_max, y_max)
123
+ x_min, _ = torch.min(x_corners, dim= 1)
124
+ x_min = x_min.reshape(-1, 1)
125
+ y_min, _ = torch.min(y_corners, dim= 1)
126
+ y_min = y_min.reshape(-1, 1)
127
+ x_max, _ = torch.max(x_corners, dim= 1)
128
+ x_max = x_max.reshape(-1, 1)
129
+ y_max, _ = torch.max(y_corners, dim= 1)
130
+ y_max = y_max.reshape(-1, 1)
131
+
132
+ new_boxes = torch.cat((x_min, y_min, x_max, y_max), dim= 1)
133
+
134
+ scale_x = new_image.width / w
135
+ scale_y = new_image.height / h
136
+
137
+ #Resize new image to (w, h)
138
+
139
+ new_image = new_image.resize((w, h))
140
+
141
+ #Resize boxes
142
+ new_boxes /= torch.Tensor([scale_x, scale_y, scale_x, scale_y])
143
+ new_boxes[:, 0] = torch.clamp(new_boxes[:, 0], 0, w)
144
+ new_boxes[:, 1] = torch.clamp(new_boxes[:, 1], 0, h)
145
+ new_boxes[:, 2] = torch.clamp(new_boxes[:, 2], 0, w)
146
+ new_boxes[:, 3] = torch.clamp(new_boxes[:, 3], 0, h)
147
+ return new_image, new_boxes
148
+
149
+ # def convert_xywh_to_xyxy(boxes: torch.Tensor):
150
+ # _boxes = boxes.clone()
151
+ # box_xy = _boxes[:, :2]
152
+ # box_wh = _boxes[:, 2:]
153
+ # box_x1y1 = box_xy - box_wh/2
154
+ # box_x2y2 = box_xy + box_wh/2
155
+ # box_xyxy = torch.cat((box_x1y1, box_x2y2), dim=-1)
156
+ # return box_xyxy
157
+
158
+ class Rotate:
159
+ def __init__(self, angle=10) -> None:
160
+ self.angle = angle
161
+
162
+ def __call__(self, img, target):
163
+ w,h = img.size
164
+ whwh = torch.Tensor([w, h, w, h])
165
+ boxes_xyxy = box_cxcywh_to_xyxy(target['boxes']) * whwh
166
+ img, boxes_new = rotate(img, boxes_xyxy, self.angle)
167
+ target['boxes'] = box_xyxy_to_cxcywh(boxes_new).to(boxes_xyxy.dtype) / (whwh + 1e-3)
168
+ return img, target
169
+
170
+
171
+ class RandomCrop:
172
+ def __init__(self) -> None:
173
+ pass
174
+
175
+ def __call__(self, img, target):
176
+ w,h = img.size
177
+ try:
178
+ boxes_xyxy = target['boxes']
179
+ labels = target['labels']
180
+ img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
181
+ target['boxes'] = new_boxes
182
+ target['labels'] = new_labels
183
+ except Exception as e:
184
+ pass
185
+ return img, target
186
+
187
+
188
+ class RandomCropDebug:
189
+ def __init__(self) -> None:
190
+ pass
191
+
192
+ def __call__(self, img, target):
193
+ boxes_xyxy = target['boxes'].clone()
194
+ labels = target['labels'].clone()
195
+ img, new_boxes, new_labels, _ = random_crop(img, boxes_xyxy, labels)
196
+ target['boxes'] = new_boxes
197
+ target['labels'] = new_labels
198
+
199
+
200
+ return img, target
201
+
202
+ class RandomSelectMulti(object):
203
+ """
204
+ Randomly selects one transform from transformslist.
205
+ """
206
+ def __init__(self, transformslist, p=-1):
207
+ self.transformslist = transformslist
208
+ self.p = p
209
+ assert p == -1
210
+
211
+ def __call__(self, img, target):
212
+ if self.p == -1:
213
+ return random.choice(self.transformslist)(img, target)
214
+
215
+
216
+ class Albumentations:
217
+ def __init__(self):
218
+ import albumentations as A
219
+ self.transform = A.Compose([
220
+ A.Blur(p=0.01),
221
+ A.MedianBlur(p=0.01),
222
+ A.ToGray(p=0.01),
223
+ A.CLAHE(p=0.01),
224
+ A.RandomBrightnessContrast(p=0.005),
225
+ A.RandomGamma(p=0.005),
226
+ A.ImageCompression(quality_lower=75, p=0.005)],
227
+ bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))
228
+
229
+ def __call__(self, img, target, p=1.0):
230
+ """
231
+ Input:
232
+ target['boxes']: xyxy, unnormalized data.
233
+
234
+ """
235
+ boxes_raw = target['boxes']
236
+ labels_raw = target['labels']
237
+ img_np = np.array(img)
238
+ if self.transform and random.random() < p:
239
+ new_res = self.transform(image=img_np, bboxes=boxes_raw, class_labels=labels_raw) # transformed
240
+ boxes_new = torch.Tensor(new_res['bboxes']).to(boxes_raw.dtype).reshape_as(boxes_raw)
241
+ img_np = new_res['image']
242
+ labels_new = torch.Tensor(new_res['class_labels']).to(labels_raw.dtype)
243
+ img_new = Image.fromarray(img_np)
244
+ target['boxes'] = boxes_new
245
+ target['labels'] = labels_new
246
+
247
+ return img_new, target
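Note: `lighting_noise()` above is just a permutation of the RGB planes of the image tensor. A deterministic standalone sketch (one fixed permutation instead of a random choice):

```python
# Reorder the channels of a purely red toy image so it comes out blue.
import torchvision.transforms.functional as F
from PIL import Image

img = Image.new("RGB", (4, 4), color=(255, 0, 0))   # red toy image
swap = (1, 2, 0)                                    # new (R, G, B) <- old (G, B, R)
t = F.to_tensor(img)[list(swap), :, :]              # channel reordering by indexing
out = F.to_pil_image(t)
print(out.getpixel((0, 0)))                         # (0, 0, 255): the image is now blue
```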
groundingdino/datasets/transforms.py CHANGED
@@ -2,7 +2,6 @@
2
  """
3
  Transforms and data augmentation for both image + bbox.
4
  """
5
- import os
6
  import random
7
 
8
  import PIL
@@ -10,8 +9,8 @@ import torch
10
  import torchvision.transforms as T
11
  import torchvision.transforms.functional as F
12
 
13
- from groundingdino.util.box_ops import box_xyxy_to_cxcywh
14
- from groundingdino.util.misc import interpolate
15
 
16
 
17
  def crop(image, target, region):
@@ -23,7 +22,7 @@ def crop(image, target, region):
23
  # should we do something wrt the original size?
24
  target["size"] = torch.tensor([h, w])
25
 
26
- fields = ["labels", "area", "iscrowd", "positive_map"]
27
 
28
  if "boxes" in target:
29
  boxes = target["boxes"]
@@ -38,29 +37,22 @@ def crop(image, target, region):
38
 
39
  if "masks" in target:
40
  # FIXME should we update the area here if there are no boxes?
41
- target["masks"] = target["masks"][:, i : i + h, j : j + w]
42
  fields.append("masks")
43
 
 
44
  # remove elements for which the boxes or masks that have zero area
45
  if "boxes" in target or "masks" in target:
46
  # favor boxes selection when defining which elements to keep
47
  # this is compatible with previous implementation
48
  if "boxes" in target:
49
- cropped_boxes = target["boxes"].reshape(-1, 2, 2)
50
  keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
51
  else:
52
- keep = target["masks"].flatten(1).any(1)
53
 
54
  for field in fields:
55
- if field in target:
56
- target[field] = target[field][keep]
57
-
58
- if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO":
59
- # for debug and visualization only.
60
- if "strings_positive" in target:
61
- target["strings_positive"] = [
62
- _i for _i, _j in zip(target["strings_positive"], keep) if _j
63
- ]
64
 
65
  return cropped_image, target
66
 
@@ -73,13 +65,11 @@ def hflip(image, target):
73
  target = target.copy()
74
  if "boxes" in target:
75
  boxes = target["boxes"]
76
- boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor(
77
- [w, 0, w, 0]
78
- )
79
  target["boxes"] = boxes
80
 
81
  if "masks" in target:
82
- target["masks"] = target["masks"].flip(-1)
83
 
84
  return flipped_image, target
85
 
@@ -125,9 +115,7 @@ def resize(image, target, size, max_size=None):
125
  target = target.copy()
126
  if "boxes" in target:
127
  boxes = target["boxes"]
128
- scaled_boxes = boxes * torch.as_tensor(
129
- [ratio_width, ratio_height, ratio_width, ratio_height]
130
- )
131
  target["boxes"] = scaled_boxes
132
 
133
  if "area" in target:
@@ -139,9 +127,8 @@ def resize(image, target, size, max_size=None):
139
  target["size"] = torch.tensor([h, w])
140
 
141
  if "masks" in target:
142
- target["masks"] = (
143
- interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5
144
- )
145
 
146
  return rescaled_image, target
147
 
@@ -155,7 +142,7 @@ def pad(image, target, padding):
155
  # should we do something wrt the original size?
156
  target["size"] = torch.tensor(padded_image.size[::-1])
157
  if "masks" in target:
158
- target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1]))
159
  return padded_image, target
160
 
161
 
@@ -177,28 +164,15 @@ class RandomCrop(object):
177
 
178
 
179
  class RandomSizeCrop(object):
180
- def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False):
181
- # respect_boxes: True to keep all boxes
182
- # False to tolerence box filter
183
  self.min_size = min_size
184
  self.max_size = max_size
185
- self.respect_boxes = respect_boxes
186
 
187
  def __call__(self, img: PIL.Image.Image, target: dict):
188
- init_boxes = len(target["boxes"])
189
- max_patience = 10
190
- for i in range(max_patience):
191
- w = random.randint(self.min_size, min(img.width, self.max_size))
192
- h = random.randint(self.min_size, min(img.height, self.max_size))
193
- region = T.RandomCrop.get_params(img, [h, w])
194
- result_img, result_target = crop(img, target, region)
195
- if (
196
- not self.respect_boxes
197
- or len(result_target["boxes"]) == init_boxes
198
- or i == max_patience - 1
199
- ):
200
- return result_img, result_target
201
- return result_img, result_target
202
 
203
 
204
  class CenterCrop(object):
@@ -208,8 +182,8 @@ class CenterCrop(object):
208
  def __call__(self, img, target):
209
  image_width, image_height = img.size
210
  crop_height, crop_width = self.size
211
- crop_top = int(round((image_height - crop_height) / 2.0))
212
- crop_left = int(round((image_width - crop_width) / 2.0))
213
  return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
214
 
215
 
@@ -249,7 +223,6 @@ class RandomSelect(object):
249
  Randomly selects between transforms1 and transforms2,
250
  with probability p for transforms1 and (1 - p) for transforms2
251
  """
252
-
253
  def __init__(self, transforms1, transforms2, p=0.5):
254
  self.transforms1 = transforms1
255
  self.transforms2 = transforms2
@@ -267,6 +240,7 @@ class ToTensor(object):
267
 
268
 
269
  class RandomErasing(object):
 
270
  def __init__(self, *args, **kwargs):
271
  self.eraser = T.RandomErasing(*args, **kwargs)
272
 
 
2
  """
3
  Transforms and data augmentation for both image + bbox.
4
  """
 
5
  import random
6
 
7
  import PIL
 
9
  import torchvision.transforms as T
10
  import torchvision.transforms.functional as F
11
 
12
+ from util.box_ops import box_xyxy_to_cxcywh
13
+ from util.misc import interpolate
14
 
15
 
16
  def crop(image, target, region):
 
22
  # should we do something wrt the original size?
23
  target["size"] = torch.tensor([h, w])
24
 
25
+ fields = ["labels", "area"]
26
 
27
  if "boxes" in target:
28
  boxes = target["boxes"]
 
37
 
38
  if "masks" in target:
39
  # FIXME should we update the area here if there are no boxes?
40
+ target['masks'] = target['masks'][:, i:i + h, j:j + w]
41
  fields.append("masks")
42
 
43
+
44
  # remove elements for which the boxes or masks that have zero area
45
  if "boxes" in target or "masks" in target:
46
  # favor boxes selection when defining which elements to keep
47
  # this is compatible with previous implementation
48
  if "boxes" in target:
49
+ cropped_boxes = target['boxes'].reshape(-1, 2, 2)
50
  keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
51
  else:
52
+ keep = target['masks'].flatten(1).any(1)
53
 
54
  for field in fields:
55
+ target[field] = target[field][keep]
 
 
 
 
 
 
 
 
56
 
57
  return cropped_image, target
58
 
 
65
  target = target.copy()
66
  if "boxes" in target:
67
  boxes = target["boxes"]
68
+ boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
 
 
69
  target["boxes"] = boxes
70
 
71
  if "masks" in target:
72
+ target['masks'] = target['masks'].flip(-1)
73
 
74
  return flipped_image, target
75
 
 
115
  target = target.copy()
116
  if "boxes" in target:
117
  boxes = target["boxes"]
118
+ scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
 
 
119
  target["boxes"] = scaled_boxes
120
 
121
  if "area" in target:
 
127
  target["size"] = torch.tensor([h, w])
128
 
129
  if "masks" in target:
130
+ target['masks'] = interpolate(
131
+ target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
 
132
 
133
  return rescaled_image, target
134
 
 
142
  # should we do something wrt the original size?
143
  target["size"] = torch.tensor(padded_image.size[::-1])
144
  if "masks" in target:
145
+ target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
146
  return padded_image, target
147
 
148
 
 
164
 
165
 
166
  class RandomSizeCrop(object):
167
+ def __init__(self, min_size: int, max_size: int):
 
 
168
  self.min_size = min_size
169
  self.max_size = max_size
 
170
 
171
  def __call__(self, img: PIL.Image.Image, target: dict):
172
+ w = random.randint(self.min_size, min(img.width, self.max_size))
173
+ h = random.randint(self.min_size, min(img.height, self.max_size))
174
+ region = T.RandomCrop.get_params(img, [h, w])
175
+ return crop(img, target, region)
 
 
 
 
 
 
 
 
 
 
176
 
177
 
178
  class CenterCrop(object):
 
182
  def __call__(self, img, target):
183
  image_width, image_height = img.size
184
  crop_height, crop_width = self.size
185
+ crop_top = int(round((image_height - crop_height) / 2.))
186
+ crop_left = int(round((image_width - crop_width) / 2.))
187
  return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
188
 
189
 
 
223
  Randomly selects between transforms1 and transforms2,
224
  with probability p for transforms1 and (1 - p) for transforms2
225
  """
 
226
  def __init__(self, transforms1, transforms2, p=0.5):
227
  self.transforms1 = transforms1
228
  self.transforms2 = transforms2
 
240
 
241
 
242
  class RandomErasing(object):
243
+
244
  def __init__(self, *args, **kwargs):
245
  self.eraser = T.RandomErasing(*args, **kwargs)
246
 
groundingdino/models/.ipynb_checkpoints/__init__-checkpoint.py DELETED
@@ -1,18 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
8
- from .GroundingDINO import build_groundingdino
9
-
10
-
11
- def build_model(args):
12
- # we use register to maintain models from catdet6 on.
13
- from .registry import MODULE_BUILD_FUNCS
14
-
15
- assert args.modelname in MODULE_BUILD_FUNCS._module_dict
16
- build_func = MODULE_BUILD_FUNCS.get(args.modelname)
17
- model = build_func(args)
18
- return model
groundingdino/models/.ipynb_checkpoints/registry-checkpoint.py DELETED
@@ -1,66 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # -*- coding: utf-8 -*-
8
- # @Author: Yihao Chen
9
- # @Date: 2021-08-16 16:03:17
10
- # @Last Modified by: Shilong Liu
11
- # @Last Modified time: 2022-01-23 15:26
12
- # modified from mmcv
13
-
14
- import inspect
15
- from functools import partial
16
-
17
-
18
- class Registry(object):
19
- def __init__(self, name):
20
- self._name = name
21
- self._module_dict = dict()
22
-
23
- def __repr__(self):
24
- format_str = self.__class__.__name__ + "(name={}, items={})".format(
25
- self._name, list(self._module_dict.keys())
26
- )
27
- return format_str
28
-
29
- def __len__(self):
30
- return len(self._module_dict)
31
-
32
- @property
33
- def name(self):
34
- return self._name
35
-
36
- @property
37
- def module_dict(self):
38
- return self._module_dict
39
-
40
- def get(self, key):
41
- return self._module_dict.get(key, None)
42
-
43
- def registe_with_name(self, module_name=None, force=False):
44
- return partial(self.register, module_name=module_name, force=force)
45
-
46
- def register(self, module_build_function, module_name=None, force=False):
47
- """Register a module build function.
48
- Args:
49
- module (:obj:`nn.Module`): Module to be registered.
50
- """
51
- if not inspect.isfunction(module_build_function):
52
- raise TypeError(
53
- "module_build_function must be a function, but got {}".format(
54
- type(module_build_function)
55
- )
56
- )
57
- if module_name is None:
58
- module_name = module_build_function.__name__
59
- if not force and module_name in self._module_dict:
60
- raise KeyError("{} is already registered in {}".format(module_name, self.name))
61
- self._module_dict[module_name] = module_build_function
62
-
63
- return module_build_function
64
-
65
-
66
- MODULE_BUILD_FUNCS = Registry("model build functions")
groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py ADDED
@@ -0,0 +1,273 @@
1
+ # ------------------------------------------------------------------------
2
+ # Grounding DINO
3
+ # url: https://github.com/IDEA-Research/GroundingDINO
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint as checkpoint
11
+ from torch import Tensor, nn
12
+ from torchvision.ops.boxes import nms
13
+ from transformers import BertConfig, BertModel, BertPreTrainedModel
14
+ from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
15
+
16
+
17
+ class BertModelWarper(nn.Module):
18
+ def __init__(self, bert_model):
19
+ super().__init__()
20
+ # self.bert = bert_modelc
21
+
22
+ self.config = bert_model.config
23
+ self.embeddings = bert_model.embeddings
24
+ self.encoder = bert_model.encoder
25
+ self.pooler = bert_model.pooler
26
+
27
+ self.get_extended_attention_mask = bert_model.get_extended_attention_mask
28
+ self.invert_attention_mask = bert_model.invert_attention_mask
29
+ self.get_head_mask = bert_model.get_head_mask
30
+
31
+ def forward(
32
+ self,
33
+ input_ids=None,
34
+ attention_mask=None,
35
+ token_type_ids=None,
36
+ position_ids=None,
37
+ head_mask=None,
38
+ inputs_embeds=None,
39
+ encoder_hidden_states=None,
40
+ encoder_attention_mask=None,
41
+ past_key_values=None,
42
+ use_cache=None,
43
+ output_attentions=None,
44
+ output_hidden_states=None,
45
+ return_dict=None,
46
+ ):
47
+ r"""
48
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
49
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
50
+ the model is configured as a decoder.
51
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
52
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
53
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
54
+
55
+ - 1 for tokens that are **not masked**,
56
+ - 0 for tokens that are **masked**.
57
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
58
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
59
+
60
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
61
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
62
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
63
+ use_cache (:obj:`bool`, `optional`):
64
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
65
+ decoding (see :obj:`past_key_values`).
66
+ """
67
+ output_attentions = (
68
+ output_attentions if output_attentions is not None else self.config.output_attentions
69
+ )
70
+ output_hidden_states = (
71
+ output_hidden_states
72
+ if output_hidden_states is not None
73
+ else self.config.output_hidden_states
74
+ )
75
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            batch_size, seq_length = input_shape
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = (
+            past_key_values[0][0].shape[2] if past_key_values is not None else 0
+        )
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                ((batch_size, seq_length + past_key_values_length)), device=device
+            )
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device
+        )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
+        #     import ipdb; ipdb.set_trace()
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+class TextEncoderShell(nn.Module):
+    def __init__(self, text_encoder):
+        super().__init__()
+        self.text_encoder = text_encoder
+        self.config = self.text_encoder.config
+
+    def forward(self, **kw):
+        # feed into text encoder
+        return self.text_encoder(**kw)
+
+
+def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer):
+    """Generate attention mask between each pair of special tokens
+    Args:
+        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
+        special_tokens_mask (list): special tokens mask.
+    Returns:
+        torch.Tensor: attention mask between each special tokens.
+    """
+    input_ids = tokenized["input_ids"]
+    bs, num_token = input_ids.shape
+    # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
+    special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
+    for special_token in special_tokens_list:
+        special_tokens_mask |= input_ids == special_token
+
+    # idxs: each row is a list of indices of special tokens
+    idxs = torch.nonzero(special_tokens_mask)
+
+    # generate attention mask and positional ids
+    attention_mask = (
+        torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
+    )
+    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
+    previous_col = 0
+    for i in range(idxs.shape[0]):
+        row, col = idxs[i]
+        if (col == 0) or (col == num_token - 1):
+            attention_mask[row, col, col] = True
+            position_ids[row, col] = 0
+        else:
+            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
+            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
+                0, col - previous_col, device=input_ids.device
+            )
+
+        previous_col = col
+
+    # # padding mask
+    # padding_mask = tokenized['attention_mask']
+    # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()
+
+    return attention_mask, position_ids.to(torch.long)
+
+
+def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer):
+    """Generate attention mask between each pair of special tokens
+    Args:
+        input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
+        special_tokens_mask (list): special tokens mask.
+    Returns:
+        torch.Tensor: attention mask between each special tokens.
+    """
+    input_ids = tokenized["input_ids"]
+    bs, num_token = input_ids.shape
+    # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
+    special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
+    for special_token in special_tokens_list:
+        special_tokens_mask |= input_ids == special_token
+
+    # idxs: each row is a list of indices of special tokens
+    idxs = torch.nonzero(special_tokens_mask)
+
+    # generate attention mask and positional ids
+    attention_mask = (
+        torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
+    )
+    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
+    cate_to_token_mask_list = [[] for _ in range(bs)]
+    previous_col = 0
+    for i in range(idxs.shape[0]):
+        row, col = idxs[i]
+        if (col == 0) or (col == num_token - 1):
+            attention_mask[row, col, col] = True
+            position_ids[row, col] = 0
+        else:
+            attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
+            position_ids[row, previous_col + 1 : col + 1] = torch.arange(
+                0, col - previous_col, device=input_ids.device
+            )
+            c2t_maski = torch.zeros((num_token), device=input_ids.device).bool()
+            c2t_maski[previous_col + 1 : col] = True
+            cate_to_token_mask_list[row].append(c2t_maski)
+        previous_col = col
+
+    cate_to_token_mask_list = [
+        torch.stack(cate_to_token_mask_listi, dim=0)
+        for cate_to_token_mask_listi in cate_to_token_mask_list
+    ]
+
+    # # padding mask
+    # padding_mask = tokenized['attention_mask']
+    # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()
+
+    return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list
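
For reference, the helper above is what keeps each text phrase in its own attention block on the language side. The usage sketch below is illustrative only and is not part of this commit; it assumes the standard HuggingFace BertTokenizer and the [CLS]/[SEP]/./? delimiter ids that the surrounding GroundingDINO code passes in as special_tokens_list.

# Illustrative sketch (not from this commit): driving the masking helper by hand.
from transformers import BertTokenizer
from groundingdino.models.GroundingDINO.bertwarper import (
    generate_masks_with_special_tokens_and_transfer_map,
)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
captions = ["a red car . a small dog ."]
tokenized = tokenizer(captions, padding="longest", return_tensors="pt")

# Grounding DINO treats [CLS], [SEP], '.' and '?' as phrase delimiters.
special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])

attn_mask, position_ids, cate_to_token = generate_masks_with_special_tokens_and_transfer_map(
    tokenized, special_tokens, tokenizer
)
# attn_mask is [bs, num_token, num_token] and block-diagonal: tokens of one phrase
# only attend to that phrase; position_ids restart at 0 after each delimiter; and
# cate_to_token[i] holds one boolean mask per phrase, mapping it to its word tokens.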
groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py CHANGED
@@ -20,9 +20,9 @@ class FeatureResizer(nn.Module):
     def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
         super().__init__()
         self.do_ln = do_ln
+        r = 12
         # Object feature encoding
-        r = 16
-        self.fc = lora.Linear(input_feat_size, output_feat_size,r=r , bias=True)
+        self.fc = lora.Linear(input_feat_size, output_feat_size,r=r, bias=True)
         self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
         self.dropout = nn.Dropout(dropout)
 
@@ -112,14 +112,14 @@ class BiMultiHeadAttention(nn.Module):
         ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
         self.scale = self.head_dim ** (-0.5)
         self.dropout = dropout
-        r = 16
+        r = 12
         self.v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
-        self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r)
-        self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
-        self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r)
+        self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
+        self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r )
+        self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
 
-        self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r)
-        self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r)
+        self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r )
+        self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r )
 
         self.stable_softmax_2d = True
         self.clamp_min_for_underflow = True
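
The diff above only lowers the LoRA rank of the fused projections from 16 to 12. As a hedged reference for the loralib pattern being used, the sketch below is not part of the commit; TinyResizer is a hypothetical stand-in for FeatureResizer, and it shows how lora.Linear layers are declared and how only the rank-r adapters are left trainable.

# Illustrative sketch only: the loralib pattern used above, with r=12.
import torch.nn as nn
import loralib as lora

class TinyResizer(nn.Module):  # hypothetical stand-in for FeatureResizer
    def __init__(self, in_dim=768, out_dim=256, r=12):
        super().__init__()
        # lora.Linear keeps the frozen dense weight and adds rank-r A/B adapters.
        self.fc = lora.Linear(in_dim, out_dim, r=r, bias=True)
        self.layer_norm = nn.LayerNorm(out_dim, eps=1e-12)

    def forward(self, x):
        return self.layer_norm(self.fc(x))

model = TinyResizer()
# Freeze everything except the LoRA A/B matrices (biases stay frozen with bias="none").
lora.mark_only_lora_as_trainable(model, bias="none")
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable params: {trainable}")  # only the rank-12 adapter weights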