Hasanmog commited on
Commit
23a2072
·
2 Parent(s): 616dc83 4234e89

Merge branch 'main' of https://huggingface.co/spaces/Hasanmog/Peft-GroundingDINO

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py +0 -23
  2. groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py +0 -649
  3. groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py +0 -44
  4. groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py +0 -258
  5. groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py +0 -285
  6. groundingdino/datasets/__init__.py +0 -23
  7. groundingdino/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  8. groundingdino/datasets/__pycache__/coco.cpython-310.pyc +0 -0
  9. groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc +0 -0
  10. groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc +0 -0
  11. groundingdino/datasets/__pycache__/data_util.cpython-310.pyc +0 -0
  12. groundingdino/datasets/__pycache__/odvg.cpython-310.pyc +0 -0
  13. groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc +0 -0
  14. groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc +0 -0
  15. groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc +0 -0
  16. groundingdino/datasets/__pycache__/transforms.cpython-310.pyc +0 -0
  17. groundingdino/datasets/coco.py +0 -649
  18. groundingdino/datasets/coco_eval.py +0 -266
  19. groundingdino/datasets/coco_panoptic.py +0 -99
  20. groundingdino/datasets/cocogrounding_eval.py +0 -271
  21. groundingdino/datasets/data_util.py +0 -170
  22. groundingdino/datasets/dataset.py +0 -44
  23. groundingdino/datasets/odvg.py +0 -258
  24. groundingdino/datasets/panoptic_eval.py +0 -44
  25. groundingdino/datasets/random_crop.py +0 -135
  26. groundingdino/datasets/sltransform.py +0 -247
  27. groundingdino/datasets/transforms.py +0 -285
  28. groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py +0 -273
  29. groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py +0 -298
  30. groundingdino/models/GroundingDINO/.ipynb_checkpoints/groundingdino-checkpoint.py +0 -857
  31. groundingdino/models/GroundingDINO/.ipynb_checkpoints/matcher-checkpoint.py +0 -218
  32. groundingdino/models/GroundingDINO/.ipynb_checkpoints/ms_deform_attn-checkpoint.py +0 -416
  33. groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer-checkpoint.py +0 -969
  34. groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer_vanilla-checkpoint.py +0 -125
  35. groundingdino/models/GroundingDINO/.ipynb_checkpoints/utils-checkpoint.py +0 -274
  36. groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc +0 -0
  37. groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc +0 -0
  38. groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc +0 -0
  39. groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc +0 -0
  40. groundingdino/models/GroundingDINO/__pycache__/matcher.cpython-310.pyc +0 -0
  41. groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc +0 -0
  42. groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc +0 -0
  43. groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc +0 -0
  44. groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc +0 -0
  45. groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/__init__-checkpoint.py +0 -1
  46. groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/backbone-checkpoint.py +0 -221
  47. groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/position_encoding-checkpoint.py +0 -186
  48. groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/swin_transformer-checkpoint.py +0 -804
  49. groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc +0 -0
  50. groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc +0 -0
groundingdino/datasets/.ipynb_checkpoints/__init__-checkpoint.py DELETED
@@ -1,23 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- import torch.utils.data
3
- import torchvision
4
- from .coco import build as build_coco
5
-
6
-
7
- def get_coco_api_from_dataset(dataset):
8
- for _ in range(10):
9
- # if isinstance(dataset, torchvision.datasets.CocoDetection):
10
- # break
11
- if isinstance(dataset, torch.utils.data.Subset):
12
- dataset = dataset.dataset
13
- if isinstance(dataset, torchvision.datasets.CocoDetection):
14
- return dataset.coco
15
-
16
-
17
- def build_dataset(image_set, args, datasetinfo):
18
- if datasetinfo["dataset_mode"] == 'coco':
19
- return build_coco(image_set, args, datasetinfo)
20
- if datasetinfo["dataset_mode"] == 'odvg':
21
- from .odvg import build_odvg
22
- return build_odvg(image_set, args, datasetinfo)
23
- raise ValueError(f'dataset {args.dataset_file} not supported')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/.ipynb_checkpoints/coco-checkpoint.py DELETED
@@ -1,649 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- """
3
- COCO dataset which returns image_id for evaluation.
4
-
5
- Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
6
- """
7
- if __name__=="__main__":
8
- # for debug only
9
- import os, sys
10
- sys.path.append(os.path.dirname(sys.path[0]))
11
- from torchvision.datasets.vision import VisionDataset
12
-
13
- import json
14
- from pathlib import Path
15
- import random
16
- import os
17
- from typing import Any, Callable, List, Optional, Tuple
18
-
19
- from PIL import Image
20
-
21
- import torch
22
- import torch.utils.data
23
- import torchvision
24
- from pycocotools import mask as coco_mask
25
-
26
- from datasets.data_util import preparing_dataset
27
- import datasets.transforms as T
28
- from util.box_ops import box_cxcywh_to_xyxy, box_iou
29
-
30
- __all__ = ['build']
31
-
32
-
33
- class label2compat():
34
- def __init__(self) -> None:
35
- self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
36
- self.category_map = {int(k):v for k,v in self.category_map_str.items()}
37
-
38
- def __call__(self, target, img=None):
39
- labels = target['labels']
40
- res = torch.zeros(labels.shape, dtype=labels.dtype)
41
- for idx, item in enumerate(labels):
42
- res[idx] = self.category_map[item.item()] - 1
43
- target['label_compat'] = res
44
- if img is not None:
45
- return target, img
46
- else:
47
- return target
48
-
49
-
50
- class label_compat2onehot():
51
- def __init__(self, num_class=80, num_output_objs=1):
52
- self.num_class = num_class
53
- self.num_output_objs = num_output_objs
54
- if num_output_objs != 1:
55
- raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")
56
-
57
- def __call__(self, target, img=None):
58
- labels = target['label_compat']
59
- place_dict = {k:0 for k in range(self.num_class)}
60
- if self.num_output_objs == 1:
61
- res = torch.zeros(self.num_class)
62
- for i in labels:
63
- itm = i.item()
64
- res[itm] = 1.0
65
- else:
66
- # compat with baseline
67
- res = torch.zeros(self.num_class, self.num_output_objs)
68
- for i in labels:
69
- itm = i.item()
70
- res[itm][place_dict[itm]] = 1.0
71
- place_dict[itm] += 1
72
- target['label_compat_onehot'] = res
73
- if img is not None:
74
- return target, img
75
- else:
76
- return target
77
-
78
-
79
- class box_label_catter():
80
- def __init__(self):
81
- pass
82
-
83
- def __call__(self, target, img=None):
84
- labels = target['label_compat']
85
- boxes = target['boxes']
86
- box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
87
- target['box_label'] = box_label
88
- if img is not None:
89
- return target, img
90
- else:
91
- return target
92
-
93
-
94
- class RandomSelectBoxlabels():
95
- def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
96
- prob_first_item = 0.0,
97
- prob_random_item = 0.0,
98
- prob_last_item = 0.8,
99
- prob_stop_sign = 0.2
100
- ) -> None:
101
- self.num_classes = num_classes
102
- self.leave_one_out = leave_one_out
103
- self.blank_prob = blank_prob
104
-
105
- self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)
106
-
107
-
108
- def get_state(self):
109
- return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]
110
-
111
- def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
112
- sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
113
- assert sum_prob - 1 < 1e-6, \
114
- f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
115
- + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
116
- + f"prob_stop_sign:{prob_stop_sign}"
117
-
118
- self.prob_first_item = prob_first_item
119
- self.prob_random_item = prob_random_item
120
- self.prob_last_item = prob_last_item
121
- self.prob_stop_sign = prob_stop_sign
122
-
123
-
124
- def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
125
- box_label_known = torch.Tensor(0,5)
126
- box_label_unknown = box_label
127
- return box_label_known, box_label_unknown
128
-
129
- def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
130
- n_select = int(random.random() * box_label.shape[0])
131
- box_label = box_label[torch.randperm(box_label.shape[0])]
132
- box_label_known = box_label[:n_select]
133
- box_label_unknown = box_label[n_select:]
134
- return box_label_known, box_label_unknown
135
-
136
- def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
137
- box_label_perm = box_label[torch.randperm(box_label.shape[0])]
138
- known_label_list = []
139
- box_label_known = []
140
- box_label_unknown = []
141
- for item in box_label_perm:
142
- label_i = item[4].item()
143
- if label_i in known_label_list:
144
- box_label_known.append(item)
145
- else:
146
- # first item
147
- box_label_unknown.append(item)
148
- known_label_list.append(label_i)
149
- box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
150
- box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
151
- return box_label_known, box_label_unknown
152
-
153
- def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
154
- box_label_unknown = torch.Tensor(0,5)
155
- box_label_known = box_label
156
- return box_label_known, box_label_unknown
157
-
158
- def __call__(self, target, img=None):
159
- box_label = target['box_label'] # K, 5
160
-
161
- dice_number = random.random()
162
-
163
- if dice_number < self.prob_first_item:
164
- box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
165
- elif dice_number < self.prob_first_item + self.prob_random_item:
166
- box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
167
- elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
168
- box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
169
- else:
170
- box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)
171
-
172
- target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
173
- target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
174
- target['box_label_known'] = box_label_known
175
- target['box_label_unknown'] = box_label_unknown
176
-
177
- return target, img
178
-
179
-
180
- class RandomDrop():
181
- def __init__(self, p=0.2) -> None:
182
- self.p = p
183
-
184
- def __call__(self, target, img=None):
185
- known_box = target['box_label_known']
186
- num_known_box = known_box.size(0)
187
- idxs = torch.rand(num_known_box)
188
- # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
189
- target['box_label_known'] = known_box[idxs > self.p]
190
- return target, img
191
-
192
-
193
- class BboxPertuber():
194
- def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
195
- self.max_ratio = max_ratio
196
- self.generate_samples = generate_samples
197
- self.samples = self.generate_pertube_samples()
198
- self.idx = 0
199
-
200
- def generate_pertube_samples(self):
201
- import torch
202
- samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
203
- return samples
204
-
205
- def __call__(self, target, img):
206
- known_box = target['box_label_known'] # Tensor(K,5), K known bbox
207
- K = known_box.shape[0]
208
- known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
209
- if K == 0:
210
- pass
211
- else:
212
- if self.idx + K > self.generate_samples:
213
- self.idx = 0
214
- delta = self.samples[self.idx: self.idx + K, :]
215
- known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
216
- iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
217
- known_box_pertube[:, 4].copy_(iou)
218
- known_box_pertube[:, -1].copy_(known_box[:, -1])
219
-
220
- target['box_label_known_pertube'] = known_box_pertube
221
- return target, img
222
-
223
-
224
- class RandomCutout():
225
- def __init__(self, factor=0.5) -> None:
226
- self.factor = factor
227
-
228
- def __call__(self, target, img=None):
229
- unknown_box = target['box_label_unknown'] # Ku, 5
230
- known_box = target['box_label_known_pertube'] # Kk, 6
231
- Ku = unknown_box.size(0)
232
-
233
- known_box_add = torch.zeros(Ku, 6) # Ku, 6
234
- known_box_add[:, :5] = unknown_box
235
- known_box_add[:, 5].uniform_(0.5, 1)
236
-
237
-
238
- known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
239
- known_box_add[:, 2:4] /= 2
240
-
241
- target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
242
- return target, img
243
-
244
-
245
- class RandomSelectBoxes():
246
- def __init__(self, num_class=80) -> None:
247
- Warning("This is such a slow function and will be deprecated soon!!!")
248
- self.num_class = num_class
249
-
250
- def __call__(self, target, img=None):
251
- boxes = target['boxes']
252
- labels = target['label_compat']
253
-
254
- # transform to list of tensors
255
- boxs_list = [[] for i in range(self.num_class)]
256
- for idx, item in enumerate(boxes):
257
- label = labels[idx].item()
258
- boxs_list[label].append(item)
259
- boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]
260
-
261
- # random selection
262
- box_known = []
263
- box_unknown = []
264
- for idx, item in enumerate(boxs_list_tensor):
265
- ncnt = item.shape[0]
266
- nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint
267
-
268
- item = item[torch.randperm(ncnt)]
269
- # random.shuffle(item)
270
- box_known.append(item[:nselect])
271
- box_unknown.append(item[nselect:])
272
-
273
- # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
274
- # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
275
- # print('box_unknown_tensor:', box_unknown_tensor)
276
- target['known_box'] = box_known
277
- target['unknown_box'] = box_unknown
278
- return target, img
279
-
280
-
281
- def label2onehot(label, num_classes):
282
- """
283
- label: Tensor(K)
284
- """
285
- res = torch.zeros(num_classes)
286
- for i in label:
287
- itm = int(i.item())
288
- res[itm] = 1.0
289
- return res
290
-
291
-
292
- class MaskCrop():
293
- def __init__(self) -> None:
294
- pass
295
-
296
- def __call__(self, target, img):
297
- known_box = target['known_box']
298
- h,w = img.shape[1:] # h,w
299
- # imgsize = target['orig_size'] # h,w
300
-
301
- scale = torch.Tensor([w, h, w, h])
302
-
303
- # _cnt = 0
304
- for boxes in known_box:
305
- if boxes.shape[0] == 0:
306
- continue
307
- box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
308
- for box in box_xyxy:
309
- x1, y1, x2, y2 = [int(i) for i in box.tolist()]
310
- img[:, y1:y2, x1:x2] = 0
311
- # _cnt += 1
312
- # print("_cnt:", _cnt)
313
- return target, img
314
-
315
-
316
- dataset_hook_register = {
317
- 'label2compat': label2compat,
318
- 'label_compat2onehot': label_compat2onehot,
319
- 'box_label_catter': box_label_catter,
320
- 'RandomSelectBoxlabels': RandomSelectBoxlabels,
321
- 'RandomSelectBoxes': RandomSelectBoxes,
322
- 'MaskCrop': MaskCrop,
323
- 'BboxPertuber': BboxPertuber,
324
- }
325
-
326
-
327
- class CocoDetection(torchvision.datasets.CocoDetection):
328
- def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
329
- super(CocoDetection, self).__init__(img_folder, ann_file)
330
- self._transforms = transforms
331
- self.prepare = ConvertCocoPolysToMask(return_masks)
332
- self.aux_target_hacks = aux_target_hacks
333
-
334
- def change_hack_attr(self, hackclassname, attrkv_dict):
335
- target_class = dataset_hook_register[hackclassname]
336
- for item in self.aux_target_hacks:
337
- if isinstance(item, target_class):
338
- for k,v in attrkv_dict.items():
339
- setattr(item, k, v)
340
-
341
- def get_hack(self, hackclassname):
342
- target_class = dataset_hook_register[hackclassname]
343
- for item in self.aux_target_hacks:
344
- if isinstance(item, target_class):
345
- return item
346
-
347
- def _load_image(self, id: int) -> Image.Image:
348
- path = self.coco.loadImgs(id)[0]["file_name"]
349
- abs_path = os.path.join(self.root, path)
350
- return Image.open(abs_path).convert("RGB")
351
-
352
- def __getitem__(self, idx):
353
- """
354
- Output:
355
- - target: dict of multiple items
356
- - boxes: Tensor[num_box, 4]. \
357
- Init type: x0,y0,x1,y1. unnormalized data.
358
- Final type: cx,cy,w,h. normalized data.
359
- """
360
- try:
361
- img, target = super(CocoDetection, self).__getitem__(idx)
362
- except:
363
- print("Error idx: {}".format(idx))
364
- idx += 1
365
- img, target = super(CocoDetection, self).__getitem__(idx)
366
- image_id = self.ids[idx]
367
- target = {'image_id': image_id, 'annotations': target}
368
- img, target = self.prepare(img, target)
369
-
370
- if self._transforms is not None:
371
- img, target = self._transforms(img, target)
372
-
373
- # convert to needed format
374
- if self.aux_target_hacks is not None:
375
- for hack_runner in self.aux_target_hacks:
376
- target, img = hack_runner(target, img=img)
377
-
378
- return img, target
379
-
380
-
381
- def convert_coco_poly_to_mask(segmentations, height, width):
382
- masks = []
383
- for polygons in segmentations:
384
- rles = coco_mask.frPyObjects(polygons, height, width)
385
- mask = coco_mask.decode(rles)
386
- if len(mask.shape) < 3:
387
- mask = mask[..., None]
388
- mask = torch.as_tensor(mask, dtype=torch.uint8)
389
- mask = mask.any(dim=2)
390
- masks.append(mask)
391
- if masks:
392
- masks = torch.stack(masks, dim=0)
393
- else:
394
- masks = torch.zeros((0, height, width), dtype=torch.uint8)
395
- return masks
396
-
397
-
398
- class ConvertCocoPolysToMask(object):
399
- def __init__(self, return_masks=False):
400
- self.return_masks = return_masks
401
-
402
- def __call__(self, image, target):
403
- w, h = image.size
404
-
405
- image_id = target["image_id"]
406
- image_id = torch.tensor([image_id])
407
-
408
- anno = target["annotations"]
409
-
410
- anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
411
-
412
- boxes = [obj["bbox"] for obj in anno]
413
- # guard against no boxes via resizing
414
- boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
415
- boxes[:, 2:] += boxes[:, :2]
416
- boxes[:, 0::2].clamp_(min=0, max=w)
417
- boxes[:, 1::2].clamp_(min=0, max=h)
418
-
419
- classes = [obj["category_id"] for obj in anno]
420
- classes = torch.tensor(classes, dtype=torch.int64)
421
-
422
- if self.return_masks:
423
- segmentations = [obj["segmentation"] for obj in anno]
424
- masks = convert_coco_poly_to_mask(segmentations, h, w)
425
-
426
- keypoints = None
427
- if anno and "keypoints" in anno[0]:
428
- keypoints = [obj["keypoints"] for obj in anno]
429
- keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
430
- num_keypoints = keypoints.shape[0]
431
- if num_keypoints:
432
- keypoints = keypoints.view(num_keypoints, -1, 3)
433
-
434
- keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
435
- boxes = boxes[keep]
436
- classes = classes[keep]
437
- if self.return_masks:
438
- masks = masks[keep]
439
- if keypoints is not None:
440
- keypoints = keypoints[keep]
441
-
442
- target = {}
443
- target["boxes"] = boxes
444
- target["labels"] = classes
445
- if self.return_masks:
446
- target["masks"] = masks
447
- target["image_id"] = image_id
448
- if keypoints is not None:
449
- target["keypoints"] = keypoints
450
-
451
- # for conversion to coco api
452
- area = torch.tensor([obj["area"] for obj in anno])
453
- iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
454
- target["area"] = area[keep]
455
- target["iscrowd"] = iscrowd[keep]
456
-
457
- target["orig_size"] = torch.as_tensor([int(h), int(w)])
458
- target["size"] = torch.as_tensor([int(h), int(w)])
459
-
460
- return image, target
461
-
462
-
463
- def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
464
-
465
- normalize = T.Compose([
466
- T.ToTensor(),
467
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
468
- ])
469
-
470
- # config the params for data aug
471
- scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
472
- max_size = 1333
473
- scales2_resize = [400, 500, 600]
474
- scales2_crop = [384, 600]
475
-
476
- # update args from config files
477
- scales = getattr(args, 'data_aug_scales', scales)
478
- max_size = getattr(args, 'data_aug_max_size', max_size)
479
- scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
480
- scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
481
-
482
- # resize them
483
- data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
484
- if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
485
- data_aug_scale_overlap = float(data_aug_scale_overlap)
486
- scales = [int(i*data_aug_scale_overlap) for i in scales]
487
- max_size = int(max_size*data_aug_scale_overlap)
488
- scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
489
- scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
490
-
491
- datadict_for_print = {
492
- 'scales': scales,
493
- 'max_size': max_size,
494
- 'scales2_resize': scales2_resize,
495
- 'scales2_crop': scales2_crop
496
- }
497
- # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
498
-
499
- if image_set == 'train':
500
- if fix_size:
501
- return T.Compose([
502
- T.RandomHorizontalFlip(),
503
- T.RandomResize([(max_size, max(scales))]),
504
- # T.RandomResize([(512, 512)]),
505
- normalize,
506
- ])
507
-
508
- if strong_aug:
509
- import datasets.sltransform as SLT
510
-
511
- return T.Compose([
512
- T.RandomHorizontalFlip(),
513
- T.RandomSelect(
514
- T.RandomResize(scales, max_size=max_size),
515
- T.Compose([
516
- T.RandomResize(scales2_resize),
517
- T.RandomSizeCrop(*scales2_crop),
518
- T.RandomResize(scales, max_size=max_size),
519
- ])
520
- ),
521
- SLT.RandomSelectMulti([
522
- SLT.RandomCrop(),
523
- SLT.LightingNoise(),
524
- SLT.AdjustBrightness(2),
525
- SLT.AdjustContrast(2),
526
- ]),
527
- normalize,
528
- ])
529
-
530
- return T.Compose([
531
- T.RandomHorizontalFlip(),
532
- T.RandomSelect(
533
- T.RandomResize(scales, max_size=max_size),
534
- T.Compose([
535
- T.RandomResize(scales2_resize),
536
- T.RandomSizeCrop(*scales2_crop),
537
- T.RandomResize(scales, max_size=max_size),
538
- ])
539
- ),
540
- normalize,
541
- ])
542
-
543
- if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
544
-
545
- if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
546
- print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
547
- return T.Compose([
548
- T.ResizeDebug((1280, 800)),
549
- normalize,
550
- ])
551
-
552
- return T.Compose([
553
- T.RandomResize([max(scales)], max_size=max_size),
554
- normalize,
555
- ])
556
-
557
-
558
-
559
- raise ValueError(f'unknown {image_set}')
560
-
561
-
562
- def get_aux_target_hacks_list(image_set, args):
563
- if args.modelname in ['q2bs_mask', 'q2bs']:
564
- aux_target_hacks_list = [
565
- label2compat(),
566
- label_compat2onehot(),
567
- RandomSelectBoxes(num_class=args.num_classes)
568
- ]
569
- if args.masked_data and image_set == 'train':
570
- # aux_target_hacks_list.append()
571
- aux_target_hacks_list.append(MaskCrop())
572
- elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
573
- aux_target_hacks_list = [
574
- label2compat(),
575
- label_compat2onehot(),
576
- box_label_catter(),
577
- RandomSelectBoxlabels(num_classes=args.num_classes,
578
- prob_first_item=args.prob_first_item,
579
- prob_random_item=args.prob_random_item,
580
- prob_last_item=args.prob_last_item,
581
- prob_stop_sign=args.prob_stop_sign,
582
- ),
583
- BboxPertuber(max_ratio=0.02, generate_samples=1000),
584
- ]
585
- elif args.modelname in ['q2omask', 'q2osa']:
586
- if args.coco_aug:
587
- aux_target_hacks_list = [
588
- label2compat(),
589
- label_compat2onehot(),
590
- box_label_catter(),
591
- RandomSelectBoxlabels(num_classes=args.num_classes,
592
- prob_first_item=args.prob_first_item,
593
- prob_random_item=args.prob_random_item,
594
- prob_last_item=args.prob_last_item,
595
- prob_stop_sign=args.prob_stop_sign,
596
- ),
597
- RandomDrop(p=0.2),
598
- BboxPertuber(max_ratio=0.02, generate_samples=1000),
599
- RandomCutout(factor=0.5)
600
- ]
601
- else:
602
- aux_target_hacks_list = [
603
- label2compat(),
604
- label_compat2onehot(),
605
- box_label_catter(),
606
- RandomSelectBoxlabels(num_classes=args.num_classes,
607
- prob_first_item=args.prob_first_item,
608
- prob_random_item=args.prob_random_item,
609
- prob_last_item=args.prob_last_item,
610
- prob_stop_sign=args.prob_stop_sign,
611
- ),
612
- BboxPertuber(max_ratio=0.02, generate_samples=1000),
613
- ]
614
- else:
615
- aux_target_hacks_list = None
616
-
617
- return aux_target_hacks_list
618
-
619
-
620
- def build(image_set, args, datasetinfo):
621
- img_folder = datasetinfo["root"]
622
- ann_file = datasetinfo["anno"]
623
-
624
- # copy to local path
625
- if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
626
- preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
627
-
628
- try:
629
- strong_aug = args.strong_aug
630
- except:
631
- strong_aug = False
632
- print(img_folder, ann_file)
633
- dataset = CocoDetection(img_folder, ann_file,
634
- transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
635
- return_masks=args.masks,
636
- aux_target_hacks=None,
637
- )
638
- return dataset
639
-
640
-
641
- if __name__ == "__main__":
642
- # Objects365 Val example
643
- dataset_o365 = CocoDetection(
644
- '/path/Objects365/train/',
645
- "/path/Objects365/slannos/anno_preprocess_train_v2.json",
646
- transforms=None,
647
- return_masks=False,
648
- )
649
- print('len(dataset_o365):', len(dataset_o365))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/.ipynb_checkpoints/dataset-checkpoint.py DELETED
@@ -1,44 +0,0 @@
1
- from __future__ import print_function
2
-
3
- import torch
4
- import torchvision.datasets as datasets
5
- from torch.utils.data import Dataset
6
- from PIL import Image
7
- from .tsv_io import TSVFile
8
- import numpy as np
9
- import base64
10
- import io
11
-
12
-
13
- class TSVDataset(Dataset):
14
- """ TSV dataset for ImageNet 1K training
15
- """
16
- def __init__(self, tsv_file, transform=None, target_transform=None):
17
- self.tsv = TSVFile(tsv_file)
18
- self.transform = transform
19
- self.target_transform = target_transform
20
-
21
- def __getitem__(self, index):
22
- """
23
- Args:
24
- index (int): Index
25
- Returns:
26
- tuple: (image, target) where target is class_index of the target class.
27
- """
28
- row = self.tsv.seek(index)
29
- image_data = base64.b64decode(row[-1])
30
- image = Image.open(io.BytesIO(image_data))
31
- image = image.convert('RGB')
32
- target = int(row[1])
33
-
34
- if self.transform is not None:
35
- img = self.transform(image)
36
- else:
37
- img = image
38
- if self.target_transform is not None:
39
- target = self.target_transform(target)
40
-
41
- return img, target
42
-
43
- def __len__(self):
44
- return self.tsv.num_rows()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/.ipynb_checkpoints/odvg-checkpoint.py DELETED
@@ -1,258 +0,0 @@
1
- from torchvision.datasets.vision import VisionDataset
2
- import os.path
3
- from typing import Callable, Optional
4
- import json
5
- from PIL import Image
6
- import torch
7
- import random
8
- import os, sys
9
- sys.path.append(os.path.dirname(sys.path[0]))
10
-
11
- import datasets.transforms as T
12
-
13
- class ODVGDataset(VisionDataset):
14
- """
15
- Args:
16
- root (string): Root directory where images are downloaded to.
17
- anno (string): Path to json annotation file.
18
- label_map_anno (string): Path to json label mapping file. Only for Object Detection
19
- transform (callable, optional): A function/transform that takes in an PIL image
20
- and returns a transformed version. E.g, ``transforms.PILToTensor``
21
- target_transform (callable, optional): A function/transform that takes in the
22
- target and transforms it.
23
- transforms (callable, optional): A function/transform that takes input sample and its target as entry
24
- and returns a transformed version.
25
- """
26
-
27
- def __init__(
28
- self,
29
- root: str,
30
- anno: str,
31
- label_map_anno: str = None,
32
- max_labels: int = 80,
33
- transform: Optional[Callable] = None,
34
- target_transform: Optional[Callable] = None,
35
- transforms: Optional[Callable] = None,
36
- ) -> None:
37
- super().__init__(root, transforms, transform, target_transform)
38
- self.root = root
39
- self.dataset_mode = "OD" if label_map_anno else "VG"
40
- self.max_labels = max_labels
41
- if self.dataset_mode == "OD":
42
- self.load_label_map(label_map_anno)
43
- self._load_metas(anno)
44
- self.get_dataset_info()
45
-
46
- def load_label_map(self, label_map_anno):
47
- with open(label_map_anno, 'r') as file:
48
- self.label_map = json.load(file)
49
- self.label_index = set(self.label_map.keys())
50
-
51
- def _load_metas(self, anno):
52
- with open(anno, 'r') as f:
53
- self.metas = json.load(f)
54
-
55
-
56
- def get_dataset_info(self):
57
- print(f" == total images: {len(self)}")
58
- if self.dataset_mode == "OD":
59
- print(f" == total labels: {len(self.label_map)}")
60
-
61
- def __getitem__(self, index: int):
62
- meta = self.metas[index]
63
- rel_path = meta["filename"]
64
- abs_path = os.path.join(self.root, rel_path)
65
- if not os.path.exists(abs_path):
66
- raise FileNotFoundError(f"{abs_path} not found.")
67
- image = Image.open(abs_path).convert('RGB')
68
- w, h = image.size
69
- if self.dataset_mode == "OD":
70
- anno = meta["detection"]
71
- instances = [obj for obj in anno["instances"]]
72
- boxes = [obj["bbox"] for obj in instances]
73
- # generate vg_labels
74
- # pos bbox labels
75
- ori_classes = [str(obj["label"]) for obj in instances]
76
- pos_labels = set(ori_classes)
77
- # neg bbox labels
78
- neg_labels = self.label_index.difference(pos_labels)
79
-
80
- vg_labels = list(pos_labels)
81
- num_to_add = min(len(neg_labels), self.max_labels-len(pos_labels))
82
- if num_to_add > 0:
83
- vg_labels.extend(random.sample(neg_labels, num_to_add))
84
-
85
- # shuffle
86
- for i in range(len(vg_labels)-1, 0, -1):
87
- j = random.randint(0, i)
88
- vg_labels[i], vg_labels[j] = vg_labels[j], vg_labels[i]
89
-
90
- caption_list = [self.label_map[lb] for lb in vg_labels]
91
- caption_dict = {item:index for index, item in enumerate(caption_list)}
92
-
93
- caption = ' . '.join(caption_list) + ' .'
94
- classes = [caption_dict[self.label_map[str(obj["label"])]] for obj in instances]
95
- boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
96
- classes = torch.tensor(classes, dtype=torch.int64)
97
- elif self.dataset_mode == "VG":
98
- anno = meta["Grounding"]
99
- instances = [obj for obj in anno["regions"]]
100
- boxes = [obj["bbox"] for obj in instances]
101
- caption_list = [obj["phrase"] for obj in instances]
102
- c = list(zip(boxes, caption_list))
103
- random.shuffle(c)
104
- boxes[:], caption_list[:] = zip(*c)
105
- uni_caption_list = list(set(caption_list))
106
- label_map = {}
107
- for idx in range(len(uni_caption_list)):
108
- label_map[uni_caption_list[idx]] = idx
109
- classes = [label_map[cap] for cap in caption_list]
110
- caption = ' . '.join(uni_caption_list) + ' .'
111
- boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
112
- classes = torch.tensor(classes, dtype=torch.int64)
113
- caption_list = uni_caption_list
114
- # print("caption_list" , caption_list)
115
- # print("caption" , caption)
116
- # print("boxes" , boxes)
117
- target = {}
118
- target["image_id"] = rel_path.strip(".jpg")
119
- target["size"] = torch.as_tensor([int(h), int(w)])
120
- target["cap_list"] = caption_list
121
- target["caption"] = caption
122
- target["boxes"] = boxes
123
- target["labels"] = classes
124
- # print(" image_id " , target["image_id"])
125
- # size, cap_list, caption, bboxes, labels
126
-
127
- if self.transforms is not None:
128
- image, target = self.transforms(image, target)
129
-
130
- return image, target
131
-
132
-
133
- def __len__(self) -> int:
134
- return len(self.metas)
135
-
136
-
137
- def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
138
-
139
- normalize = T.Compose([
140
- T.ToTensor(),
141
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
142
- ])
143
-
144
- # config the params for data aug
145
- scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
146
- max_size = 1333
147
- scales2_resize = [400, 500, 600]
148
- scales2_crop = [384, 600]
149
-
150
- # update args from config files
151
- scales = getattr(args, 'data_aug_scales', scales)
152
- max_size = getattr(args, 'data_aug_max_size', max_size)
153
- scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
154
- scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
155
-
156
- # resize them
157
- data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
158
- if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
159
- data_aug_scale_overlap = float(data_aug_scale_overlap)
160
- scales = [int(i*data_aug_scale_overlap) for i in scales]
161
- max_size = int(max_size*data_aug_scale_overlap)
162
- scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
163
- scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
164
-
165
- # datadict_for_print = {
166
- # 'scales': scales,
167
- # 'max_size': max_size,
168
- # 'scales2_resize': scales2_resize,
169
- # 'scales2_crop': scales2_crop
170
- # }
171
- # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
172
-
173
- if image_set == 'train':
174
- if fix_size:
175
- return T.Compose([
176
- T.RandomHorizontalFlip(),
177
- T.RandomResize([(max_size, max(scales))]),
178
- normalize,
179
- ])
180
-
181
- if strong_aug:
182
- import datasets.sltransform as SLT
183
-
184
- return T.Compose([
185
- T.RandomHorizontalFlip(),
186
- T.RandomSelect(
187
- T.RandomResize(scales, max_size=max_size),
188
- T.Compose([
189
- T.RandomResize(scales2_resize),
190
- T.RandomSizeCrop(*scales2_crop),
191
- T.RandomResize(scales, max_size=max_size),
192
- ])
193
- ),
194
- SLT.RandomSelectMulti([
195
- SLT.RandomCrop(),
196
- SLT.LightingNoise(),
197
- SLT.AdjustBrightness(2),
198
- SLT.AdjustContrast(2),
199
- ]),
200
- normalize,
201
- ])
202
-
203
- return T.Compose([
204
- T.RandomHorizontalFlip(),
205
- T.RandomSelect(
206
- T.RandomResize(scales, max_size=max_size),
207
- T.Compose([
208
- T.RandomResize(scales2_resize),
209
- T.RandomSizeCrop(*scales2_crop),
210
- T.RandomResize(scales, max_size=max_size),
211
- ])
212
- ),
213
- normalize,
214
- ])
215
-
216
- if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
217
-
218
- if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
219
- print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
220
- return T.Compose([
221
- T.ResizeDebug((1280, 800)),
222
- normalize,
223
- ])
224
-
225
- return T.Compose([
226
- T.RandomResize([max(scales)], max_size=max_size),
227
- normalize,
228
- ])
229
-
230
- raise ValueError(f'unknown {image_set}')
231
-
232
- def build_odvg(image_set, args, datasetinfo):
233
- img_folder = datasetinfo["root"]
234
- ann_file = datasetinfo["anno"]
235
- label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
236
- try:
237
- strong_aug = args.strong_aug
238
- except:
239
- strong_aug = False # False originally
240
- print(img_folder, ann_file, label_map)
241
- dataset = ODVGDataset(img_folder, ann_file, label_map, max_labels=args.max_labels,
242
- transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
243
- )
244
- return dataset
245
-
246
-
247
- if __name__=="__main__":
248
- dataset_vg = ODVGDataset("path/GRIT-20M/data/","path/GRIT-20M/anno/grit_odvg_10k.jsonl",)
249
- print(len(dataset_vg))
250
- data = dataset_vg[random.randint(0, 100)]
251
- print(data)
252
- dataset_od = ODVGDataset("pathl/V3Det/",
253
- "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
254
- "path/V3Det/annotations/v3det_label_map.json",
255
- )
256
- print(len(dataset_od))
257
- data = dataset_od[random.randint(0, 100)]
258
- print(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/.ipynb_checkpoints/transforms-checkpoint.py DELETED
@@ -1,285 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- """
3
- Transforms and data augmentation for both image + bbox.
4
- """
5
- import random
6
-
7
- import PIL
8
- import torch
9
- import torchvision.transforms as T
10
- import torchvision.transforms.functional as F
11
-
12
- from util.box_ops import box_xyxy_to_cxcywh
13
- from util.misc import interpolate
14
-
15
-
16
- def crop(image, target, region):
17
- cropped_image = F.crop(image, *region)
18
-
19
- target = target.copy()
20
- i, j, h, w = region
21
-
22
- # should we do something wrt the original size?
23
- target["size"] = torch.tensor([h, w])
24
-
25
- fields = ["labels", "area"]
26
-
27
- if "boxes" in target:
28
- boxes = target["boxes"]
29
- max_size = torch.as_tensor([w, h], dtype=torch.float32)
30
- cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
31
- cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
32
- cropped_boxes = cropped_boxes.clamp(min=0)
33
- area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
34
- target["boxes"] = cropped_boxes.reshape(-1, 4)
35
- target["area"] = area
36
- fields.append("boxes")
37
-
38
- if "masks" in target:
39
- # FIXME should we update the area here if there are no boxes?
40
- target['masks'] = target['masks'][:, i:i + h, j:j + w]
41
- fields.append("masks")
42
-
43
-
44
- # remove elements for which the boxes or masks that have zero area
45
- if "boxes" in target or "masks" in target:
46
- # favor boxes selection when defining which elements to keep
47
- # this is compatible with previous implementation
48
- if "boxes" in target:
49
- cropped_boxes = target['boxes'].reshape(-1, 2, 2)
50
- keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
51
- else:
52
- keep = target['masks'].flatten(1).any(1)
53
-
54
- for field in fields:
55
- target[field] = target[field][keep]
56
-
57
- return cropped_image, target
58
-
59
-
60
- def hflip(image, target):
61
- flipped_image = F.hflip(image)
62
-
63
- w, h = image.size
64
-
65
- target = target.copy()
66
- if "boxes" in target:
67
- boxes = target["boxes"]
68
- boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
69
- target["boxes"] = boxes
70
-
71
- if "masks" in target:
72
- target['masks'] = target['masks'].flip(-1)
73
-
74
- return flipped_image, target
75
-
76
-
77
- def resize(image, target, size, max_size=None):
78
- # size can be min_size (scalar) or (w, h) tuple
79
-
80
- def get_size_with_aspect_ratio(image_size, size, max_size=None):
81
- w, h = image_size
82
- if max_size is not None:
83
- min_original_size = float(min((w, h)))
84
- max_original_size = float(max((w, h)))
85
- if max_original_size / min_original_size * size > max_size:
86
- size = int(round(max_size * min_original_size / max_original_size))
87
-
88
- if (w <= h and w == size) or (h <= w and h == size):
89
- return (h, w)
90
-
91
- if w < h:
92
- ow = size
93
- oh = int(size * h / w)
94
- else:
95
- oh = size
96
- ow = int(size * w / h)
97
-
98
- return (oh, ow)
99
-
100
- def get_size(image_size, size, max_size=None):
101
- if isinstance(size, (list, tuple)):
102
- return size[::-1]
103
- else:
104
- return get_size_with_aspect_ratio(image_size, size, max_size)
105
-
106
- size = get_size(image.size, size, max_size)
107
- rescaled_image = F.resize(image, size)
108
-
109
- if target is None:
110
- return rescaled_image, None
111
-
112
- ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
113
- ratio_width, ratio_height = ratios
114
-
115
- target = target.copy()
116
- if "boxes" in target:
117
- boxes = target["boxes"]
118
- scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
119
- target["boxes"] = scaled_boxes
120
-
121
- if "area" in target:
122
- area = target["area"]
123
- scaled_area = area * (ratio_width * ratio_height)
124
- target["area"] = scaled_area
125
-
126
- h, w = size
127
- target["size"] = torch.tensor([h, w])
128
-
129
- if "masks" in target:
130
- target['masks'] = interpolate(
131
- target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
132
-
133
- return rescaled_image, target
134
-
135
-
136
- def pad(image, target, padding):
137
- # assumes that we only pad on the bottom right corners
138
- padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
139
- if target is None:
140
- return padded_image, None
141
- target = target.copy()
142
- # should we do something wrt the original size?
143
- target["size"] = torch.tensor(padded_image.size[::-1])
144
- if "masks" in target:
145
- target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
146
- return padded_image, target
147
-
148
-
149
- class ResizeDebug(object):
150
- def __init__(self, size):
151
- self.size = size
152
-
153
- def __call__(self, img, target):
154
- return resize(img, target, self.size)
155
-
156
-
157
- class RandomCrop(object):
158
- def __init__(self, size):
159
- self.size = size
160
-
161
- def __call__(self, img, target):
162
- region = T.RandomCrop.get_params(img, self.size)
163
- return crop(img, target, region)
164
-
165
-
166
- class RandomSizeCrop(object):
167
- def __init__(self, min_size: int, max_size: int):
168
- self.min_size = min_size
169
- self.max_size = max_size
170
-
171
- def __call__(self, img: PIL.Image.Image, target: dict):
172
- w = random.randint(self.min_size, min(img.width, self.max_size))
173
- h = random.randint(self.min_size, min(img.height, self.max_size))
174
- region = T.RandomCrop.get_params(img, [h, w])
175
- return crop(img, target, region)
176
-
177
-
178
- class CenterCrop(object):
179
- def __init__(self, size):
180
- self.size = size
181
-
182
- def __call__(self, img, target):
183
- image_width, image_height = img.size
184
- crop_height, crop_width = self.size
185
- crop_top = int(round((image_height - crop_height) / 2.))
186
- crop_left = int(round((image_width - crop_width) / 2.))
187
- return crop(img, target, (crop_top, crop_left, crop_height, crop_width))
188
-
189
-
190
- class RandomHorizontalFlip(object):
191
- def __init__(self, p=0.5):
192
- self.p = p
193
-
194
- def __call__(self, img, target):
195
- if random.random() < self.p:
196
- return hflip(img, target)
197
- return img, target
198
-
199
-
200
- class RandomResize(object):
201
- def __init__(self, sizes, max_size=None):
202
- assert isinstance(sizes, (list, tuple))
203
- self.sizes = sizes
204
- self.max_size = max_size
205
-
206
- def __call__(self, img, target=None):
207
- size = random.choice(self.sizes)
208
- return resize(img, target, size, self.max_size)
209
-
210
-
211
- class RandomPad(object):
212
- def __init__(self, max_pad):
213
- self.max_pad = max_pad
214
-
215
- def __call__(self, img, target):
216
- pad_x = random.randint(0, self.max_pad)
217
- pad_y = random.randint(0, self.max_pad)
218
- return pad(img, target, (pad_x, pad_y))
219
-
220
-
221
- class RandomSelect(object):
222
- """
223
- Randomly selects between transforms1 and transforms2,
224
- with probability p for transforms1 and (1 - p) for transforms2
225
- """
226
- def __init__(self, transforms1, transforms2, p=0.5):
227
- self.transforms1 = transforms1
228
- self.transforms2 = transforms2
229
- self.p = p
230
-
231
- def __call__(self, img, target):
232
- if random.random() < self.p:
233
- return self.transforms1(img, target)
234
- return self.transforms2(img, target)
235
-
236
-
237
- class ToTensor(object):
238
- def __call__(self, img, target):
239
- return F.to_tensor(img), target
240
-
241
-
242
- class RandomErasing(object):
243
-
244
- def __init__(self, *args, **kwargs):
245
- self.eraser = T.RandomErasing(*args, **kwargs)
246
-
247
- def __call__(self, img, target):
248
- return self.eraser(img), target
249
-
250
-
251
- class Normalize(object):
252
- def __init__(self, mean, std):
253
- self.mean = mean
254
- self.std = std
255
-
256
- def __call__(self, image, target=None):
257
- image = F.normalize(image, mean=self.mean, std=self.std)
258
- if target is None:
259
- return image, None
260
- target = target.copy()
261
- h, w = image.shape[-2:]
262
- if "boxes" in target:
263
- boxes = target["boxes"]
264
- boxes = box_xyxy_to_cxcywh(boxes)
265
- boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
266
- target["boxes"] = boxes
267
- return image, target
268
-
269
-
270
- class Compose(object):
271
- def __init__(self, transforms):
272
- self.transforms = transforms
273
-
274
- def __call__(self, image, target):
275
- for t in self.transforms:
276
- image, target = t(image, target)
277
- return image, target
278
-
279
- def __repr__(self):
280
- format_string = self.__class__.__name__ + "("
281
- for t in self.transforms:
282
- format_string += "\n"
283
- format_string += " {0}".format(t)
284
- format_string += "\n)"
285
- return format_string
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/__init__.py DELETED
@@ -1,23 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- import torch.utils.data
3
- import torchvision
4
- from .coco import build as build_coco
5
-
6
-
7
- def get_coco_api_from_dataset(dataset):
8
- for _ in range(10):
9
- # if isinstance(dataset, torchvision.datasets.CocoDetection):
10
- # break
11
- if isinstance(dataset, torch.utils.data.Subset):
12
- dataset = dataset.dataset
13
- if isinstance(dataset, torchvision.datasets.CocoDetection):
14
- return dataset.coco
15
-
16
-
17
- def build_dataset(image_set, args, datasetinfo):
18
- if datasetinfo["dataset_mode"] == 'coco':
19
- return build_coco(image_set, args, datasetinfo)
20
- if datasetinfo["dataset_mode"] == 'odvg':
21
- from .odvg import build_odvg
22
- return build_odvg(image_set, args, datasetinfo)
23
- raise ValueError(f'dataset {args.dataset_file} not supported')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (899 Bytes)
 
groundingdino/datasets/__pycache__/coco.cpython-310.pyc DELETED
Binary file (20.2 kB)
 
groundingdino/datasets/__pycache__/coco_eval.cpython-310.pyc DELETED
Binary file (7.42 kB)
 
groundingdino/datasets/__pycache__/cocogrounding_eval.cpython-310.pyc DELETED
Binary file (7.44 kB)
 
groundingdino/datasets/__pycache__/data_util.cpython-310.pyc DELETED
Binary file (4.55 kB)
 
groundingdino/datasets/__pycache__/odvg.cpython-310.pyc DELETED
Binary file (8.21 kB)
 
groundingdino/datasets/__pycache__/panoptic_eval.cpython-310.pyc DELETED
Binary file (1.87 kB)
 
groundingdino/datasets/__pycache__/random_crop.cpython-310.pyc DELETED
Binary file (3.69 kB)
 
groundingdino/datasets/__pycache__/sltransform.cpython-310.pyc DELETED
Binary file (7.68 kB)
 
groundingdino/datasets/__pycache__/transforms.cpython-310.pyc DELETED
Binary file (9.53 kB)
 
groundingdino/datasets/coco.py DELETED
@@ -1,649 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- """
3
- COCO dataset which returns image_id for evaluation.
4
-
5
- Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
6
- """
7
- if __name__=="__main__":
8
- # for debug only
9
- import os, sys
10
- sys.path.append(os.path.dirname(sys.path[0]))
11
- from torchvision.datasets.vision import VisionDataset
12
-
13
- import json
14
- from pathlib import Path
15
- import random
16
- import os
17
- from typing import Any, Callable, List, Optional, Tuple
18
-
19
- from PIL import Image
20
-
21
- import torch
22
- import torch.utils.data
23
- import torchvision
24
- from pycocotools import mask as coco_mask
25
-
26
- from datasets.data_util import preparing_dataset
27
- import datasets.transforms as T
28
- from util.box_ops import box_cxcywh_to_xyxy, box_iou
29
-
30
- __all__ = ['build']
31
-
32
-
33
- class label2compat():
34
- def __init__(self) -> None:
35
- self.category_map_str = {"1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10, "11": 11, "13": 12, "14": 13, "15": 14, "16": 15, "17": 16, "18": 17, "19": 18, "20": 19, "21": 20, "22": 21, "23": 22, "24": 23, "25": 24, "27": 25, "28": 26, "31": 27, "32": 28, "33": 29, "34": 30, "35": 31, "36": 32, "37": 33, "38": 34, "39": 35, "40": 36, "41": 37, "42": 38, "43": 39, "44": 40, "46": 41, "47": 42, "48": 43, "49": 44, "50": 45, "51": 46, "52": 47, "53": 48, "54": 49, "55": 50, "56": 51, "57": 52, "58": 53, "59": 54, "60": 55, "61": 56, "62": 57, "63": 58, "64": 59, "65": 60, "67": 61, "70": 62, "72": 63, "73": 64, "74": 65, "75": 66, "76": 67, "77": 68, "78": 69, "79": 70, "80": 71, "81": 72, "82": 73, "84": 74, "85": 75, "86": 76, "87": 77, "88": 78, "89": 79, "90": 80}
36
- self.category_map = {int(k):v for k,v in self.category_map_str.items()}
37
-
38
- def __call__(self, target, img=None):
39
- labels = target['labels']
40
- res = torch.zeros(labels.shape, dtype=labels.dtype)
41
- for idx, item in enumerate(labels):
42
- res[idx] = self.category_map[item.item()] - 1
43
- target['label_compat'] = res
44
- if img is not None:
45
- return target, img
46
- else:
47
- return target
48
-
49
-
50
- class label_compat2onehot():
51
- def __init__(self, num_class=80, num_output_objs=1):
52
- self.num_class = num_class
53
- self.num_output_objs = num_output_objs
54
- if num_output_objs != 1:
55
- raise DeprecationWarning("num_output_objs!=1, which is only used for comparison")
56
-
57
- def __call__(self, target, img=None):
58
- labels = target['label_compat']
59
- place_dict = {k:0 for k in range(self.num_class)}
60
- if self.num_output_objs == 1:
61
- res = torch.zeros(self.num_class)
62
- for i in labels:
63
- itm = i.item()
64
- res[itm] = 1.0
65
- else:
66
- # compat with baseline
67
- res = torch.zeros(self.num_class, self.num_output_objs)
68
- for i in labels:
69
- itm = i.item()
70
- res[itm][place_dict[itm]] = 1.0
71
- place_dict[itm] += 1
72
- target['label_compat_onehot'] = res
73
- if img is not None:
74
- return target, img
75
- else:
76
- return target
77
-
78
-
79
- class box_label_catter():
80
- def __init__(self):
81
- pass
82
-
83
- def __call__(self, target, img=None):
84
- labels = target['label_compat']
85
- boxes = target['boxes']
86
- box_label = torch.cat((boxes, labels.unsqueeze(-1)), 1)
87
- target['box_label'] = box_label
88
- if img is not None:
89
- return target, img
90
- else:
91
- return target
92
-
93
-
94
- class RandomSelectBoxlabels():
95
- def __init__(self, num_classes, leave_one_out=False, blank_prob=0.8,
96
- prob_first_item = 0.0,
97
- prob_random_item = 0.0,
98
- prob_last_item = 0.8,
99
- prob_stop_sign = 0.2
100
- ) -> None:
101
- self.num_classes = num_classes
102
- self.leave_one_out = leave_one_out
103
- self.blank_prob = blank_prob
104
-
105
- self.set_state(prob_first_item, prob_random_item, prob_last_item, prob_stop_sign)
106
-
107
-
108
- def get_state(self):
109
- return [self.prob_first_item, self.prob_random_item, self.prob_last_item, self.prob_stop_sign]
110
-
111
- def set_state(self, prob_first_item, prob_random_item, prob_last_item, prob_stop_sign):
112
- sum_prob = prob_first_item + prob_random_item + prob_last_item + prob_stop_sign
113
- assert sum_prob - 1 < 1e-6, \
114
- f"Sum up all prob = {sum_prob}. prob_first_item:{prob_first_item}" \
115
- + f"prob_random_item:{prob_random_item}, prob_last_item:{prob_last_item}" \
116
- + f"prob_stop_sign:{prob_stop_sign}"
117
-
118
- self.prob_first_item = prob_first_item
119
- self.prob_random_item = prob_random_item
120
- self.prob_last_item = prob_last_item
121
- self.prob_stop_sign = prob_stop_sign
122
-
123
-
124
- def sample_for_pred_first_item(self, box_label: torch.FloatTensor):
125
- box_label_known = torch.Tensor(0,5)
126
- box_label_unknown = box_label
127
- return box_label_known, box_label_unknown
128
-
129
- def sample_for_pred_random_item(self, box_label: torch.FloatTensor):
130
- n_select = int(random.random() * box_label.shape[0])
131
- box_label = box_label[torch.randperm(box_label.shape[0])]
132
- box_label_known = box_label[:n_select]
133
- box_label_unknown = box_label[n_select:]
134
- return box_label_known, box_label_unknown
135
-
136
- def sample_for_pred_last_item(self, box_label: torch.FloatTensor):
137
- box_label_perm = box_label[torch.randperm(box_label.shape[0])]
138
- known_label_list = []
139
- box_label_known = []
140
- box_label_unknown = []
141
- for item in box_label_perm:
142
- label_i = item[4].item()
143
- if label_i in known_label_list:
144
- box_label_known.append(item)
145
- else:
146
- # first item
147
- box_label_unknown.append(item)
148
- known_label_list.append(label_i)
149
- box_label_known = torch.stack(box_label_known) if len(box_label_known) > 0 else torch.Tensor(0,5)
150
- box_label_unknown = torch.stack(box_label_unknown) if len(box_label_unknown) > 0 else torch.Tensor(0,5)
151
- return box_label_known, box_label_unknown
152
-
153
- def sample_for_pred_stop_sign(self, box_label: torch.FloatTensor):
154
- box_label_unknown = torch.Tensor(0,5)
155
- box_label_known = box_label
156
- return box_label_known, box_label_unknown
157
-
158
- def __call__(self, target, img=None):
159
- box_label = target['box_label'] # K, 5
160
-
161
- dice_number = random.random()
162
-
163
- if dice_number < self.prob_first_item:
164
- box_label_known, box_label_unknown = self.sample_for_pred_first_item(box_label)
165
- elif dice_number < self.prob_first_item + self.prob_random_item:
166
- box_label_known, box_label_unknown = self.sample_for_pred_random_item(box_label)
167
- elif dice_number < self.prob_first_item + self.prob_random_item + self.prob_last_item:
168
- box_label_known, box_label_unknown = self.sample_for_pred_last_item(box_label)
169
- else:
170
- box_label_known, box_label_unknown = self.sample_for_pred_stop_sign(box_label)
171
-
172
- target['label_onehot_known'] = label2onehot(box_label_known[:,-1], self.num_classes)
173
- target['label_onehot_unknown'] = label2onehot(box_label_unknown[:, -1], self.num_classes)
174
- target['box_label_known'] = box_label_known
175
- target['box_label_unknown'] = box_label_unknown
176
-
177
- return target, img
178
-
179
-
180
- class RandomDrop():
181
- def __init__(self, p=0.2) -> None:
182
- self.p = p
183
-
184
- def __call__(self, target, img=None):
185
- known_box = target['box_label_known']
186
- num_known_box = known_box.size(0)
187
- idxs = torch.rand(num_known_box)
188
- # indices = torch.randperm(num_known_box)[:int((1-self).p*num_known_box + 0.5 + random.random())]
189
- target['box_label_known'] = known_box[idxs > self.p]
190
- return target, img
191
-
192
-
193
- class BboxPertuber():
194
- def __init__(self, max_ratio = 0.02, generate_samples = 1000) -> None:
195
- self.max_ratio = max_ratio
196
- self.generate_samples = generate_samples
197
- self.samples = self.generate_pertube_samples()
198
- self.idx = 0
199
-
200
- def generate_pertube_samples(self):
201
- import torch
202
- samples = (torch.rand(self.generate_samples, 5) - 0.5) * 2 * self.max_ratio
203
- return samples
204
-
205
- def __call__(self, target, img):
206
- known_box = target['box_label_known'] # Tensor(K,5), K known bbox
207
- K = known_box.shape[0]
208
- known_box_pertube = torch.zeros(K, 6) # 4:bbox, 1:prob, 1:label
209
- if K == 0:
210
- pass
211
- else:
212
- if self.idx + K > self.generate_samples:
213
- self.idx = 0
214
- delta = self.samples[self.idx: self.idx + K, :]
215
- known_box_pertube[:, :4] = known_box[:, :4] + delta[:, :4]
216
- iou = (torch.diag(box_iou(box_cxcywh_to_xyxy(known_box[:, :4]), box_cxcywh_to_xyxy(known_box_pertube[:, :4]))[0])) * (1 + delta[:, -1])
217
- known_box_pertube[:, 4].copy_(iou)
218
- known_box_pertube[:, -1].copy_(known_box[:, -1])
219
-
220
- target['box_label_known_pertube'] = known_box_pertube
221
- return target, img
222
-
223
-
224
- class RandomCutout():
225
- def __init__(self, factor=0.5) -> None:
226
- self.factor = factor
227
-
228
- def __call__(self, target, img=None):
229
- unknown_box = target['box_label_unknown'] # Ku, 5
230
- known_box = target['box_label_known_pertube'] # Kk, 6
231
- Ku = unknown_box.size(0)
232
-
233
- known_box_add = torch.zeros(Ku, 6) # Ku, 6
234
- known_box_add[:, :5] = unknown_box
235
- known_box_add[:, 5].uniform_(0.5, 1)
236
-
237
-
238
- known_box_add[:, :2] += known_box_add[:, 2:4] * (torch.rand(Ku, 2) - 0.5) / 2
239
- known_box_add[:, 2:4] /= 2
240
-
241
- target['box_label_known_pertube'] = torch.cat((known_box, known_box_add))
242
- return target, img
243
-
244
-
245
- class RandomSelectBoxes():
246
- def __init__(self, num_class=80) -> None:
247
- Warning("This is such a slow function and will be deprecated soon!!!")
248
- self.num_class = num_class
249
-
250
- def __call__(self, target, img=None):
251
- boxes = target['boxes']
252
- labels = target['label_compat']
253
-
254
- # transform to list of tensors
255
- boxs_list = [[] for i in range(self.num_class)]
256
- for idx, item in enumerate(boxes):
257
- label = labels[idx].item()
258
- boxs_list[label].append(item)
259
- boxs_list_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in boxs_list]
260
-
261
- # random selection
262
- box_known = []
263
- box_unknown = []
264
- for idx, item in enumerate(boxs_list_tensor):
265
- ncnt = item.shape[0]
266
- nselect = int(random.random() * ncnt) # close in both sides, much faster than random.randint
267
-
268
- item = item[torch.randperm(ncnt)]
269
- # random.shuffle(item)
270
- box_known.append(item[:nselect])
271
- box_unknown.append(item[nselect:])
272
-
273
- # box_known_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_known]
274
- # box_unknown_tensor = [torch.stack(i) if len(i) > 0 else torch.Tensor(0,4) for i in box_unknown]
275
- # print('box_unknown_tensor:', box_unknown_tensor)
276
- target['known_box'] = box_known
277
- target['unknown_box'] = box_unknown
278
- return target, img
279
-
280
-
281
- def label2onehot(label, num_classes):
282
- """
283
- label: Tensor(K)
284
- """
285
- res = torch.zeros(num_classes)
286
- for i in label:
287
- itm = int(i.item())
288
- res[itm] = 1.0
289
- return res
290
-
291
-
292
- class MaskCrop():
293
- def __init__(self) -> None:
294
- pass
295
-
296
- def __call__(self, target, img):
297
- known_box = target['known_box']
298
- h,w = img.shape[1:] # h,w
299
- # imgsize = target['orig_size'] # h,w
300
-
301
- scale = torch.Tensor([w, h, w, h])
302
-
303
- # _cnt = 0
304
- for boxes in known_box:
305
- if boxes.shape[0] == 0:
306
- continue
307
- box_xyxy = box_cxcywh_to_xyxy(boxes) * scale
308
- for box in box_xyxy:
309
- x1, y1, x2, y2 = [int(i) for i in box.tolist()]
310
- img[:, y1:y2, x1:x2] = 0
311
- # _cnt += 1
312
- # print("_cnt:", _cnt)
313
- return target, img
314
-
315
-
316
- dataset_hook_register = {
317
- 'label2compat': label2compat,
318
- 'label_compat2onehot': label_compat2onehot,
319
- 'box_label_catter': box_label_catter,
320
- 'RandomSelectBoxlabels': RandomSelectBoxlabels,
321
- 'RandomSelectBoxes': RandomSelectBoxes,
322
- 'MaskCrop': MaskCrop,
323
- 'BboxPertuber': BboxPertuber,
324
- }
325
-
326
-
327
- class CocoDetection(torchvision.datasets.CocoDetection):
328
- def __init__(self, img_folder, ann_file, transforms, return_masks, aux_target_hacks=None):
329
- super(CocoDetection, self).__init__(img_folder, ann_file)
330
- self._transforms = transforms
331
- self.prepare = ConvertCocoPolysToMask(return_masks)
332
- self.aux_target_hacks = aux_target_hacks
333
-
334
- def change_hack_attr(self, hackclassname, attrkv_dict):
335
- target_class = dataset_hook_register[hackclassname]
336
- for item in self.aux_target_hacks:
337
- if isinstance(item, target_class):
338
- for k,v in attrkv_dict.items():
339
- setattr(item, k, v)
340
-
341
- def get_hack(self, hackclassname):
342
- target_class = dataset_hook_register[hackclassname]
343
- for item in self.aux_target_hacks:
344
- if isinstance(item, target_class):
345
- return item
346
-
347
- def _load_image(self, id: int) -> Image.Image:
348
- path = self.coco.loadImgs(id)[0]["file_name"]
349
- abs_path = os.path.join(self.root, path)
350
- return Image.open(abs_path).convert("RGB")
351
-
352
- def __getitem__(self, idx):
353
- """
354
- Output:
355
- - target: dict of multiple items
356
- - boxes: Tensor[num_box, 4]. \
357
- Init type: x0,y0,x1,y1. unnormalized data.
358
- Final type: cx,cy,w,h. normalized data.
359
- """
360
- try:
361
- img, target = super(CocoDetection, self).__getitem__(idx)
362
- except:
363
- print("Error idx: {}".format(idx))
364
- idx += 1
365
- img, target = super(CocoDetection, self).__getitem__(idx)
366
- image_id = self.ids[idx]
367
- target = {'image_id': image_id, 'annotations': target}
368
- img, target = self.prepare(img, target)
369
-
370
- if self._transforms is not None:
371
- img, target = self._transforms(img, target)
372
-
373
- # convert to needed format
374
- if self.aux_target_hacks is not None:
375
- for hack_runner in self.aux_target_hacks:
376
- target, img = hack_runner(target, img=img)
377
-
378
- return img, target
379
-
380
-
381
- def convert_coco_poly_to_mask(segmentations, height, width):
382
- masks = []
383
- for polygons in segmentations:
384
- rles = coco_mask.frPyObjects(polygons, height, width)
385
- mask = coco_mask.decode(rles)
386
- if len(mask.shape) < 3:
387
- mask = mask[..., None]
388
- mask = torch.as_tensor(mask, dtype=torch.uint8)
389
- mask = mask.any(dim=2)
390
- masks.append(mask)
391
- if masks:
392
- masks = torch.stack(masks, dim=0)
393
- else:
394
- masks = torch.zeros((0, height, width), dtype=torch.uint8)
395
- return masks
396
-
397
-
398
- class ConvertCocoPolysToMask(object):
399
- def __init__(self, return_masks=False):
400
- self.return_masks = return_masks
401
-
402
- def __call__(self, image, target):
403
- w, h = image.size
404
-
405
- image_id = target["image_id"]
406
- image_id = torch.tensor([image_id])
407
-
408
- anno = target["annotations"]
409
-
410
- anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0]
411
-
412
- boxes = [obj["bbox"] for obj in anno]
413
- # guard against no boxes via resizing
414
- boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
415
- boxes[:, 2:] += boxes[:, :2]
416
- boxes[:, 0::2].clamp_(min=0, max=w)
417
- boxes[:, 1::2].clamp_(min=0, max=h)
418
-
419
- classes = [obj["category_id"] for obj in anno]
420
- classes = torch.tensor(classes, dtype=torch.int64)
421
-
422
- if self.return_masks:
423
- segmentations = [obj["segmentation"] for obj in anno]
424
- masks = convert_coco_poly_to_mask(segmentations, h, w)
425
-
426
- keypoints = None
427
- if anno and "keypoints" in anno[0]:
428
- keypoints = [obj["keypoints"] for obj in anno]
429
- keypoints = torch.as_tensor(keypoints, dtype=torch.float32)
430
- num_keypoints = keypoints.shape[0]
431
- if num_keypoints:
432
- keypoints = keypoints.view(num_keypoints, -1, 3)
433
-
434
- keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
435
- boxes = boxes[keep]
436
- classes = classes[keep]
437
- if self.return_masks:
438
- masks = masks[keep]
439
- if keypoints is not None:
440
- keypoints = keypoints[keep]
441
-
442
- target = {}
443
- target["boxes"] = boxes
444
- target["labels"] = classes
445
- if self.return_masks:
446
- target["masks"] = masks
447
- target["image_id"] = image_id
448
- if keypoints is not None:
449
- target["keypoints"] = keypoints
450
-
451
- # for conversion to coco api
452
- area = torch.tensor([obj["area"] for obj in anno])
453
- iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
454
- target["area"] = area[keep]
455
- target["iscrowd"] = iscrowd[keep]
456
-
457
- target["orig_size"] = torch.as_tensor([int(h), int(w)])
458
- target["size"] = torch.as_tensor([int(h), int(w)])
459
-
460
- return image, target
461
-
462
-
463
- def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
464
-
465
- normalize = T.Compose([
466
- T.ToTensor(),
467
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
468
- ])
469
-
470
- # config the params for data aug
471
- scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
472
- max_size = 1333
473
- scales2_resize = [400, 500, 600]
474
- scales2_crop = [384, 600]
475
-
476
- # update args from config files
477
- scales = getattr(args, 'data_aug_scales', scales)
478
- max_size = getattr(args, 'data_aug_max_size', max_size)
479
- scales2_resize = getattr(args, 'data_aug_scales2_resize', scales2_resize)
480
- scales2_crop = getattr(args, 'data_aug_scales2_crop', scales2_crop)
481
-
482
- # resize them
483
- data_aug_scale_overlap = getattr(args, 'data_aug_scale_overlap', None)
484
- if data_aug_scale_overlap is not None and data_aug_scale_overlap > 0:
485
- data_aug_scale_overlap = float(data_aug_scale_overlap)
486
- scales = [int(i*data_aug_scale_overlap) for i in scales]
487
- max_size = int(max_size*data_aug_scale_overlap)
488
- scales2_resize = [int(i*data_aug_scale_overlap) for i in scales2_resize]
489
- scales2_crop = [int(i*data_aug_scale_overlap) for i in scales2_crop]
490
-
491
- datadict_for_print = {
492
- 'scales': scales,
493
- 'max_size': max_size,
494
- 'scales2_resize': scales2_resize,
495
- 'scales2_crop': scales2_crop
496
- }
497
- # print("data_aug_params:", json.dumps(datadict_for_print, indent=2))
498
-
499
- if image_set == 'train':
500
- if fix_size:
501
- return T.Compose([
502
- T.RandomHorizontalFlip(),
503
- T.RandomResize([(max_size, max(scales))]),
504
- # T.RandomResize([(512, 512)]),
505
- normalize,
506
- ])
507
-
508
- if strong_aug:
509
- import datasets.sltransform as SLT
510
-
511
- return T.Compose([
512
- T.RandomHorizontalFlip(),
513
- T.RandomSelect(
514
- T.RandomResize(scales, max_size=max_size),
515
- T.Compose([
516
- T.RandomResize(scales2_resize),
517
- T.RandomSizeCrop(*scales2_crop),
518
- T.RandomResize(scales, max_size=max_size),
519
- ])
520
- ),
521
- SLT.RandomSelectMulti([
522
- SLT.RandomCrop(),
523
- SLT.LightingNoise(),
524
- SLT.AdjustBrightness(2),
525
- SLT.AdjustContrast(2),
526
- ]),
527
- normalize,
528
- ])
529
-
530
- return T.Compose([
531
- T.RandomHorizontalFlip(),
532
- T.RandomSelect(
533
- T.RandomResize(scales, max_size=max_size),
534
- T.Compose([
535
- T.RandomResize(scales2_resize),
536
- T.RandomSizeCrop(*scales2_crop),
537
- T.RandomResize(scales, max_size=max_size),
538
- ])
539
- ),
540
- normalize,
541
- ])
542
-
543
- if image_set in ['val', 'eval_debug', 'train_reg', 'test']:
544
-
545
- if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
546
- print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
547
- return T.Compose([
548
- T.ResizeDebug((1280, 800)),
549
- normalize,
550
- ])
551
-
552
- return T.Compose([
553
- T.RandomResize([max(scales)], max_size=max_size),
554
- normalize,
555
- ])
556
-
557
-
558
-
559
- raise ValueError(f'unknown {image_set}')
560
-
561
-
562
- def get_aux_target_hacks_list(image_set, args):
563
- if args.modelname in ['q2bs_mask', 'q2bs']:
564
- aux_target_hacks_list = [
565
- label2compat(),
566
- label_compat2onehot(),
567
- RandomSelectBoxes(num_class=args.num_classes)
568
- ]
569
- if args.masked_data and image_set == 'train':
570
- # aux_target_hacks_list.append()
571
- aux_target_hacks_list.append(MaskCrop())
572
- elif args.modelname in ['q2bm_v2', 'q2bs_ce', 'q2op', 'q2ofocal', 'q2opclip', 'q2ocqonly']:
573
- aux_target_hacks_list = [
574
- label2compat(),
575
- label_compat2onehot(),
576
- box_label_catter(),
577
- RandomSelectBoxlabels(num_classes=args.num_classes,
578
- prob_first_item=args.prob_first_item,
579
- prob_random_item=args.prob_random_item,
580
- prob_last_item=args.prob_last_item,
581
- prob_stop_sign=args.prob_stop_sign,
582
- ),
583
- BboxPertuber(max_ratio=0.02, generate_samples=1000),
584
- ]
585
- elif args.modelname in ['q2omask', 'q2osa']:
586
- if args.coco_aug:
587
- aux_target_hacks_list = [
588
- label2compat(),
589
- label_compat2onehot(),
590
- box_label_catter(),
591
- RandomSelectBoxlabels(num_classes=args.num_classes,
592
- prob_first_item=args.prob_first_item,
593
- prob_random_item=args.prob_random_item,
594
- prob_last_item=args.prob_last_item,
595
- prob_stop_sign=args.prob_stop_sign,
596
- ),
597
- RandomDrop(p=0.2),
598
- BboxPertuber(max_ratio=0.02, generate_samples=1000),
599
- RandomCutout(factor=0.5)
600
- ]
601
- else:
602
- aux_target_hacks_list = [
603
- label2compat(),
604
- label_compat2onehot(),
605
- box_label_catter(),
606
- RandomSelectBoxlabels(num_classes=args.num_classes,
607
- prob_first_item=args.prob_first_item,
608
- prob_random_item=args.prob_random_item,
609
- prob_last_item=args.prob_last_item,
610
- prob_stop_sign=args.prob_stop_sign,
611
- ),
612
- BboxPertuber(max_ratio=0.02, generate_samples=1000),
613
- ]
614
- else:
615
- aux_target_hacks_list = None
616
-
617
- return aux_target_hacks_list
618
-
619
-
620
- def build(image_set, args, datasetinfo):
621
- img_folder = datasetinfo["root"]
622
- ann_file = datasetinfo["anno"]
623
-
624
- # copy to local path
625
- if os.environ.get('DATA_COPY_SHILONG') == 'INFO':
626
- preparing_dataset(dict(img_folder=img_folder, ann_file=ann_file), image_set, args)
627
-
628
- try:
629
- strong_aug = args.strong_aug
630
- except:
631
- strong_aug = False
632
- print(img_folder, ann_file)
633
- dataset = CocoDetection(img_folder, ann_file,
634
- transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
635
- return_masks=args.masks,
636
- aux_target_hacks=None,
637
- )
638
- return dataset
639
-
640
-
641
- if __name__ == "__main__":
642
- # Objects365 Val example
643
- dataset_o365 = CocoDetection(
644
- '/path/Objects365/train/',
645
- "/path/Objects365/slannos/anno_preprocess_train_v2.json",
646
- transforms=None,
647
- return_masks=False,
648
- )
649
- print('len(dataset_o365):', len(dataset_o365))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/coco_eval.py DELETED
@@ -1,266 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- """
3
- COCO evaluator that works in distributed mode.
4
-
5
- Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
6
- The difference is that there is less copy-pasting from pycocotools
7
- in the end of the file, as python3 can suppress prints with contextlib
8
- """
9
- import os
10
- import contextlib
11
- import copy
12
- import numpy as np
13
- import torch
14
-
15
- from pycocotools.cocoeval import COCOeval
16
- from pycocotools.coco import COCO
17
- import pycocotools.mask as mask_util
18
-
19
- from util.misc import all_gather
20
-
21
-
22
- class CocoEvaluator(object):
23
- def __init__(self, coco_gt, iou_types, useCats=True):
24
- assert isinstance(iou_types, (list, tuple))
25
- coco_gt = copy.deepcopy(coco_gt)
26
- self.coco_gt = coco_gt
27
-
28
- self.iou_types = iou_types
29
- self.coco_eval = {}
30
- for iou_type in iou_types:
31
- self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
32
- self.coco_eval[iou_type].useCats = useCats
33
-
34
- self.img_ids = []
35
- self.eval_imgs = {k: [] for k in iou_types}
36
- self.useCats = useCats
37
-
38
- def update(self, predictions):
39
- img_ids = list(np.unique(list(predictions.keys())))
40
- self.img_ids.extend(img_ids)
41
-
42
- for iou_type in self.iou_types:
43
- results = self.prepare(predictions, iou_type)
44
-
45
- # suppress pycocotools prints
46
- with open(os.devnull, 'w') as devnull:
47
- with contextlib.redirect_stdout(devnull):
48
- coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
49
- coco_eval = self.coco_eval[iou_type]
50
-
51
- coco_eval.cocoDt = coco_dt
52
- coco_eval.params.imgIds = list(img_ids)
53
- coco_eval.params.useCats = self.useCats
54
- img_ids, eval_imgs = evaluate(coco_eval)
55
-
56
- self.eval_imgs[iou_type].append(eval_imgs)
57
-
58
- def synchronize_between_processes(self):
59
- for iou_type in self.iou_types:
60
- self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
61
- create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
62
-
63
- def accumulate(self):
64
- for coco_eval in self.coco_eval.values():
65
- coco_eval.accumulate()
66
-
67
- def summarize(self):
68
- for iou_type, coco_eval in self.coco_eval.items():
69
- print("IoU metric: {}".format(iou_type))
70
- coco_eval.summarize()
71
-
72
- def prepare(self, predictions, iou_type):
73
- if iou_type == "bbox":
74
- return self.prepare_for_coco_detection(predictions)
75
- elif iou_type == "segm":
76
- return self.prepare_for_coco_segmentation(predictions)
77
- elif iou_type == "keypoints":
78
- return self.prepare_for_coco_keypoint(predictions)
79
- else:
80
- raise ValueError("Unknown iou type {}".format(iou_type))
81
-
82
- def prepare_for_coco_detection(self, predictions):
83
- coco_results = []
84
- for original_id, prediction in predictions.items():
85
- if len(prediction) == 0:
86
- continue
87
-
88
- boxes = prediction["boxes"]
89
- boxes = convert_to_xywh(boxes).tolist()
90
- if not isinstance(prediction["scores"], list):
91
- scores = prediction["scores"].tolist()
92
- else:
93
- scores = prediction["scores"]
94
- if not isinstance(prediction["labels"], list):
95
- labels = prediction["labels"].tolist()
96
- else:
97
- labels = prediction["labels"]
98
-
99
-
100
- try:
101
- coco_results.extend(
102
- [
103
- {
104
- "image_id": original_id,
105
- "category_id": labels[k],
106
- "bbox": box,
107
- "score": scores[k],
108
- }
109
- for k, box in enumerate(boxes)
110
- ]
111
- )
112
- except:
113
- import ipdb; ipdb.set_trace()
114
- return coco_results
115
-
116
- def prepare_for_coco_segmentation(self, predictions):
117
- coco_results = []
118
- for original_id, prediction in predictions.items():
119
- if len(prediction) == 0:
120
- continue
121
-
122
- scores = prediction["scores"]
123
- labels = prediction["labels"]
124
- masks = prediction["masks"]
125
-
126
- masks = masks > 0.5
127
-
128
- scores = prediction["scores"].tolist()
129
- labels = prediction["labels"].tolist()
130
-
131
- rles = [
132
- mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
133
- for mask in masks
134
- ]
135
- for rle in rles:
136
- rle["counts"] = rle["counts"].decode("utf-8")
137
-
138
- coco_results.extend(
139
- [
140
- {
141
- "image_id": original_id,
142
- "category_id": labels[k],
143
- "segmentation": rle,
144
- "score": scores[k],
145
- }
146
- for k, rle in enumerate(rles)
147
- ]
148
- )
149
- return coco_results
150
-
151
- def prepare_for_coco_keypoint(self, predictions):
152
- coco_results = []
153
- for original_id, prediction in predictions.items():
154
- if len(prediction) == 0:
155
- continue
156
-
157
- boxes = prediction["boxes"]
158
- boxes = convert_to_xywh(boxes).tolist()
159
- scores = prediction["scores"].tolist()
160
- labels = prediction["labels"].tolist()
161
- keypoints = prediction["keypoints"]
162
- keypoints = keypoints.flatten(start_dim=1).tolist()
163
-
164
- coco_results.extend(
165
- [
166
- {
167
- "image_id": original_id,
168
- "category_id": labels[k],
169
- 'keypoints': keypoint,
170
- "score": scores[k],
171
- }
172
- for k, keypoint in enumerate(keypoints)
173
- ]
174
- )
175
- return coco_results
176
-
177
-
178
- def convert_to_xywh(boxes):
179
- xmin, ymin, xmax, ymax = boxes.unbind(1)
180
- return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
181
-
182
-
183
- def merge(img_ids, eval_imgs):
184
- all_img_ids = all_gather(img_ids)
185
- all_eval_imgs = all_gather(eval_imgs)
186
-
187
- merged_img_ids = []
188
- for p in all_img_ids:
189
- merged_img_ids.extend(p)
190
-
191
- merged_eval_imgs = []
192
- for p in all_eval_imgs:
193
- merged_eval_imgs.append(p)
194
-
195
- merged_img_ids = np.array(merged_img_ids)
196
- merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
197
-
198
- # keep only unique (and in sorted order) images
199
- merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
200
- merged_eval_imgs = merged_eval_imgs[..., idx]
201
-
202
- return merged_img_ids, merged_eval_imgs
203
-
204
-
205
- def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
206
- img_ids, eval_imgs = merge(img_ids, eval_imgs)
207
- img_ids = list(img_ids)
208
- eval_imgs = list(eval_imgs.flatten())
209
-
210
- coco_eval.evalImgs = eval_imgs
211
- coco_eval.params.imgIds = img_ids
212
- coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
213
-
214
-
215
- #################################################################
216
- # From pycocotools, just removed the prints and fixed
217
- # a Python3 bug about unicode not defined
218
- #################################################################
219
-
220
-
221
- def evaluate(self):
222
- '''
223
- Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
224
- :return: None
225
- '''
226
- p = self.params
227
- # add backward compatibility if useSegm is specified in params
228
- if p.useSegm is not None:
229
- p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
230
- print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
231
- p.imgIds = list(np.unique(p.imgIds))
232
- if p.useCats:
233
- p.catIds = list(np.unique(p.catIds))
234
- p.maxDets = sorted(p.maxDets)
235
- self.params = p
236
-
237
- self._prepare()
238
- # loop through images, area range, max detection number
239
- catIds = p.catIds if p.useCats else [-1]
240
-
241
- if p.iouType == 'segm' or p.iouType == 'bbox':
242
- computeIoU = self.computeIoU
243
- elif p.iouType == 'keypoints':
244
- computeIoU = self.computeOks
245
- self.ious = {
246
- (imgId, catId): computeIoU(imgId, catId)
247
- for imgId in p.imgIds
248
- for catId in catIds}
249
-
250
- evaluateImg = self.evaluateImg
251
- maxDet = p.maxDets[-1]
252
- evalImgs = [
253
- evaluateImg(imgId, catId, areaRng, maxDet)
254
- for catId in catIds
255
- for areaRng in p.areaRng
256
- for imgId in p.imgIds
257
- ]
258
- # this is NOT in the pycocotools code, but could be done outside
259
- evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
260
- self._paramsEval = copy.deepcopy(self.params)
261
-
262
- return p.imgIds, evalImgs
263
-
264
- #################################################################
265
- # end of straight copy from pycocotools, just removing the prints
266
- #################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/coco_panoptic.py DELETED
@@ -1,99 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- import json
3
- from pathlib import Path
4
-
5
- import numpy as np
6
- import torch
7
- from PIL import Image
8
-
9
- from panopticapi.utils import rgb2id
10
- from util.box_ops import masks_to_boxes
11
-
12
- from .coco import make_coco_transforms
13
-
14
-
15
- class CocoPanoptic:
16
- def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
17
- with open(ann_file, 'r') as f:
18
- self.coco = json.load(f)
19
-
20
- # sort 'images' field so that they are aligned with 'annotations'
21
- # i.e., in alphabetical order
22
- self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
23
- # sanity check
24
- if "annotations" in self.coco:
25
- for img, ann in zip(self.coco['images'], self.coco['annotations']):
26
- assert img['file_name'][:-4] == ann['file_name'][:-4]
27
-
28
- self.img_folder = img_folder
29
- self.ann_folder = ann_folder
30
- self.ann_file = ann_file
31
- self.transforms = transforms
32
- self.return_masks = return_masks
33
-
34
- def __getitem__(self, idx):
35
- ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
36
- img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
37
- ann_path = Path(self.ann_folder) / ann_info['file_name']
38
-
39
- img = Image.open(img_path).convert('RGB')
40
- w, h = img.size
41
- if "segments_info" in ann_info:
42
- masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
43
- masks = rgb2id(masks)
44
-
45
- ids = np.array([ann['id'] for ann in ann_info['segments_info']])
46
- masks = masks == ids[:, None, None]
47
-
48
- masks = torch.as_tensor(masks, dtype=torch.uint8)
49
- labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64)
50
-
51
- target = {}
52
- target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]])
53
- if self.return_masks:
54
- target['masks'] = masks
55
- target['labels'] = labels
56
-
57
- target["boxes"] = masks_to_boxes(masks)
58
-
59
- target['size'] = torch.as_tensor([int(h), int(w)])
60
- target['orig_size'] = torch.as_tensor([int(h), int(w)])
61
- if "segments_info" in ann_info:
62
- for name in ['iscrowd', 'area']:
63
- target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']])
64
-
65
- if self.transforms is not None:
66
- img, target = self.transforms(img, target)
67
-
68
- return img, target
69
-
70
- def __len__(self):
71
- return len(self.coco['images'])
72
-
73
- def get_height_and_width(self, idx):
74
- img_info = self.coco['images'][idx]
75
- height = img_info['height']
76
- width = img_info['width']
77
- return height, width
78
-
79
-
80
- def build(image_set, args):
81
- img_folder_root = Path(args.coco_path)
82
- ann_folder_root = Path(args.coco_panoptic_path)
83
- assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
84
- assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist'
85
- mode = 'panoptic'
86
- PATHS = {
87
- "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
88
- "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
89
- }
90
-
91
- img_folder, ann_file = PATHS[image_set]
92
- img_folder_path = img_folder_root / img_folder
93
- ann_folder = ann_folder_root / f'{mode}_{img_folder}'
94
- ann_file = ann_folder_root / ann_file
95
-
96
- dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file,
97
- transforms=make_coco_transforms(image_set), return_masks=args.masks)
98
-
99
- return dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/cocogrounding_eval.py DELETED
@@ -1,271 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO. Modified by Shilong Liu.
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
8
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9
- """
10
- COCO evaluator that works in distributed mode.
11
-
12
- Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
13
- The difference is that there is less copy-pasting from pycocotools
14
- in the end of the file, as python3 can suppress prints with contextlib
15
- """
16
- import contextlib
17
- import copy
18
- import os
19
-
20
- import numpy as np
21
- import pycocotools.mask as mask_util
22
- import torch
23
- from pycocotools.coco import COCO
24
- from pycocotools.cocoeval import COCOeval
25
-
26
- from groundingdino.util.misc import all_gather
27
-
28
-
29
class CocoGroundingEvaluator(object):
    """COCO evaluator that accumulates per-image results for several IoU types.

    One ``COCOeval`` object is kept per requested IoU type. Predictions are fed
    in batches through :meth:`update`; :meth:`synchronize_between_processes`
    merges the per-batch results before :meth:`accumulate` / :meth:`summarize`.
    """

    def __init__(self, coco_gt, iou_types, useCats=True):
        """
        Args:
            coco_gt: ground-truth COCO object; a deep copy is stored.
            iou_types: list or tuple of IoU type names (``"bbox"``, ``"segm"``,
                ``"keypoints"``).
            useCats: value written to ``useCats`` on every ``COCOeval`` and on
                its params at update time.
        """
        assert isinstance(iou_types, (list, tuple))
        coco_gt = copy.deepcopy(coco_gt)
        self.coco_gt = coco_gt

        self.iou_types = iou_types
        self.coco_eval = {}
        for iou_type in iou_types:
            self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
            self.coco_eval[iou_type].useCats = useCats

        self.img_ids = []
        self.eval_imgs = {k: [] for k in iou_types}
        self.useCats = useCats

    def update(self, predictions):
        """Evaluate one batch of predictions (``{image_id: prediction}``)."""
        img_ids = list(np.unique(list(predictions.keys())))
        self.img_ids.extend(img_ids)
        for iou_type in self.iou_types:
            results = self.prepare(predictions, iou_type)

            # suppress pycocotools prints
            with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
                coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()

            coco_eval = self.coco_eval[iou_type]
            coco_eval.cocoDt = coco_dt
            coco_eval.params.imgIds = list(img_ids)
            coco_eval.params.useCats = self.useCats
            # evaluate() also returns the (already unique) image ids; only the
            # per-image results are needed here.
            _, eval_imgs = evaluate(coco_eval)

            self.eval_imgs[iou_type].append(eval_imgs)

    def synchronize_between_processes(self):
        """Concatenate the per-batch results along the image axis and merge them."""
        for iou_type in self.iou_types:
            self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
            create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])

    def accumulate(self):
        """Call ``accumulate()`` on every underlying ``COCOeval``."""
        for coco_eval in self.coco_eval.values():
            coco_eval.accumulate()

    def summarize(self):
        """Print the summary of every IoU type."""
        for iou_type, coco_eval in self.coco_eval.items():
            print("IoU metric: {}".format(iou_type))
            coco_eval.summarize()

    def prepare(self, predictions, iou_type):
        """Convert ``predictions`` to COCO result dicts for ``iou_type``.

        Raises:
            ValueError: if ``iou_type`` is not ``bbox``, ``segm`` or ``keypoints``.
        """
        if iou_type == "bbox":
            return self.prepare_for_coco_detection(predictions)
        elif iou_type == "segm":
            return self.prepare_for_coco_segmentation(predictions)
        elif iou_type == "keypoints":
            return self.prepare_for_coco_keypoint(predictions)
        else:
            raise ValueError("Unknown iou type {}".format(iou_type))

    def prepare_for_coco_detection(self, predictions):
        """Return one ``bbox`` result dict per predicted box (xywh format)."""
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            boxes = convert_to_xywh(prediction["boxes"]).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()

            coco_results.extend(
                {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "bbox": box,
                    "score": scores[k],
                }
                for k, box in enumerate(boxes)
            )
        return coco_results

    def prepare_for_coco_segmentation(self, predictions):
        """Return one RLE ``segmentation`` result dict per predicted mask."""
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            # binarise the mask scores at 0.5
            masks = prediction["masks"] > 0.5

            # NOTE(review): indexing mask[0] assumes masks is (N, 1, H, W) - confirm.
            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                # counts come back as bytes; decode them to str for the result dicts
                rle["counts"] = rle["counts"].decode("utf-8")

            coco_results.extend(
                {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "segmentation": rle,
                    "score": scores[k],
                }
                for k, rle in enumerate(rles)
            )
        return coco_results

    def prepare_for_coco_keypoint(self, predictions):
        """Return one ``keypoints`` result dict per predicted instance."""
        coco_results = []
        for original_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue

            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            # flatten everything after the instance axis into one list per instance
            keypoints = prediction["keypoints"].flatten(start_dim=1).tolist()

            coco_results.extend(
                {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "keypoints": keypoint,
                    "score": scores[k],
                }
                for k, keypoint in enumerate(keypoints)
            )
        return coco_results
174
-
175
-
176
def convert_to_xywh(boxes):
    """Convert ``(x_min, y_min, x_max, y_max)`` rows to ``(x_min, y_min, width, height)``."""
    left, top, right, bottom = boxes.unbind(1)
    width = right - left
    height = bottom - top
    return torch.stack((left, top, width, height), dim=1)
179
-
180
-
181
def merge(img_ids, eval_imgs):
    """Gather image ids and per-image results from all processes and de-duplicate.

    Returns:
        ``(unique_ids, results)`` where ``unique_ids`` is the sorted array of
        distinct image ids and ``results`` keeps, along its last axis, the
        first gathered entry for each of those ids.
    """
    # Keep the two collective calls in this order on every process.
    gathered_ids = all_gather(img_ids)
    gathered_evals = all_gather(eval_imgs)

    flat_ids = np.array([img_id for chunk in gathered_ids for img_id in chunk])
    stacked = np.concatenate(list(gathered_evals), 2)

    # keep only unique (and in sorted order) images
    unique_ids, first_idx = np.unique(flat_ids, return_index=True)
    return unique_ids, stacked[..., first_idx]
201
-
202
-
203
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
    """Store the merged image ids and flattened per-image results on ``coco_eval``."""
    merged_ids, merged_evals = merge(img_ids, eval_imgs)

    coco_eval.evalImgs = list(merged_evals.flatten())
    coco_eval.params.imgIds = list(merged_ids)
    # snapshot of the params that produced evalImgs
    coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
211
-
212
-
213
- #################################################################
214
- # From pycocotools, just removed the prints and fixed
215
- # a Python3 bug about unicode not defined
216
- #################################################################
217
-
218
-
219
def evaluate(self):
    """Run per-image evaluation on the images listed in ``self.params``.

    ``self`` is the evaluator object being driven (this is a free function, not
    a method). Its params are normalised in place (unique image/category ids,
    sorted ``maxDets``), ``self.ious`` is filled, and a deep copy of the params
    is stored in ``self._paramsEval``.

    Returns:
        ``(img_ids, eval_imgs)``: the unique image ids and an array of
        per-image results shaped ``(len(cat_ids), len(areaRng), len(img_ids))``.
    """
    params = self.params
    # add backward compatibility if useSegm is specified in params
    if params.useSegm is not None:
        params.iouType = "segm" if params.useSegm == 1 else "bbox"
        print("useSegm (deprecated) is not None. Running {} evaluation".format(params.iouType))
    params.imgIds = list(np.unique(params.imgIds))
    if params.useCats:
        params.catIds = list(np.unique(params.catIds))
    params.maxDets = sorted(params.maxDets)
    self.params = params

    self._prepare()
    # a single pseudo-category (-1) is used when categories are ignored
    cat_ids = params.catIds if params.useCats else [-1]

    if params.iouType in ("segm", "bbox"):
        iou_fn = self.computeIoU
    elif params.iouType == "keypoints":
        iou_fn = self.computeOks
    self.ious = {}
    for img_id in params.imgIds:
        for cat_id in cat_ids:
            self.ious[(img_id, cat_id)] = iou_fn(img_id, cat_id)

    evaluate_img = self.evaluateImg
    max_det = params.maxDets[-1]
    # loop through category, area range, image (in that nesting order)
    per_image = []
    for cat_id in cat_ids:
        for area_rng in params.areaRng:
            for img_id in params.imgIds:
                per_image.append(evaluate_img(img_id, cat_id, area_rng, max_det))

    # this is NOT in the pycocotools code, but could be done outside
    per_image = np.asarray(per_image).reshape(len(cat_ids), len(params.areaRng), len(params.imgIds))
    self._paramsEval = copy.deepcopy(self.params)
    return params.imgIds, per_image
267
-
268
-
269
- #################################################################
270
- # end of straight copy from pycocotools, just removing the prints
271
- #################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/data_util.py DELETED
@@ -1,170 +0,0 @@
1
- import os
2
- import os.path as osp
3
- import shutil
4
- import time
5
- import datetime
6
-
7
- import torch
8
-
9
- from util.slconfig import SLConfig
10
-
11
class Error(OSError):
    """Error raised by ``slcopytree``; carries the list of collected copy failures."""
13
-
14
def slcopytree(src, dst, symlinks=False, ignore=None, copy_function=shutil.copyfile,
               ignore_dangling_symlinks=False):
    """
    modified from shutil.copytree without copystat.

    Recursively copy a directory tree. If `src` is not a directory it is
    copied to `dst` with `copy_function` directly.

    The destination directory must not already exist.
    If exception(s) occur, an Error is raised with a list of reasons.

    If the optional symlinks flag is true, symbolic links in the
    source tree result in symbolic links in the destination tree; if
    it is false, the contents of the files pointed to by symbolic
    links are copied. If the file pointed by the symlink doesn't
    exist, an exception will be added in the list of errors raised in
    an Error exception at the end of the copy process.

    You can set the optional ignore_dangling_symlinks flag to true if you
    want to silence this exception. The flag applies to the whole tree,
    including nested directories.

    The optional ignore argument is a callable. If given, it
    is called with the `src` parameter, which is the directory
    being visited by slcopytree(), and `names` which is the list of
    `src` contents, as returned by os.listdir():

        callable(src, names) -> ignored_names

    Since slcopytree() is called recursively, the callable will be
    called once for each directory that is copied. It returns a
    list of names relative to the `src` directory that should
    not be copied.

    The optional copy_function argument is a callable that will be used
    to copy each file. It will be called with the source path and the
    destination path as arguments. By default, shutil.copyfile() is used
    (contents only, no metadata), but any function that supports the same
    signature (like shutil.copy()) can be used.

    Returns `dst`.
    """
    if not os.path.isdir(src):
        # Single file (or other non-directory): errors propagate unwrapped.
        copy_function(src, dst)
        return dst

    names = os.listdir(src)
    ignored_names = ignore(src, names) if ignore is not None else set()

    os.makedirs(dst)
    errors = []
    for name in names:
        if name in ignored_names:
            continue
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if os.path.islink(srcname):
                if symlinks:
                    # We can't just leave it to `copy_function` because legacy
                    # code with a custom `copy_function` may rely on copytree
                    # doing the right thing.
                    os.symlink(os.readlink(srcname), dstname)
                else:
                    # os.path.exists() follows the link, so it is False exactly
                    # when the link dangles; testing the raw link target would
                    # resolve relative targets against the cwd instead of the
                    # link's own directory.
                    if not os.path.exists(srcname) and ignore_dangling_symlinks:
                        continue
                    # otherwise let the copy occur; copy_function will raise
                    if os.path.isdir(srcname):
                        slcopytree(srcname, dstname, symlinks, ignore,
                                   copy_function, ignore_dangling_symlinks)
                    else:
                        copy_function(srcname, dstname)
            elif os.path.isdir(srcname):
                # propagate the dangling-symlink flag to nested directories
                slcopytree(srcname, dstname, symlinks, ignore,
                           copy_function, ignore_dangling_symlinks)
            else:
                # Will raise a SpecialFileError for unsupported file types
                copy_function(srcname, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except Error as err:
            errors.extend(err.args[0])
        except OSError as why:
            errors.append((srcname, dstname, str(why)))

    if errors:
        raise Error(errors)
    return dst
102
-
103
def check_and_copy(src_path, tgt_path):
    """Copy ``src_path`` to ``tgt_path`` unless the target already exists.

    Returns ``None`` when the copy is skipped, otherwise whatever
    ``slcopytree`` returns.
    """
    already_there = os.path.exists(tgt_path)
    return None if already_there else slcopytree(src_path, tgt_path)
108
-
109
-
110
def remove(srcpath):
    """Delete ``srcpath``: recursively if it is a directory, as a single file otherwise."""
    deleter = shutil.rmtree if os.path.isdir(srcpath) else os.remove
    return deleter(srcpath)
115
-
116
-
117
def preparing_dataset(pathdict, image_set, args):
    """Copy (and unzip) the static dataset files to the paths in ``pathdict``.

    Source locations are read from ``util/static_data_path.py`` under
    ``[args.dataset_file][image_set]``. Only the process with
    ``args.local_rank == 0`` removes stale targets and copies; when
    ``args.distributed`` is set every process then waits on a barrier.

    Returns:
        The list of copied paths (also stored on ``args.copyfilelist``), or
        ``None`` when nothing was copied by this process.
    """
    started = time.time()
    static_info = SLConfig.fromfile('util/static_data_path.py')
    static_dict = static_info[args.dataset_file][image_set]

    copied = []
    for key, tgt_v in pathdict.items():
        # NOTE(review): indentation was lost in this view; the removal is
        # assumed to happen on rank 0 only - confirm against the original file.
        if os.path.exists(tgt_v) and args.local_rank == 0:
            print("path <{}> exist. remove it!".format(tgt_v))
            remove(tgt_v)

        if args.local_rank != 0:
            continue

        src_v = static_dict[key]
        assert isinstance(src_v, str)
        if not src_v.endswith('.zip'):
            print('Copy from <{}> to <{}>.'.format(src_v, tgt_v))
            os.makedirs(os.path.dirname(tgt_v), exist_ok=True)
            check_and_copy(src_v, tgt_v)
            copied.append(tgt_v)
            continue

        # copy the archive next to the target path
        cp_tgt_dir = os.path.dirname(tgt_v)
        cp_tgt_path = os.path.join(cp_tgt_dir, os.path.basename(src_v))
        print('Copy from <{}> to <{}>.'.format(src_v, cp_tgt_path))
        os.makedirs(cp_tgt_dir, exist_ok=True)
        check_and_copy(src_v, cp_tgt_path)

        # unzip it in place
        import zipfile
        print("Starting unzip <{}>".format(cp_tgt_path))
        with zipfile.ZipFile(cp_tgt_path, 'r') as archive:
            archive.extractall(os.path.dirname(cp_tgt_path))

        copied.extend([cp_tgt_path, tgt_v])

    copyfilelist = copied if copied else None
    args.copyfilelist = copyfilelist

    if args.distributed:
        torch.distributed.barrier()
    elapsed = time.time() - started
    if copyfilelist:
        elapsed_str = str(datetime.timedelta(seconds=int(elapsed)))
        print('Data copy time {}'.format(elapsed_str))
    return copyfilelist
168
-
169
-
170
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/dataset.py DELETED
@@ -1,44 +0,0 @@
1
- from __future__ import print_function
2
-
3
- import torch
4
- import torchvision.datasets as datasets
5
- from torch.utils.data import Dataset
6
- from PIL import Image
7
- from .tsv_io import TSVFile
8
- import numpy as np
9
- import base64
10
- import io
11
-
12
-
13
class TSVDataset(Dataset):
    """TSV dataset for ImageNet 1K training.

    Each row is read through ``TSVFile``; the last column holds the
    base64-encoded image and column 1 the integer class index.
    """

    def __init__(self, tsv_file, transform=None, target_transform=None):
        self.tsv = TSVFile(tsv_file)
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        row = self.tsv.seek(index)
        raw_bytes = base64.b64decode(row[-1])
        image = Image.open(io.BytesIO(raw_bytes)).convert('RGB')
        target = int(row[1])

        img = image if self.transform is None else self.transform(image)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, target

    def __len__(self):
        """Return the number of rows in the TSV file."""
        return self.tsv.num_rows()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/odvg.py DELETED
@@ -1,258 +0,0 @@
1
- from torchvision.datasets.vision import VisionDataset
2
- import os.path
3
- from typing import Callable, Optional
4
- import json
5
- from PIL import Image
6
- import torch
7
- import random
8
- import os, sys
9
- sys.path.append(os.path.dirname(sys.path[0]))
10
-
11
- import datasets.transforms as T
12
-
13
class ODVGDataset(VisionDataset):
    """
    Dataset that serves either object-detection ("OD") or visual-grounding
    ("VG") samples; OD mode is selected when a label map is given.

    Args:
        root (string): Root directory where images are downloaded to.
        anno (string): Path to json annotation file.
        label_map_anno (string): Path to json label mapping file. Only for Object Detection
        max_labels (int): In OD mode, upper bound on the number of category
            names put into the caption (positives plus sampled negatives).
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.PILToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(
        self,
        root: str,
        anno: str,
        label_map_anno: Optional[str] = None,
        max_labels: int = 80,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
    ) -> None:
        super().__init__(root, transforms, transform, target_transform)
        self.root = root
        self.dataset_mode = "OD" if label_map_anno else "VG"
        self.max_labels = max_labels
        if self.dataset_mode == "OD":
            self.load_label_map(label_map_anno)
        self._load_metas(anno)
        self.get_dataset_info()

    def load_label_map(self, label_map_anno):
        """Load the label-id -> name mapping and cache the set of label ids."""
        with open(label_map_anno, 'r') as file:
            self.label_map = json.load(file)
        self.label_index = set(self.label_map.keys())

    def _load_metas(self, anno):
        """Load the per-image annotation records from ``anno``."""
        with open(anno, 'r') as f:
            self.metas = json.load(f)

    def get_dataset_info(self):
        """Print the image count and, in OD mode, the label count."""
        print(f"  == total images: {len(self)}")
        if self.dataset_mode == "OD":
            print(f"  == total labels: {len(self.label_map)}")

    def __getitem__(self, index: int):
        """Return ``(image, target)`` for sample ``index``.

        ``target`` holds ``image_id``, ``size`` (h, w), ``cap_list``,
        ``caption``, ``boxes`` (N x 4 float tensor) and ``labels`` (indices
        into ``cap_list``).

        Raises:
            FileNotFoundError: if the image file does not exist.
        """
        meta = self.metas[index]
        rel_path = meta["filename"]
        abs_path = os.path.join(self.root, rel_path)
        if not os.path.exists(abs_path):
            raise FileNotFoundError(f"{abs_path} not found.")
        image = Image.open(abs_path).convert('RGB')
        w, h = image.size

        if self.dataset_mode == "OD":
            instances = list(meta["detection"]["instances"])
            boxes = [obj["bbox"] for obj in instances]
            # positive labels: categories present in this image
            ori_classes = [str(obj["label"]) for obj in instances]
            pos_labels = set(ori_classes)
            # negative labels: every other known category
            neg_labels = self.label_index.difference(pos_labels)

            # Sorted so the result depends only on the RNG state, not on set
            # iteration order.
            vg_labels = sorted(pos_labels)
            num_to_add = min(len(neg_labels), self.max_labels - len(pos_labels))
            if num_to_add > 0:
                # random.sample() needs a sequence; passing a set raises
                # TypeError on Python 3.11+.
                vg_labels.extend(random.sample(sorted(neg_labels), num_to_add))
            random.shuffle(vg_labels)

            caption_list = [self.label_map[lb] for lb in vg_labels]
            caption_dict = {name: pos for pos, name in enumerate(caption_list)}
            classes = [caption_dict[self.label_map[lb]] for lb in ori_classes]
        elif self.dataset_mode == "VG":
            regions = meta["Grounding"]["regions"]
            # Shuffle boxes and phrases together; building the pairs first also
            # copes with an image that has no regions.
            pairs = [(obj["bbox"], obj["phrase"]) for obj in regions]
            random.shuffle(pairs)
            boxes = [bbox for bbox, _ in pairs]
            phrases = [phrase for _, phrase in pairs]
            # unique phrases in first-seen (shuffled) order
            caption_list = list(dict.fromkeys(phrases))
            phrase_to_idx = {cap: pos for pos, cap in enumerate(caption_list)}
            classes = [phrase_to_idx[phrase] for phrase in phrases]
        else:
            raise ValueError(f"unknown dataset_mode {self.dataset_mode!r}")

        caption = ' . '.join(caption_list) + ' .'
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        classes = torch.tensor(classes, dtype=torch.int64)

        # Drop only a trailing ".jpg"; str.strip(".jpg") would also eat leading
        # or trailing '.', 'j', 'p', 'g' characters of the name itself.
        suffix = ".jpg"
        image_id = rel_path[:-len(suffix)] if rel_path.endswith(suffix) else rel_path

        target = {
            "image_id": image_id,
            "size": torch.as_tensor([int(h), int(w)]),
            "cap_list": caption_list,
            "caption": caption,
            "boxes": boxes,
            "labels": classes,
        }

        if self.transforms is not None:
            image, target = self.transforms(image, target)

        return image, target

    def __len__(self) -> int:
        return len(self.metas)
135
-
136
-
137
def make_coco_transforms(image_set, fix_size=False, strong_aug=False, args=None):
    """Build the image/target transform pipeline for ``image_set``.

    Args:
        image_set: ``'train'`` or one of ``'val'``, ``'eval_debug'``,
            ``'train_reg'``, ``'test'``.
        fix_size: for training, resize to one fixed size instead of
            multi-scale augmentation.
        strong_aug: for training, add the ``datasets.sltransform`` augmentations.
        args: optional config; ``data_aug_scales``, ``data_aug_max_size``,
            ``data_aug_scales2_resize``, ``data_aug_scales2_crop`` and
            ``data_aug_scale_overlap`` override the defaults when present.

    Raises:
        ValueError: for an unknown ``image_set``.
    """
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # augmentation parameters: built-in defaults, overridable from args
    scales = getattr(args, 'data_aug_scales',
                     [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800])
    max_size = getattr(args, 'data_aug_max_size', 1333)
    scales2_resize = getattr(args, 'data_aug_scales2_resize', [400, 500, 600])
    scales2_crop = getattr(args, 'data_aug_scales2_crop', [384, 600])

    # optional global rescaling of all the sizes above
    overlap = getattr(args, 'data_aug_scale_overlap', None)
    if overlap is not None and overlap > 0:
        overlap = float(overlap)
        scales = [int(s * overlap) for s in scales]
        max_size = int(max_size * overlap)
        scales2_resize = [int(s * overlap) for s in scales2_resize]
        scales2_crop = [int(s * overlap) for s in scales2_crop]

    if image_set == 'train':
        if fix_size:
            return T.Compose([
                T.RandomHorizontalFlip(),
                T.RandomResize([(max_size, max(scales))]),
                normalize,
            ])

        if strong_aug:
            import datasets.sltransform as SLT

        # flip + multi-scale resize (optionally via resize/crop/resize)
        steps = [
            T.RandomHorizontalFlip(),
            T.RandomSelect(
                T.RandomResize(scales, max_size=max_size),
                T.Compose([
                    T.RandomResize(scales2_resize),
                    T.RandomSizeCrop(*scales2_crop),
                    T.RandomResize(scales, max_size=max_size),
                ])
            ),
        ]
        if strong_aug:
            steps.append(SLT.RandomSelectMulti([
                SLT.RandomCrop(),
                SLT.LightingNoise(),
                SLT.AdjustBrightness(2),
                SLT.AdjustContrast(2),
            ]))
        steps.append(normalize)
        return T.Compose(steps)

    if image_set not in ['val', 'eval_debug', 'train_reg', 'test']:
        raise ValueError(f'unknown {image_set}')

    if os.environ.get("GFLOPS_DEBUG_SHILONG", False) == 'INFO':
        print("Under debug mode for flops calculation only!!!!!!!!!!!!!!!!")
        return T.Compose([
            T.ResizeDebug((1280, 800)),
            normalize,
        ])

    return T.Compose([
        T.RandomResize([max(scales)], max_size=max_size),
        normalize,
    ])
231
-
232
def build_odvg(image_set, args, datasetinfo):
    """Build an ``ODVGDataset`` from one dataset description.

    Args:
        image_set: split name forwarded to ``make_coco_transforms``.
        args: config providing ``max_labels`` and ``fix_size``; ``strong_aug``
            is optional and defaults to ``False``.
        datasetinfo: mapping with ``"root"`` and ``"anno"`` and, for
            object-detection data, ``"label_map"``.
    """
    img_folder = datasetinfo["root"]
    ann_file = datasetinfo["anno"]
    label_map = datasetinfo["label_map"] if "label_map" in datasetinfo else None
    # Default a missing attribute without a bare ``except:``, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    strong_aug = getattr(args, "strong_aug", False)
    print(img_folder, ann_file, label_map)
    dataset = ODVGDataset(
        img_folder, ann_file, label_map, max_labels=args.max_labels,
        transforms=make_coco_transforms(image_set, fix_size=args.fix_size, strong_aug=strong_aug, args=args),
    )
    return dataset
245
-
246
-
247
if __name__=="__main__":
    # Smoke test: build one VG-style and one OD-style dataset from placeholder
    # paths, then print each dataset's length and one random sample.
    demo_configs = [
        ("path/GRIT-20M/data/", "path/GRIT-20M/anno/grit_odvg_10k.jsonl"),
        ("pathl/V3Det/",
         "path/V3Det/annotations/v3det_2023_v1_all_odvg.jsonl",
         "path/V3Det/annotations/v3det_label_map.json"),
    ]
    for demo_args in demo_configs:
        demo_dataset = ODVGDataset(*demo_args)
        print(len(demo_dataset))
        data = demo_dataset[random.randint(0, 100)]
        print(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/panoptic_eval.py DELETED
@@ -1,44 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- import json
3
- import os
4
-
5
- import util.misc as utils
6
-
7
- try:
8
- from panopticapi.evaluation import pq_compute
9
- except ImportError:
10
- pass
11
-
12
-
13
class PanopticEvaluator(object):
    """Collect panoptic predictions, write their PNGs to disk and compute PQ."""

    def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"):
        """
        Args:
            ann_file: ground-truth annotation JSON passed to ``pq_compute``.
            ann_folder: ground-truth folder passed to ``pq_compute``.
            output_dir: directory receiving the predicted PNGs and
                ``predictions.json``; created by the main process if missing.
        """
        self.gt_json = ann_file
        self.gt_folder = ann_folder
        if utils.is_main_process():
            # exist_ok avoids the check-then-create race and also supports
            # nested output paths.
            os.makedirs(output_dir, exist_ok=True)
        self.output_dir = output_dir
        self.predictions = []

    def update(self, predictions):
        """Write each prediction's ``png_string`` to disk and record the rest.

        The ``png_string`` entry is removed from every prediction dict.
        """
        for p in predictions:
            with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        self.predictions += predictions

    def synchronize_between_processes(self):
        """Replace the local predictions with those gathered from all processes."""
        all_predictions = utils.all_gather(self.predictions)
        merged_predictions = []
        for p in all_predictions:
            merged_predictions += p
        self.predictions = merged_predictions

    def summarize(self):
        """On the main process, dump ``predictions.json`` and return ``pq_compute``'s result.

        Returns ``None`` on every other process.
        """
        if utils.is_main_process():
            json_data = {"annotations": self.predictions}
            predictions_json = os.path.join(self.output_dir, "predictions.json")
            with open(predictions_json, "w") as f:
                f.write(json.dumps(json_data))
            return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/random_crop.py DELETED
@@ -1,135 +0,0 @@
1
- import PIL #version 1.2.0
2
- import torch
3
- import os
4
- import torchvision.transforms.functional as F
5
- import numpy as np
6
- import random
7
-
8
-
9
def intersect(boxes1, boxes2):
    '''
    Pairwise intersection area between two sets of boxes.
    boxes1: a tensor of dimensions (n1, 4); columns 0:2 are used as the lower
            corner and columns 2:4 as the upper corner
    boxes2: a tensor of dimensions (n2, 4), same layout

    Out: a tensor of dimensions (n1, n2) holding the overlap area of every
         (boxes1[i], boxes2[j]) pair; pairs that do not overlap give 0
    '''
    count1 = boxes1.size(0)
    count2 = boxes2.size(0)
    pair_shape = (count1, count2, 2)

    # Broadcast both sets to one (n1, n2, 2) grid of corner pairs.
    upper1 = boxes1[:, 2:].unsqueeze(1).expand(*pair_shape)
    upper2 = boxes2[:, 2:].unsqueeze(0).expand(*pair_shape)
    lower1 = boxes1[:, :2].unsqueeze(1).expand(*pair_shape)
    lower2 = boxes2[:, :2].unsqueeze(0).expand(*pair_shape)

    upper_corner = torch.min(upper1, upper2)
    lower_corner = torch.max(lower1, lower2)

    # Negative extents mean "no overlap" and are clamped to zero.
    extent = torch.clamp(upper_corner - lower_corner, min=0)  # (n1, n2, 2)
    return extent[:, :, 0] * extent[:, :, 1]  # (n1, n2)
27
def find_IoU(boxes1, boxes2):
    '''
    Pairwise IoU between two sets of boxes.
    boxes1: a tensor of dimensions (n1, 4) (left, top, right, bottom)
    boxes2: a tensor of dimensions (n2, 4)

    Out: a tensor of dimensions (n1, n2) with the IoU of each boxes1 entry
         against each boxes2 entry

    Formula:
    (box1 ∩ box2) / (box1 u box2) = (box1 ∩ box2) / (area(box1) + area(box2) - (box1 ∩ box2))
    '''
    overlap = intersect(boxes1, boxes2)

    widths1 = boxes1[:, 2] - boxes1[:, 0]
    heights1 = boxes1[:, 3] - boxes1[:, 1]
    widths2 = boxes2[:, 2] - boxes2[:, 0]
    heights2 = boxes2[:, 3] - boxes2[:, 1]

    # Spread the per-box areas over the (n1, n2) pair grid.
    area1 = (widths1 * heights1).unsqueeze(1).expand_as(overlap)
    area2 = (widths2 * heights2).unsqueeze(0).expand_as(overlap)

    return overlap / (area1 + area2 - overlap)
47
-
48
-
49
def random_crop(image, boxes, labels, difficulties=None):
    '''
    Randomly crop an image so that at least one box overlaps the crop enough.

    image: A PIL image, or a tensor indexed as (channels, height, width)
    boxes: Bounding boxes, a tensor of dimensions (#objects, 4)
           (presumably absolute (left, top, right, bottom) pixels, since they
           are compared directly with the crop rectangle - confirm with callers)
    labels: labels of object, a tensor of dimensions (#objects)
    difficulties: difficulties of detect object, a tensor of dimensions (#objects)

    Out: cropped image (PIL), new boxes, new labels, new difficulties
    '''
    # isinstance also accepts PIL.Image.Image subclasses; the exact type
    # comparison skipped them, and the tensor-style size() calls then failed.
    if isinstance(image, PIL.Image.Image):
        image = F.to_tensor(image)
    original_h = image.size(1)
    original_w = image.size(2)

    while True:
        # Minimum IoU required between the crop and some box; None = no crop.
        mode = random.choice([0.1, 0.3, 0.5, 0.9, None])

        if mode is None:
            return F.to_pil_image(image), boxes, labels, difficulties

        # Up to 50 attempts per threshold before drawing a new threshold.
        for _ in range(50):
            # Crop dimensions: [0.3, 1] of original dimensions
            new_h = random.uniform(0.3 * original_h, original_h)
            new_w = random.uniform(0.3 * original_w, original_w)

            # Aspect ratio constraint b/t .5 & 2
            if new_h / new_w < 0.5 or new_h / new_w > 2:
                continue

            # Crop coordinates
            left = random.uniform(0, original_w - new_w)
            right = left + new_w
            top = random.uniform(0, original_h - new_h)
            bottom = top + new_h
            crop = torch.FloatTensor([int(left), int(top), int(right), int(bottom)])

            # Calculate IoU between the crop and the bounding boxes
            overlap = find_IoU(crop.unsqueeze(0), boxes)  # (1, #objects)
            overlap = overlap.squeeze(0)

            # If not a single bounding box has an IoU greater than the minimum, try again
            if overlap.shape[0] == 0:
                continue
            if overlap.max().item() < mode:
                continue

            # Crop
            new_image = image[:, int(top):int(bottom), int(left):int(right)]  # (3, new_h, new_w)

            # Centers of the bounding boxes
            center_bb = (boxes[:, :2] + boxes[:, 2:]) / 2.0

            # Keep only the boxes whose center lies inside the crop
            center_in_crop = (center_bb[:, 0] > left) * (center_bb[:, 0] < right
                ) * (center_bb[:, 1] > top) * (center_bb[:, 1] < bottom)  # (#objects)

            if not center_in_crop.any():
                continue

            # Take the matching bounding boxes
            new_boxes = boxes[center_in_crop, :]

            # Take the matching labels
            new_labels = labels[center_in_crop]

            # Take the matching difficulties
            if difficulties is not None:
                new_difficulties = difficulties[center_in_crop]
            else:
                new_difficulties = None

            # Clip the top-left corners to the crop, then shift to crop coordinates
            new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])
            new_boxes[:, :2] -= crop[:2]

            # Clip the bottom-right corners to the crop, then shift to crop coordinates
            new_boxes[:, 2:] = torch.min(new_boxes[:, 2:], crop[2:])
            new_boxes[:, 2:] -= crop[:2]

            return F.to_pil_image(new_image), new_boxes, new_labels, new_difficulties
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/sltransform.py DELETED
@@ -1,247 +0,0 @@
1
- # modified from https://github.com/anhtuan85/Data-Augmentation-for-Object-Detection/blob/master/augmentation.ipynb
2
-
3
- import PIL #version 1.2.0
4
- from PIL import Image #version 6.1.0
5
- import torch
6
- import os
7
- import torchvision.transforms.functional as F
8
- import numpy as np
9
- import random
10
-
11
- from .random_crop import random_crop
12
- from util.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
13
-
14
class AdjustContrast:
    """Contrast jitter: scales ``contrast_factor`` by a random value in [0.5, 1.0)."""

    def __init__(self, contrast_factor):
        self.contrast_factor = contrast_factor

    def __call__(self, img, target):
        """
        img (PIL Image or Tensor): Image to be adjusted.

        Returns the adjusted image together with the unchanged target.
        """
        # random.random() lies in [0, 1), so the scale lies in [0.5, 1.0).
        scale = (random.random() + 1.0) / 2.0
        adjusted = F.adjust_contrast(img, scale * self.contrast_factor)
        return adjusted, target
25
-
26
class AdjustBrightness:
    """Brightness jitter: scales ``brightness_factor`` by a random value in [0.5, 1.0)."""

    def __init__(self, brightness_factor):
        self.brightness_factor = brightness_factor

    def __call__(self, img, target):
        """
        img (PIL Image or Tensor): Image to be adjusted.

        Returns the adjusted image together with the unchanged target.
        """
        # random.random() lies in [0, 1), so the scale lies in [0.5, 1.0).
        scale = (random.random() + 1.0) / 2.0
        adjusted = F.adjust_brightness(img, scale * self.brightness_factor)
        return adjusted, target
37
-
38
def lighting_noise(image):
    '''
    Reorder the color channels of an image with a randomly chosen permutation
    (the identity order is one of the six candidates).
    image: A PIL image

    Out: a new PIL image with permuted channels
    '''
    channel_orders = ((0, 1, 2), (0, 2, 1), (1, 0, 2),
                      (1, 2, 0), (2, 0, 1), (2, 1, 0))
    pick = random.randint(0, len(channel_orders) - 1)
    order = channel_orders[pick]

    as_tensor = F.to_tensor(image)
    shuffled = as_tensor[order, :, :]
    return F.to_pil_image(shuffled)
51
-
52
class LightingNoise:
    """Transform wrapper that applies ``lighting_noise`` to the image only."""

    def __init__(self) -> None:
        # Stateless: nothing to configure.
        pass

    def __call__(self, img, target):
        """Return the channel-permuted image with the target passed through unchanged."""
        noisy = lighting_noise(img)
        return noisy, target
58
-
59
-
60
def rotate(image, boxes, angle):
    '''
    Rotate an image and its bounding boxes, then resize back to the input size.
    image: A Pil image (w, h)
    boxes: A tensors of dimensions (#objects, 4), read as (x1, y1, x2, y2)
    angle: rotation angle in degrees (passed to PIL's rotate and np.radians)

    Out: rotated image (w, h), rotated boxes as a tensor of dimensions
         (#objects, 4) holding (x_min, y_min, x_max, y_max) clamped to the image
    '''
    new_image = image.copy()
    new_boxes = boxes.clone()

    #Rotate image, expand = True
    w = image.width
    h = image.height
    cx = w/2
    cy = h/2
    new_image = new_image.rotate(angle, expand=True)
    angle = np.radians(angle)
    alpha = np.cos(angle)
    beta = np.sin(angle)
    #Get affine matrix (rotation about the image center (cx, cy))
    AffineMatrix = torch.tensor([[alpha, beta, (1-alpha)*cx - beta*cy],
                                 [-beta, alpha, beta*cx + (1-alpha)*cy]])

    #Rotation boxes
    box_width = (boxes[:,2] - boxes[:,0]).reshape(-1,1)
    box_height = (boxes[:,3] - boxes[:,1]).reshape(-1,1)

    #Get corners for boxes: (x1,y1) top-left, (x2,y2) top-right,
    #(x3,y3) bottom-left, (x4,y4) bottom-right
    x1 = boxes[:,0].reshape(-1,1)
    y1 = boxes[:,1].reshape(-1,1)

    x2 = x1 + box_width
    y2 = y1

    x3 = x1
    y3 = y1 + box_height

    x4 = boxes[:,2].reshape(-1,1)
    y4 = boxes[:,3].reshape(-1,1)

    corners = torch.stack((x1,y1,x2,y2,x3,y3,x4,y4), dim= 1)
    # corners.reshape(-1, 8) #Tensors of dimensions (#objects, 8)
    corners = corners.reshape(-1,2) #Tensors of dimension (4* #objects, 2)
    # Append a column of ones so the 2x3 affine matrix can be applied with one matmul
    corners = torch.cat((corners, torch.ones(corners.shape[0], 1)), dim= 1) #(Tensors of dimension (4* #objects, 3))

    cos = np.abs(AffineMatrix[0, 0])
    sin = np.abs(AffineMatrix[0, 1])

    # Size of the canvas that fully contains the rotated image
    nW = int((h * sin) + (w * cos))
    nH = int((h * cos) + (w * sin))
    # Shift the translation so coordinates are relative to the enlarged canvas
    AffineMatrix[0, 2] += (nW / 2) - cx
    AffineMatrix[1, 2] += (nH / 2) - cy


    #Apply affine transform
    # corners are cast to float64 before the matmul - presumably to match
    # AffineMatrix, which is built from numpy scalars; TODO confirm dtype
    rotate_corners = torch.mm(AffineMatrix, corners.t().to(torch.float64)).t()
    rotate_corners = rotate_corners.reshape(-1,8)

    x_corners = rotate_corners[:,[0,2,4,6]]
    y_corners = rotate_corners[:,[1,3,5,7]]

    #Get (x_min, y_min, x_max, y_max): axis-aligned box around the rotated corners
    x_min, _ = torch.min(x_corners, dim= 1)
    x_min = x_min.reshape(-1, 1)
    y_min, _ = torch.min(y_corners, dim= 1)
    y_min = y_min.reshape(-1, 1)
    x_max, _ = torch.max(x_corners, dim= 1)
    x_max = x_max.reshape(-1, 1)
    y_max, _ = torch.max(y_corners, dim= 1)
    y_max = y_max.reshape(-1, 1)

    new_boxes = torch.cat((x_min, y_min, x_max, y_max), dim= 1)

    # Ratio between the expanded (rotated) image and the original size
    scale_x = new_image.width / w
    scale_y = new_image.height / h

    #Resize new image to (w, h)

    new_image = new_image.resize((w, h))

    #Resize boxes to the resized image, then clamp them to its bounds
    new_boxes /= torch.Tensor([scale_x, scale_y, scale_x, scale_y])
    new_boxes[:, 0] = torch.clamp(new_boxes[:, 0], 0, w)
    new_boxes[:, 1] = torch.clamp(new_boxes[:, 1], 0, h)
    new_boxes[:, 2] = torch.clamp(new_boxes[:, 2], 0, w)
    new_boxes[:, 3] = torch.clamp(new_boxes[:, 3], 0, h)
    return new_image, new_boxes
148
-
149
- # def convert_xywh_to_xyxy(boxes: torch.Tensor):
150
- # _boxes = boxes.clone()
151
- # box_xy = _boxes[:, :2]
152
- # box_wh = _boxes[:, 2:]
153
- # box_x1y1 = box_xy - box_wh/2
154
- # box_x2y2 = box_xy + box_wh/2
155
- # box_xyxy = torch.cat((box_x1y1, box_x2y2), dim=-1)
156
- # return box_xyxy
157
-
158
class Rotate:
    """Rotate an image by a fixed angle and update ``target['boxes']`` to match.

    ``target['boxes']`` is converted with ``box_cxcywh_to_xyxy`` and multiplied
    by the image size before rotating, then converted back and divided by the
    image size afterwards (so it is treated as normalized cxcywh).
    """

    def __init__(self, angle=10) -> None:
        self.angle = angle

    def __call__(self, img, target):
        width, height = img.size
        size_scale = torch.Tensor([width, height, width, height])

        # To absolute corner format for the geometric transform.
        abs_boxes = box_cxcywh_to_xyxy(target['boxes']) * size_scale
        img, rotated_boxes = rotate(img, abs_boxes, self.angle)

        # Back to center format; 1e-3 is added to the divisor as in the
        # original code (keeps the division safe for a zero-sized image).
        rel_boxes = box_xyxy_to_cxcywh(rotated_boxes).to(abs_boxes.dtype)
        target['boxes'] = rel_boxes / (size_scale + 1e-3)
        return img, target
169
-
170
-
171
class RandomCrop:
    """Best-effort wrapper around ``random_crop``.

    If cropping raises for any reason the sample is returned unchanged, as
    before, but the failure is now logged at DEBUG level instead of being
    discarded silently.
    """

    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        """
        img: image handed to ``random_crop``.
        target: dict with ``'boxes'`` and ``'labels'``; both entries are
                replaced in place when the crop succeeds.

        Returns (image, target).
        """
        import logging  # local import keeps this block self-contained

        try:
            cropped, new_boxes, new_labels, _ = random_crop(
                img, target['boxes'], target['labels'])
        except Exception:
            # Deliberate best-effort: keep the pipeline running, but leave a
            # trace so systematic failures can be diagnosed.
            logging.getLogger(__name__).debug(
                "RandomCrop failed; returning the sample unchanged", exc_info=True)
            return img, target

        target['boxes'] = new_boxes
        target['labels'] = new_labels
        return cropped, target
186
-
187
-
188
class RandomCropDebug:
    """Like ``RandomCrop`` but without the exception guard.

    Errors raised by ``random_crop`` propagate, and the crop runs on clones of
    the boxes and labels.
    """

    def __init__(self) -> None:
        pass

    def __call__(self, img, target):
        # Clone so random_crop cannot modify the tensors stored in target.
        boxes_copy = target['boxes'].clone()
        labels_copy = target['labels'].clone()

        img, cropped_boxes, cropped_labels, _ = random_crop(img, boxes_copy, labels_copy)

        target['boxes'] = cropped_boxes
        target['labels'] = cropped_labels
        return img, target
201
-
202
class RandomSelectMulti(object):
    """
    Apply one transform chosen uniformly at random from ``transformslist``.
    """
    def __init__(self, transformslist, p=-1):
        """
        transformslist: sequence of callables taking and returning (img, target).
        p: reserved; only -1 (uniform selection) is supported.
        """
        self.transformslist = transformslist
        self.p = p
        assert p == -1

    def __call__(self, img, target):
        # Uniform choice is the only supported mode, so select unconditionally:
        # a guard on self.p would let any other value fall through and return
        # None, which callers unpacking (img, target) cannot handle.
        return random.choice(self.transformslist)(img, target)
214
-
215
-
216
class Albumentations:
    """Low-probability pixel-level augmentations from the ``albumentations`` package."""

    def __init__(self):
        # Imported lazily so the package is only required when this transform is used.
        import albumentations as A
        self.transform = A.Compose([
            A.Blur(p=0.01),
            A.MedianBlur(p=0.01),
            A.ToGray(p=0.01),
            A.CLAHE(p=0.01),
            A.RandomBrightnessContrast(p=0.005),
            A.RandomGamma(p=0.005),
            A.ImageCompression(quality_lower=75, p=0.005)],
            bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']))

    def __call__(self, img, target, p=1.0):
        """
        Input:
            img: image convertible with ``np.array``.
            target['boxes']: xyxy, unnormalized data.
            target['labels']: labels aligned with the boxes.
            p: probability of applying the augmentation pipeline.

        Returns (image, target). When the pipeline is skipped the inputs are
        returned unchanged; the previous code reached names that were only
        assigned inside the "apply" branch and raised UnboundLocalError.
        """
        boxes_raw = target['boxes']
        labels_raw = target['labels']

        # Same short-circuit order as before: no random draw without a transform.
        if not self.transform or random.random() >= p:
            return img, target

        img_np = np.array(img)
        new_res = self.transform(image=img_np, bboxes=boxes_raw, class_labels=labels_raw)  # transformed

        # Restore the original dtypes (and box shape) after the round trip.
        boxes_new = torch.Tensor(new_res['bboxes']).to(boxes_raw.dtype).reshape_as(boxes_raw)
        labels_new = torch.Tensor(new_res['class_labels']).to(labels_raw.dtype)
        img_new = Image.fromarray(new_res['image'])

        target['boxes'] = boxes_new
        target['labels'] = labels_new
        return img_new, target
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/datasets/transforms.py DELETED
@@ -1,285 +0,0 @@
1
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
- """
3
- Transforms and data augmentation for both image + bbox.
4
- """
5
- import random
6
-
7
- import PIL
8
- import torch
9
- import torchvision.transforms as T
10
- import torchvision.transforms.functional as F
11
-
12
- from util.box_ops import box_xyxy_to_cxcywh
13
- from util.misc import interpolate
14
-
15
-
16
def crop(image, target, region):
    """Crop ``image`` to ``region`` and update a shallow copy of ``target``.

    ``region`` is unpacked as (i, j, h, w) and forwarded unchanged to
    ``F.crop``. Boxes are shifted by (j, i), clipped to the crop, and entries
    left with zero area are dropped from every tracked field.
    """
    cropped_image = F.crop(image, *region)

    target = target.copy()
    top, left, height, width = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([height, width])

    # Per-instance fields that must be filtered together.
    fields = ["labels", "area"]

    if "boxes" in target:
        bounds = torch.as_tensor([width, height], dtype=torch.float32)
        shifted = target["boxes"] - torch.as_tensor([left, top, left, top])
        # View each box as two (x, y) corners so both clip against (w, h).
        corners = torch.min(shifted.reshape(-1, 2, 2), bounds)
        corners = corners.clamp(min=0)
        target["boxes"] = corners.reshape(-1, 4)
        target["area"] = (corners[:, 1, :] - corners[:, 0, :]).prod(dim=1)
        fields.append("boxes")

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, top:top + height, left:left + width]
        fields.append("masks")

    # remove elements for which the boxes or masks that have zero area
    if "boxes" in target or "masks" in target:
        # favor boxes selection when defining which elements to keep
        # this is compatible with previous implementation
        if "boxes" in target:
            corners = target['boxes'].reshape(-1, 2, 2)
            keep = torch.all(corners[:, 1, :] > corners[:, 0, :], dim=1)
        else:
            keep = target['masks'].flatten(1).any(1)

        for name in fields:
            target[name] = target[name][keep]

    return cropped_image, target
58
-
59
-
60
def hflip(image, target):
    """Flip ``image`` horizontally and mirror a shallow copy of ``target``.

    Boxes are mirrored as x -> width - x with the two x columns swapped;
    masks are flipped along their last dimension.
    """
    flipped_image = F.hflip(image)

    width, _ = image.size

    target = target.copy()
    if "boxes" in target:
        swapped = target["boxes"][:, [2, 1, 0, 3]]
        sign = torch.as_tensor([-1, 1, -1, 1])
        shift = torch.as_tensor([width, 0, width, 0])
        target["boxes"] = swapped * sign + shift

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target
75
-
76
-
77
def resize(image, target, size, max_size=None):
    """Resize ``image`` and rescale a shallow copy of ``target`` accordingly.

    size can be min_size (scalar) or (w, h) tuple. With a scalar, the shorter
    side is resized to ``size`` keeping the aspect ratio; if the longer side
    would then exceed ``max_size``, ``size`` is reduced first.
    Returns (rescaled_image, target); target is None when None was passed.
    """

    def _aspect_preserving_size(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            short_side = float(min((w, h)))
            long_side = float(max((w, h)))
            if long_side / short_side * size > max_size:
                size = int(round(max_size * short_side / long_side))

        # The shorter side already has the requested length: keep the size.
        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            return (int(size * h / w), size)
        return (size, int(size * w / h))

    if isinstance(size, (list, tuple)):
        # (w, h) -> (h, w), the order F.resize is given.
        out_size = size[::-1]
    else:
        out_size = _aspect_preserving_size(image.size, size, max_size)

    rescaled_image = F.resize(image, out_size)

    if target is None:
        return rescaled_image, None

    ratio_width, ratio_height = (
        float(new) / float(old) for new, old in zip(rescaled_image.size, image.size))

    target = target.copy()
    if "boxes" in target:
        box_scale = torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = target["boxes"] * box_scale

    if "area" in target:
        target["area"] = target["area"] * (ratio_width * ratio_height)

    h, w = out_size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        resized_masks = interpolate(
            target['masks'][:, None].float(), out_size, mode="nearest")
        target['masks'] = resized_masks[:, 0] > 0.5

    return rescaled_image, target
134
-
135
-
136
def pad(image, target, padding):
    """Pad ``image`` by ``(padding[0], padding[1])`` and update ``target``.

    assumes that we only pad on the bottom right corners. Returns
    (padded_image, target) where target is a shallow copy with an updated
    ``"size"`` (and padded ``"masks"`` when present), or None if None was given.
    """
    pad_x, pad_y = padding[0], padding[1]
    padded_image = F.pad(image, (0, 0, pad_x, pad_y))
    if target is None:
        return padded_image, None

    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image.size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, pad_x, 0, pad_y))
    return padded_image, target
147
-
148
-
149
class ResizeDebug(object):
    """Deterministic resize transform: always resizes to the configured ``size``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        resized_img, resized_target = resize(img, target, self.size)
        return resized_img, resized_target
155
-
156
-
157
class RandomCrop(object):
    """Crop a random region of the configured ``size`` from image and target."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # torchvision picks the region; crop() applies it to image and target.
        crop_region = T.RandomCrop.get_params(img, self.size)
        return crop(img, target, crop_region)
164
-
165
-
166
class RandomSizeCrop(object):
    """Crop a random region whose sides are drawn from [min_size, max_size]."""

    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img: PIL.Image.Image, target: dict):
        """Return ``crop(img, target, region)`` for a randomly sized region.

        Each side is capped by the matching image dimension. The lower bound
        is capped as well, so an image smaller than ``min_size`` is cropped at
        its full extent instead of making ``random.randint`` raise ValueError
        on an empty range.
        """
        max_w = min(img.width, self.max_size)
        max_h = min(img.height, self.max_size)
        w = random.randint(min(self.min_size, max_w), max_w)
        h = random.randint(min(self.min_size, max_h), max_h)
        region = T.RandomCrop.get_params(img, [h, w])
        return crop(img, target, region)
176
-
177
-
178
class CenterCrop(object):
    """Crop the central ``size`` = (height, width) region of image and target."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        img_w, img_h = img.size
        out_h, out_w = self.size
        # Offsets that center the crop window inside the image.
        top = int(round((img_h - out_h) / 2.))
        left = int(round((img_w - out_w) / 2.))
        return crop(img, target, (top, left, out_h, out_w))
188
-
189
-
190
class RandomHorizontalFlip(object):
    """Horizontally flip image and target with probability ``p``."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if not (random.random() < self.p):
            return img, target
        return hflip(img, target)
198
-
199
-
200
class RandomResize(object):
    """Resize to a size picked at random from ``sizes``, bounded by ``max_size``."""

    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        chosen_size = random.choice(self.sizes)
        return resize(img, target, chosen_size, self.max_size)
209
-
210
-
211
class RandomPad(object):
    """Pad by random amounts in [0, max_pad], drawn independently for x and y."""

    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        # Draw x first, then y, to keep the random sequence unchanged.
        extra_x = random.randint(0, self.max_pad)
        extra_y = random.randint(0, self.max_pad)
        return pad(img, target, (extra_x, extra_y))
219
-
220
-
221
class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """
    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        use_first = random.random() < self.p
        chosen = self.transforms1 if use_first else self.transforms2
        return chosen(img, target)
235
-
236
-
237
class ToTensor(object):
    """Convert the image with ``F.to_tensor``; the target is passed through unchanged."""

    def __call__(self, img, target):
        tensor_img = F.to_tensor(img)
        return tensor_img, target
240
-
241
-
242
class RandomErasing(object):
    """Apply ``torchvision.transforms.RandomErasing`` to the image only."""

    def __init__(self, *args, **kwargs):
        # All arguments are forwarded untouched to the torchvision transform.
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        erased = self.eraser(img)
        return erased, target
249
-
250
-
251
class Normalize(object):
    """Normalize the image and convert boxes to size-relative cxcywh.

    The image goes through ``F.normalize`` with the stored mean/std. Boxes in
    a (shallow-copied) target are converted with ``box_xyxy_to_cxcywh`` and
    divided by the image width/height.
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image, target=None):
        image = F.normalize(image, mean=self.mean, std=self.std)
        if target is None:
            return image, None

        target = target.copy()
        height, width = image.shape[-2:]
        if "boxes" in target:
            converted = box_xyxy_to_cxcywh(target["boxes"])
            size_scale = torch.tensor([width, height, width, height], dtype=torch.float32)
            target["boxes"] = converted / size_scale
        return image, target
268
-
269
-
270
class Compose(object):
    """Chain (image, target) transforms, feeding each output into the next."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for transform in self.transforms:
            image, target = transform(image, target)
        return image, target

    def __repr__(self):
        # One line per transform between "ClassName(" and ")".
        # NOTE(review): the indent inside " {0}" is reproduced exactly as it
        # appears in this view, where whitespace may have been collapsed -
        # confirm against the original file.
        pieces = [self.__class__.__name__ + "("]
        for transform in self.transforms:
            pieces.append("\n")
            pieces.append(" {0}".format(transform))
        pieces.append("\n)")
        return "".join(pieces)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/bertwarper-checkpoint.py DELETED
@@ -1,273 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
-
8
- import torch
9
- import torch.nn.functional as F
10
- import torch.utils.checkpoint as checkpoint
11
- from torch import Tensor, nn
12
- from torchvision.ops.boxes import nms
13
- from transformers import BertConfig, BertModel, BertPreTrainedModel
14
- from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
15
-
16
-
17
- class BertModelWarper(nn.Module):
18
- def __init__(self, bert_model):
19
- super().__init__()
20
- # self.bert = bert_modelc
21
-
22
- self.config = bert_model.config
23
- self.embeddings = bert_model.embeddings
24
- self.encoder = bert_model.encoder
25
- self.pooler = bert_model.pooler
26
-
27
- self.get_extended_attention_mask = bert_model.get_extended_attention_mask
28
- self.invert_attention_mask = bert_model.invert_attention_mask
29
- self.get_head_mask = bert_model.get_head_mask
30
-
31
- def forward(
32
- self,
33
- input_ids=None,
34
- attention_mask=None,
35
- token_type_ids=None,
36
- position_ids=None,
37
- head_mask=None,
38
- inputs_embeds=None,
39
- encoder_hidden_states=None,
40
- encoder_attention_mask=None,
41
- past_key_values=None,
42
- use_cache=None,
43
- output_attentions=None,
44
- output_hidden_states=None,
45
- return_dict=None,
46
- ):
47
- r"""
48
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
49
- Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
50
- the model is configured as a decoder.
51
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
52
- Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
53
- the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
54
-
55
- - 1 for tokens that are **not masked**,
56
- - 0 for tokens that are **masked**.
57
- past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
58
- Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
59
-
60
- If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
61
- (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
62
- instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
63
- use_cache (:obj:`bool`, `optional`):
64
- If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
65
- decoding (see :obj:`past_key_values`).
66
- """
67
- output_attentions = (
68
- output_attentions if output_attentions is not None else self.config.output_attentions
69
- )
70
- output_hidden_states = (
71
- output_hidden_states
72
- if output_hidden_states is not None
73
- else self.config.output_hidden_states
74
- )
75
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
76
-
77
- if self.config.is_decoder:
78
- use_cache = use_cache if use_cache is not None else self.config.use_cache
79
- else:
80
- use_cache = False
81
-
82
- if input_ids is not None and inputs_embeds is not None:
83
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
84
- elif input_ids is not None:
85
- input_shape = input_ids.size()
86
- batch_size, seq_length = input_shape
87
- elif inputs_embeds is not None:
88
- input_shape = inputs_embeds.size()[:-1]
89
- batch_size, seq_length = input_shape
90
- else:
91
- raise ValueError("You have to specify either input_ids or inputs_embeds")
92
-
93
- device = input_ids.device if input_ids is not None else inputs_embeds.device
94
-
95
- # past_key_values_length
96
- past_key_values_length = (
97
- past_key_values[0][0].shape[2] if past_key_values is not None else 0
98
- )
99
-
100
- if attention_mask is None:
101
- attention_mask = torch.ones(
102
- ((batch_size, seq_length + past_key_values_length)), device=device
103
- )
104
- if token_type_ids is None:
105
- token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
106
-
107
- # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
108
- # ourselves in which case we just need to make it broadcastable to all heads.
109
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
110
- attention_mask, input_shape, device
111
- )
112
-
113
- # If a 2D or 3D attention mask is provided for the cross-attention
114
- # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
115
- if self.config.is_decoder and encoder_hidden_states is not None:
116
- encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
117
- encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
118
- if encoder_attention_mask is None:
119
- encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
120
- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
121
- else:
122
- encoder_extended_attention_mask = None
123
- # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
124
- # import ipdb; ipdb.set_trace()
125
-
126
- # Prepare head mask if needed
127
- # 1.0 in head_mask indicate we keep the head
128
- # attention_probs has shape bsz x n_heads x N x N
129
- # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
130
- # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
131
- head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
132
-
133
- embedding_output = self.embeddings(
134
- input_ids=input_ids,
135
- position_ids=position_ids,
136
- token_type_ids=token_type_ids,
137
- inputs_embeds=inputs_embeds,
138
- past_key_values_length=past_key_values_length,
139
- )
140
-
141
- encoder_outputs = self.encoder(
142
- embedding_output,
143
- attention_mask=extended_attention_mask,
144
- head_mask=head_mask,
145
- encoder_hidden_states=encoder_hidden_states,
146
- encoder_attention_mask=encoder_extended_attention_mask,
147
- past_key_values=past_key_values,
148
- use_cache=use_cache,
149
- output_attentions=output_attentions,
150
- output_hidden_states=output_hidden_states,
151
- return_dict=return_dict,
152
- )
153
- sequence_output = encoder_outputs[0]
154
- pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
155
-
156
- if not return_dict:
157
- return (sequence_output, pooled_output) + encoder_outputs[1:]
158
-
159
- return BaseModelOutputWithPoolingAndCrossAttentions(
160
- last_hidden_state=sequence_output,
161
- pooler_output=pooled_output,
162
- past_key_values=encoder_outputs.past_key_values,
163
- hidden_states=encoder_outputs.hidden_states,
164
- attentions=encoder_outputs.attentions,
165
- cross_attentions=encoder_outputs.cross_attentions,
166
- )
167
-
168
-
169
- class TextEncoderShell(nn.Module):
170
- def __init__(self, text_encoder):
171
- super().__init__()
172
- self.text_encoder = text_encoder
173
- self.config = self.text_encoder.config
174
-
175
- def forward(self, **kw):
176
- # feed into text encoder
177
- return self.text_encoder(**kw)
178
-
179
-
180
- def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer):
181
- """Generate attention mask between each pair of special tokens
182
- Args:
183
- input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
184
- special_tokens_mask (list): special tokens mask.
185
- Returns:
186
- torch.Tensor: attention mask between each special tokens.
187
- """
188
- input_ids = tokenized["input_ids"]
189
- bs, num_token = input_ids.shape
190
- # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
191
- special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
192
- for special_token in special_tokens_list:
193
- special_tokens_mask |= input_ids == special_token
194
-
195
- # idxs: each row is a list of indices of special tokens
196
- idxs = torch.nonzero(special_tokens_mask)
197
-
198
- # generate attention mask and positional ids
199
- attention_mask = (
200
- torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
201
- )
202
- position_ids = torch.zeros((bs, num_token), device=input_ids.device)
203
- previous_col = 0
204
- for i in range(idxs.shape[0]):
205
- row, col = idxs[i]
206
- if (col == 0) or (col == num_token - 1):
207
- attention_mask[row, col, col] = True
208
- position_ids[row, col] = 0
209
- else:
210
- attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
211
- position_ids[row, previous_col + 1 : col + 1] = torch.arange(
212
- 0, col - previous_col, device=input_ids.device
213
- )
214
-
215
- previous_col = col
216
-
217
- # # padding mask
218
- # padding_mask = tokenized['attention_mask']
219
- # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()
220
-
221
- return attention_mask, position_ids.to(torch.long)
222
-
223
-
224
- def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer):
225
- """Generate attention mask between each pair of special tokens
226
- Args:
227
- input_ids (torch.Tensor): input ids. Shape: [bs, num_token]
228
- special_tokens_mask (list): special tokens mask.
229
- Returns:
230
- torch.Tensor: attention mask between each special tokens.
231
- """
232
- input_ids = tokenized["input_ids"]
233
- bs, num_token = input_ids.shape
234
- # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens
235
- special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool()
236
- for special_token in special_tokens_list:
237
- special_tokens_mask |= input_ids == special_token
238
-
239
- # idxs: each row is a list of indices of special tokens
240
- idxs = torch.nonzero(special_tokens_mask)
241
-
242
- # generate attention mask and positional ids
243
- attention_mask = (
244
- torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1)
245
- )
246
- position_ids = torch.zeros((bs, num_token), device=input_ids.device)
247
- cate_to_token_mask_list = [[] for _ in range(bs)]
248
- previous_col = 0
249
- for i in range(idxs.shape[0]):
250
- row, col = idxs[i]
251
- if (col == 0) or (col == num_token - 1):
252
- attention_mask[row, col, col] = True
253
- position_ids[row, col] = 0
254
- else:
255
- attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
256
- position_ids[row, previous_col + 1 : col + 1] = torch.arange(
257
- 0, col - previous_col, device=input_ids.device
258
- )
259
- c2t_maski = torch.zeros((num_token), device=input_ids.device).bool()
260
- c2t_maski[previous_col + 1 : col] = True
261
- cate_to_token_mask_list[row].append(c2t_maski)
262
- previous_col = col
263
-
264
- cate_to_token_mask_list = [
265
- torch.stack(cate_to_token_mask_listi, dim=0)
266
- for cate_to_token_mask_listi in cate_to_token_mask_list
267
- ]
268
-
269
- # # padding mask
270
- # padding_mask = tokenized['attention_mask']
271
- # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool()
272
-
273
- return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/fuse_modules-checkpoint.py DELETED
@@ -1,298 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
-
8
- import torch
9
- import torch.nn as nn
10
- import torch.nn.functional as F
11
- from timm.models.layers import DropPath
12
- import loralib as lora
13
-
14
- class FeatureResizer(nn.Module):
15
- """
16
- This class takes as input a set of embeddings of dimension C1 and outputs a set of
17
- embedding of dimension C2, after a linear transformation, dropout and normalization (LN).
18
- """
19
-
20
- def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True):
21
- super().__init__()
22
- self.do_ln = do_ln
23
- r = 12
24
- # Object feature encoding
25
- self.fc = lora.Linear(input_feat_size, output_feat_size,r=r, bias=True)
26
- self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12)
27
- self.dropout = nn.Dropout(dropout)
28
-
29
- def forward(self, encoder_features):
30
- x = self.fc(encoder_features)
31
- if self.do_ln:
32
- x = self.layer_norm(x)
33
- output = self.dropout(x)
34
- return output
35
-
36
-
37
- def l1norm(X, dim, eps=1e-8):
38
- """L1-normalize columns of X"""
39
- norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps
40
- X = torch.div(X, norm)
41
- return X
42
-
43
-
44
- def l2norm(X, dim, eps=1e-8):
45
- """L2-normalize columns of X"""
46
- norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps
47
- X = torch.div(X, norm)
48
- return X
49
-
50
-
51
- def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8):
52
- """
53
- query: (n_context, queryL, d)
54
- context: (n_context, sourceL, d)
55
- """
56
- batch_size_q, queryL = query.size(0), query.size(1)
57
- batch_size, sourceL = context.size(0), context.size(1)
58
-
59
- # Get attention
60
- # --> (batch, d, queryL)
61
- queryT = torch.transpose(query, 1, 2)
62
-
63
- # (batch, sourceL, d)(batch, d, queryL)
64
- # --> (batch, sourceL, queryL)
65
- attn = torch.bmm(context, queryT)
66
- if raw_feature_norm == "softmax":
67
- # --> (batch*sourceL, queryL)
68
- attn = attn.view(batch_size * sourceL, queryL)
69
- attn = nn.Softmax()(attn)
70
- # --> (batch, sourceL, queryL)
71
- attn = attn.view(batch_size, sourceL, queryL)
72
- elif raw_feature_norm == "l2norm":
73
- attn = l2norm(attn, 2)
74
- elif raw_feature_norm == "clipped_l2norm":
75
- attn = nn.LeakyReLU(0.1)(attn)
76
- attn = l2norm(attn, 2)
77
- else:
78
- raise ValueError("unknown first norm type:", raw_feature_norm)
79
- # --> (batch, queryL, sourceL)
80
- attn = torch.transpose(attn, 1, 2).contiguous()
81
- # --> (batch*queryL, sourceL)
82
- attn = attn.view(batch_size * queryL, sourceL)
83
- attn = nn.Softmax()(attn * smooth)
84
- # --> (batch, queryL, sourceL)
85
- attn = attn.view(batch_size, queryL, sourceL)
86
- # --> (batch, sourceL, queryL)
87
- attnT = torch.transpose(attn, 1, 2).contiguous()
88
-
89
- # --> (batch, d, sourceL)
90
- contextT = torch.transpose(context, 1, 2)
91
- # (batch x d x sourceL)(batch x sourceL x queryL)
92
- # --> (batch, d, queryL)
93
- weightedContext = torch.bmm(contextT, attnT)
94
- # --> (batch, queryL, d)
95
- weightedContext = torch.transpose(weightedContext, 1, 2)
96
-
97
- return weightedContext, attnT
98
-
99
-
100
- class BiMultiHeadAttention(nn.Module):
101
- def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None):
102
- super(BiMultiHeadAttention, self).__init__()
103
-
104
- self.embed_dim = embed_dim
105
- self.num_heads = num_heads
106
- self.head_dim = embed_dim // num_heads
107
- self.v_dim = v_dim
108
- self.l_dim = l_dim
109
-
110
- assert (
111
- self.head_dim * self.num_heads == self.embed_dim
112
- ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
113
- self.scale = self.head_dim ** (-0.5)
114
- self.dropout = dropout
115
- r = 12
116
- self.v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r)
117
- self.l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
118
- self.values_v_proj = lora.Linear(self.v_dim, self.embed_dim , r=r )
119
- self.values_l_proj = lora.Linear(self.l_dim, self.embed_dim , r=r )
120
-
121
- self.out_v_proj = lora.Linear(self.embed_dim, self.v_dim , r=r )
122
- self.out_l_proj = lora.Linear(self.embed_dim, self.l_dim , r=r )
123
-
124
- self.stable_softmax_2d = True
125
- self.clamp_min_for_underflow = True
126
- self.clamp_max_for_overflow = True
127
-
128
- self._reset_parameters()
129
-
130
- def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
131
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
132
-
133
- def _reset_parameters(self):
134
- nn.init.xavier_uniform_(self.v_proj.weight)
135
- self.v_proj.bias.data.fill_(0)
136
- nn.init.xavier_uniform_(self.l_proj.weight)
137
- self.l_proj.bias.data.fill_(0)
138
- nn.init.xavier_uniform_(self.values_v_proj.weight)
139
- self.values_v_proj.bias.data.fill_(0)
140
- nn.init.xavier_uniform_(self.values_l_proj.weight)
141
- self.values_l_proj.bias.data.fill_(0)
142
- nn.init.xavier_uniform_(self.out_v_proj.weight)
143
- self.out_v_proj.bias.data.fill_(0)
144
- nn.init.xavier_uniform_(self.out_l_proj.weight)
145
- self.out_l_proj.bias.data.fill_(0)
146
-
147
- def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
148
- """_summary_
149
-
150
- Args:
151
- v (_type_): bs, n_img, dim
152
- l (_type_): bs, n_text, dim
153
- attention_mask_v (_type_, optional): _description_. bs, n_img
154
- attention_mask_l (_type_, optional): _description_. bs, n_text
155
-
156
- Returns:
157
- _type_: _description_
158
- """
159
- # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
160
- # import ipdb; ipdb.set_trace()
161
- bsz, tgt_len, _ = v.size()
162
-
163
- query_states = self.v_proj(v) * self.scale
164
- key_states = self._shape(self.l_proj(l), -1, bsz)
165
- value_v_states = self._shape(self.values_v_proj(v), -1, bsz)
166
- value_l_states = self._shape(self.values_l_proj(l), -1, bsz)
167
-
168
- proj_shape = (bsz * self.num_heads, -1, self.head_dim)
169
- query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
170
- key_states = key_states.view(*proj_shape)
171
- value_v_states = value_v_states.view(*proj_shape)
172
- value_l_states = value_l_states.view(*proj_shape)
173
-
174
- src_len = key_states.size(1)
175
- attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt
176
-
177
- if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
178
- raise ValueError(
179
- f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
180
- )
181
-
182
- if self.stable_softmax_2d:
183
- attn_weights = attn_weights - attn_weights.max()
184
-
185
- if self.clamp_min_for_underflow:
186
- attn_weights = torch.clamp(
187
- attn_weights, min=-50000
188
- ) # Do not increase -50000, data type half has quite limited range
189
- if self.clamp_max_for_overflow:
190
- attn_weights = torch.clamp(
191
- attn_weights, max=50000
192
- ) # Do not increase 50000, data type half has quite limited range
193
-
194
- attn_weights_T = attn_weights.transpose(1, 2)
195
- attn_weights_l = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0]
196
- if self.clamp_min_for_underflow:
197
- attn_weights_l = torch.clamp(
198
- attn_weights_l, min=-50000
199
- ) # Do not increase -50000, data type half has quite limited range
200
- if self.clamp_max_for_overflow:
201
- attn_weights_l = torch.clamp(
202
- attn_weights_l, max=50000
203
- ) # Do not increase 50000, data type half has quite limited range
204
-
205
- # mask vison for language
206
- if attention_mask_v is not None:
207
- attention_mask_v = (
208
- attention_mask_v[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
209
- )
210
- attn_weights_l.masked_fill_(attention_mask_v, float("-inf"))
211
-
212
- attn_weights_l = attn_weights_l.softmax(dim=-1)
213
-
214
- # mask language for vision
215
- if attention_mask_l is not None:
216
- attention_mask_l = (
217
- attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
218
- )
219
- attn_weights.masked_fill_(attention_mask_l, float("-inf"))
220
- attn_weights_v = attn_weights.softmax(dim=-1)
221
-
222
- attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training)
223
- attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training)
224
-
225
- attn_output_v = torch.bmm(attn_probs_v, value_l_states)
226
- attn_output_l = torch.bmm(attn_probs_l, value_v_states)
227
-
228
- if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
229
- raise ValueError(
230
- f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}"
231
- )
232
-
233
- if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim):
234
- raise ValueError(
235
- f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}"
236
- )
237
-
238
- attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim)
239
- attn_output_v = attn_output_v.transpose(1, 2)
240
- attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim)
241
-
242
- attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim)
243
- attn_output_l = attn_output_l.transpose(1, 2)
244
- attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim)
245
-
246
- attn_output_v = self.out_v_proj(attn_output_v)
247
- attn_output_l = self.out_l_proj(attn_output_l)
248
-
249
- return attn_output_v, attn_output_l
250
-
251
-
252
- # Bi-Direction MHA (text->image, image->text)
253
- class BiAttentionBlock(nn.Module):
254
- def __init__(
255
- self,
256
- v_dim,
257
- l_dim,
258
- embed_dim,
259
- num_heads,
260
- dropout=0.1,
261
- drop_path=0.0,
262
- init_values=1e-4,
263
- cfg=None,
264
- ):
265
- """
266
- Inputs:
267
- embed_dim - Dimensionality of input and attention feature vectors
268
- hidden_dim - Dimensionality of hidden layer in feed-forward network
269
- (usually 2-4x larger than embed_dim)
270
- num_heads - Number of heads to use in the Multi-Head Attention block
271
- dropout - Amount of dropout to apply in the feed-forward network
272
- """
273
- super(BiAttentionBlock, self).__init__()
274
-
275
- # pre layer norm
276
- self.layer_norm_v = nn.LayerNorm(v_dim)
277
- self.layer_norm_l = nn.LayerNorm(l_dim)
278
- self.attn = BiMultiHeadAttention(
279
- v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout
280
- )
281
-
282
- # add layer scale for training stability
283
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
284
- self.gamma_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True)
285
- self.gamma_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True)
286
-
287
- def forward(self, v, l, attention_mask_v=None, attention_mask_l=None):
288
- v = self.layer_norm_v(v)
289
- l = self.layer_norm_l(l)
290
- delta_v, delta_l = self.attn(
291
- v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l
292
- )
293
- # v, l = v + delta_v, l + delta_l
294
- v = v + self.drop_path(self.gamma_v * delta_v)
295
- l = l + self.drop_path(self.gamma_l * delta_l)
296
- return v, l
297
-
298
- # def forward(self, v:List[torch.Tensor], l, attention_mask_v=None, attention_mask_l=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/groundingdino-checkpoint.py DELETED
@@ -1,857 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # Conditional DETR model and criterion classes.
8
- # Copyright (c) 2021 Microsoft. All Rights Reserved.
9
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
- # ------------------------------------------------------------------------
11
- # Modified from DETR (https://github.com/facebookresearch/detr)
12
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
13
- # ------------------------------------------------------------------------
14
- # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
15
- # Copyright (c) 2020 SenseTime. All Rights Reserved.
16
- # ------------------------------------------------------------------------
17
- import copy
18
- from typing import List
19
-
20
- import torch
21
- import torch.nn.functional as F
22
- from torch import nn
23
- from torchvision.ops.boxes import nms
24
- from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast
25
-
26
- from groundingdino.util import box_ops, get_tokenlizer
27
- from groundingdino.util.misc import (
28
- NestedTensor,
29
- accuracy,
30
- get_world_size,
31
- interpolate,
32
- inverse_sigmoid,
33
- is_dist_avail_and_initialized,
34
- nested_tensor_from_tensor_list,
35
- )
36
- from groundingdino.util.utils import get_phrases_from_posmap
37
- from groundingdino.util.visualizer import COCOVisualizer
38
- from groundingdino.util.vl_utils import create_positive_map_from_span
39
-
40
- from ..registry import MODULE_BUILD_FUNCS
41
- from .backbone import build_backbone
42
- from .bertwarper import (
43
- BertModelWarper,
44
- generate_masks_with_special_tokens,
45
- generate_masks_with_special_tokens_and_transfer_map,
46
- )
47
- from .transformer import build_transformer
48
- from .utils import MLP, ContrastiveEmbed, sigmoid_focal_loss
49
-
50
- from .matcher import build_matcher
51
-
52
-
53
-
54
-
55
class GroundingDINO(nn.Module):
    """This is the Cross-Attention Detector module that performs object detection"""

    def __init__(
        self,
        backbone,
        transformer,
        num_queries,
        aux_loss=False,
        iter_update=False,
        query_dim=2,
        num_feature_levels=1,
        nheads=8,
        # two stage
        two_stage_type="no",  # ['no', 'standard']
        dec_pred_bbox_embed_share=True,
        two_stage_class_embed_share=True,
        two_stage_bbox_embed_share=True,
        num_patterns=0,
        dn_number=100,
        dn_box_noise_scale=0.4,
        dn_label_noise_ratio=0.5,
        dn_labelbook_size=100,
        text_encoder_type="bert-base-uncased",
        sub_sentence_present=True,
        max_text_len=256,
    ):
        """Initializes the model.

        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_queries: number of object queries, ie detection slot. This is the maximal
                number of objects the model can detect in a single image.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are
                to be used.
            iter_update: must be truthy (asserted below).
            query_dim: must be 4 (asserted below).
            num_feature_levels: number of feature levels fed to the transformer; extra
                levels beyond the backbone outputs are produced by strided convolutions.
            two_stage_type: "no" or "standard".
            dec_pred_bbox_embed_share: share one box head across all decoder layers.
            two_stage_class_embed_share / two_stage_bbox_embed_share: share the decoder
                heads with the encoder-output heads when two-stage is enabled.
            text_encoder_type: name passed to the tokenizer / language-model loaders.
            sub_sentence_present: when True the text encoder receives the block-wise
                self-attention mask and explicit position ids.
            max_text_len: accepted for interface compatibility.
                NOTE(review): the value is currently ignored and 256 is used; other
                code in this file (create_positive_map, PostProcess) hard-codes 256 as
                well, so honouring this argument needs a coordinated change.
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        self.hidden_dim = hidden_dim = transformer.d_model
        self.num_feature_levels = num_feature_levels
        self.nheads = nheads
        self.max_text_len = 256
        self.sub_sentence_present = sub_sentence_present

        # setting query dim
        self.query_dim = query_dim
        assert query_dim == 4

        # for dn training
        self.num_patterns = num_patterns
        self.dn_number = dn_number
        self.dn_box_noise_scale = dn_box_noise_scale
        self.dn_label_noise_ratio = dn_label_noise_ratio
        self.dn_labelbook_size = dn_labelbook_size

        # bert
        self.tokenizer = get_tokenlizer.get_tokenlizer(text_encoder_type)
        self.bert = get_tokenlizer.get_pretrained_language_model(text_encoder_type)
        # The pooler output is never used, so keep its weights out of training.
        self.bert.pooler.dense.weight.requires_grad_(False)
        self.bert.pooler.dense.bias.requires_grad_(False)
        self.bert = BertModelWarper(bert_model=self.bert)

        # Projects text features to the transformer width.
        self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias=True)
        nn.init.constant_(self.feat_map.bias.data, 0)
        nn.init.xavier_uniform_(self.feat_map.weight.data)

        # special tokens
        self.specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])

        # prepare input projection layers
        if num_feature_levels > 1:
            num_backbone_outs = len(backbone.num_channels)
            input_proj_list = []
            for level in range(num_backbone_outs):
                in_channels = backbone.num_channels[level]
                input_proj_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                        nn.GroupNorm(32, hidden_dim),
                    )
                )
            # Extra levels: the first one downsamples the last backbone map, every
            # further one downsamples the previous extra level (hidden_dim channels).
            for _ in range(num_feature_levels - num_backbone_outs):
                input_proj_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
                        nn.GroupNorm(32, hidden_dim),
                    )
                )
                in_channels = hidden_dim
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!"
            self.input_proj = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.Conv2d(backbone.num_channels[-1], hidden_dim, kernel_size=1),
                        nn.GroupNorm(32, hidden_dim),
                    )
                ]
            )

        self.backbone = backbone
        self.aux_loss = aux_loss
        self.box_pred_damping = None

        self.iter_update = iter_update
        assert iter_update, "Why not iter_update?"

        # prepare pred layers
        self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share
        # prepare class & box embed
        _class_embed = ContrastiveEmbed()

        _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
        # Zero-init the last layer so the initial box delta is zero.
        nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
        nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)

        num_dec_layers = transformer.num_decoder_layers
        if dec_pred_bbox_embed_share:
            box_embed_layerlist = [_bbox_embed for _ in range(num_dec_layers)]
        else:
            box_embed_layerlist = [copy.deepcopy(_bbox_embed) for _ in range(num_dec_layers)]
        class_embed_layerlist = [_class_embed for _ in range(num_dec_layers)]
        self.bbox_embed = nn.ModuleList(box_embed_layerlist)
        self.class_embed = nn.ModuleList(class_embed_layerlist)
        self.transformer.decoder.bbox_embed = self.bbox_embed
        self.transformer.decoder.class_embed = self.class_embed

        # two stage
        self.two_stage_type = two_stage_type
        assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format(
            two_stage_type
        )
        if two_stage_type != "no":
            if two_stage_bbox_embed_share:
                assert dec_pred_bbox_embed_share
                self.transformer.enc_out_bbox_embed = _bbox_embed
            else:
                self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)

            if two_stage_class_embed_share:
                assert dec_pred_bbox_embed_share
                self.transformer.enc_out_class_embed = _class_embed
            else:
                self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed)

            self.refpoint_embed = None

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise the 1x1 / 3x3 input projection convolutions."""
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)

    def init_ref_points(self, use_num_queries):
        """Create a learnable reference-point embedding of ``use_num_queries`` rows."""
        self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)

    def forward(self, samples: NestedTensor, targets: List = None, **kw):
        """The forward expects a NestedTensor, which consists of:
           - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        A plain tensor or a list of tensors is also accepted and converted with
        ``nested_tensor_from_tensor_list``. Captions are taken from
        ``targets[i]["caption"]`` when ``targets`` is given, otherwise from
        ``kw["captions"]``.

        It returns a dict with the following elements:
           - "pred_logits": the classification logits for all queries (last decoder layer).
           - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                           (center_x, center_y, width, height). These values are normalized in [0, 1],
                           relative to the size of each individual image (disregarding possible padding).
                           See PostProcess for information on how to retrieve the unnormalized bounding box.
           - "text_mask": bool tensor [batch_size x max_text_len], True on real text tokens.
           - "token": the tokenizer output for the captions (used by the criterion).
           - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of
                            dictionaries containing "pred_logits" and "pred_boxes" for each decoder layer.
           - "interm_outputs" / "interm_outputs_for_matching_pre": only present when the
                            transformer returns encoder outputs.
        """
        if targets is None:
            captions = kw["captions"]
        else:
            captions = [t["caption"] for t in targets]

        # Convert raw inputs first: a Python list has no `.device`, so doing this
        # after tokenisation (as before) crashed for list inputs.
        if isinstance(samples, (list, torch.Tensor)):
            samples = nested_tensor_from_tensor_list(samples)
        device = samples.device

        # encode texts
        tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to(device)
        one_hot_token = tokenized

        (
            text_self_attention_masks,
            position_ids,
            cate_to_token_mask_list,
        ) = generate_masks_with_special_tokens_and_transfer_map(
            tokenized, self.specical_tokens, self.tokenizer
        )

        # Truncate everything fed to the text encoder to at most max_text_len tokens.
        if text_self_attention_masks.shape[1] > self.max_text_len:
            text_self_attention_masks = text_self_attention_masks[
                :, : self.max_text_len, : self.max_text_len
            ]
            position_ids = position_ids[:, : self.max_text_len]
            tokenized["input_ids"] = tokenized["input_ids"][:, : self.max_text_len]
            tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.max_text_len]
            tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.max_text_len]

        # extract text embeddings
        if self.sub_sentence_present:
            # Replace the padding mask with the per-phrase self-attention mask.
            tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"}
            tokenized_for_encoder["attention_mask"] = text_self_attention_masks
            tokenized_for_encoder["position_ids"] = position_ids
        else:
            tokenized_for_encoder = tokenized

        bert_output = self.bert(**tokenized_for_encoder)

        encoded_text = self.feat_map(bert_output["last_hidden_state"])  # bs, n_tokens, d_model
        text_token_mask = tokenized.attention_mask.bool()  # bs, n_tokens
        # text_token_mask: True for nomask, False for mask
        # text_self_attention_masks: True for nomask, False for mask

        if encoded_text.shape[1] > self.max_text_len:
            encoded_text = encoded_text[:, : self.max_text_len, :]
            text_token_mask = text_token_mask[:, : self.max_text_len]
            position_ids = position_ids[:, : self.max_text_len]
            text_self_attention_masks = text_self_attention_masks[
                :, : self.max_text_len, : self.max_text_len
            ]

        text_dict = {
            "encoded_text": encoded_text,
            "text_token_mask": text_token_mask,
            "position_ids": position_ids,
            "text_self_attention_masks": text_self_attention_masks,
        }

        features, poss = self.backbone(samples)
        srcs = []
        masks = []
        for level, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[level](src))
            masks.append(mask)
            assert mask is not None
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for level in range(_len_srcs, self.num_feature_levels):
                if level == _len_srcs:
                    src = self.input_proj[level](features[-1].tensors)
                else:
                    src = self.input_proj[level](srcs[-1])
                m = samples.mask
                # Resize the padding mask to the spatial size of the new level.
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                poss.append(pos_l)

        input_query_bbox = input_query_label = attn_mask = dn_meta = None
        hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(
            srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict
        )

        # deformable-detr-like anchor update: each layer predicts a delta in
        # logit space relative to that layer's reference boxes.
        outputs_coord_list = []
        for layer_ref_sig, layer_bbox_embed, layer_hs in zip(reference[:-1], self.bbox_embed, hs):
            layer_delta_unsig = layer_bbox_embed(layer_hs)
            layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig)
            outputs_coord_list.append(layer_outputs_unsig.sigmoid())
        outputs_coord_list = torch.stack(outputs_coord_list)

        outputs_class = torch.stack(
            [
                layer_cls_embed(layer_hs, text_dict)
                for layer_cls_embed, layer_hs in zip(self.class_embed, hs)
            ]
        )

        out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]}

        # Used to calculate losses: the token mask padded with False up to max_text_len.
        token_mask = text_dict["text_token_mask"]
        bs, len_td = token_mask.shape
        text_mask = torch.zeros(bs, self.max_text_len, dtype=torch.bool, device=device)
        text_mask[:, :len_td] = token_mask
        out["text_mask"] = text_mask

        # for intermediate outputs
        if self.aux_loss:
            out["aux_outputs"] = self._set_aux_loss(outputs_class, outputs_coord_list)
        # SetCriterion.forward reads outputs['token'] unconditionally, so this is
        # always populated regardless of aux_loss.
        out["token"] = one_hot_token

        # for encoder output
        if hs_enc is not None:
            # prepare intermediate outputs
            interm_coord = ref_enc[-1]
            interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict)
            out["interm_outputs"] = {"pred_logits": interm_class, "pred_boxes": interm_coord}
            out["interm_outputs_for_matching_pre"] = {
                "pred_logits": interm_class,
                "pred_boxes": init_box_proposal,
            }

        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [
            {"pred_logits": a, "pred_boxes": b}
            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
        ]
403
-
404
-
405
-
406
-
407
class SetCriterion(nn.Module):
    """Computes the training losses for the detector.

    For every set of predictions (final layer, each auxiliary decoder layer and
    the optional encoder "interm" output) the criterion first matches
    predictions to targets with ``self.matcher`` and then evaluates each loss
    listed in ``self.losses``.
    """

    def __init__(self, matcher, weight_dict, focal_alpha,focal_gamma, losses):
        """ Create the criterion.
        Parameters:
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            focal_alpha: alpha in Focal Loss
            focal_gamma: gamma (focusing exponent) in Focal Loss
            losses: list of all the losses to be applied. See get_loss for list of available losses.
        """
        super().__init__()
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.losses = losses
        self.focal_alpha = focal_alpha
        self.focal_gamma = focal_gamma

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
        """
        pred_logits = outputs['pred_logits']
        device = pred_logits.device
        tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
        return {'cardinality_error': card_err}

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.

        Returns a dict with 'loss_bbox', 'loss_giou' and the logging-only
        'loss_xy' / 'loss_hw' (computed without gradients).
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {'loss_bbox': loss_bbox.sum() / num_boxes}

        # Only the diagonal pairs (prediction k with its own matched target k) matter.
        loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes

        # calculate the x,y and h,w loss
        with torch.no_grad():
            losses['loss_xy'] = loss_bbox[..., :2].sum() / num_boxes
            losses['loss_hw'] = loss_bbox[..., 2:].sum() / num_boxes

        return losses

    def token_sigmoid_binary_focal_loss(self, outputs, targets, indices, num_boxes):
        """Sigmoid focal loss between token logits and the one-hot token targets.

        Reads ``outputs['pred_logits']`` (batch x queries x tokens),
        ``outputs['one_hot']`` (same shape) and ``outputs['text_mask']``
        (batch x tokens). Only positions where the text mask is True contribute.
        The summed loss is divided by the number of matched predictions in
        ``indices`` (at least 1).
        """
        pred_logits = outputs['pred_logits']
        new_targets = outputs['one_hot'].to(pred_logits.device)
        text_mask = outputs['text_mask']

        assert (new_targets.dim() == 3)
        assert (pred_logits.dim() == 3)  # batch x from x to

        alpha = self.focal_alpha
        gamma = self.focal_gamma
        if text_mask is not None:
            # ODVG: each sample has different mask. Broadcast it over the query axis.
            query_mask = text_mask.unsqueeze(1).expand(-1, pred_logits.size(1), -1)
            pred_logits = torch.masked_select(pred_logits, query_mask)
            new_targets = torch.masked_select(new_targets, query_mask)

        new_targets = new_targets.float()
        p = torch.sigmoid(pred_logits)
        ce_loss = F.binary_cross_entropy_with_logits(pred_logits, new_targets, reduction="none")
        p_t = p * new_targets + (1 - p) * (1 - new_targets)
        loss = ce_loss * ((1 - p_t) ** gamma)

        if alpha >= 0:
            alpha_t = alpha * new_targets + (1 - alpha) * (1 - new_targets)
            loss = alpha_t * loss

        total_num_pos = sum(len(src) for src, _ in indices)
        num_pos_avg_per_gpu = max(total_num_pos, 1.0)
        loss = loss.sum() / num_pos_avg_per_gpu

        return {'loss_ce': loss}

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        """Dispatch to the loss function registered under the name ``loss``."""
        loss_map = {
            'labels': self.token_sigmoid_binary_focal_loss,
            'cardinality': self.loss_cardinality,
            'boxes': self.loss_boxes,
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    def _match_and_encode(self, preds, targets, label_map_list, one_hot_shape):
        """Match one set of predictions per image and build its one-hot token targets.

        Returns ``(indices, one_hot)`` where ``indices`` is the per-image list of
        ``(prediction_idx, target_idx)`` pairs from the matcher and ``one_hot`` is
        an int64 CPU tensor of ``one_hot_shape`` whose matched query rows hold the
        positive token map of the matched target's label.
        """
        indices = []
        for j, label_map in enumerate(label_map_list):
            single_image = {
                'pred_logits': preds['pred_logits'][j].unsqueeze(0),
                'pred_boxes': preds['pred_boxes'][j].unsqueeze(0),
            }
            indices.extend(self.matcher(single_image, [targets[j]], label_map))

        one_hot = torch.zeros(one_hot_shape, dtype=torch.int64)
        for i, (src_idx, tgt_idx) in enumerate(indices):
            matched_labels = targets[i]["labels"].cpu()[tgt_idx]
            one_hot[i, src_idx] = label_map_list[i][matched_labels].to(torch.long)
        return indices, one_hot

    def forward(self, outputs, targets, cat_list, caption, return_indices=False):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc
             cat_list: per-image list of category names used to build the token maps
             caption: per-image caption string the category names are located in
             return_indices: used for vis. if True, the matching indices of every
                      processed output (aux layers, interm, then the final layer) are
                      returned as well.

        Side effects: writes 'one_hot' into ``outputs`` and 'one_hot' / 'text_mask'
        into every aux / interm output dict.
        """
        device = next(iter(outputs.values())).device
        one_hot_shape = outputs['pred_logits'].size()  # e.g. [bs, num_queries, 256]
        token = outputs['token']

        # One (num_categories x tokens) positive map per image.
        label_map_list = []
        for j, categories in enumerate(cat_list):
            per_label_maps = [
                create_positive_map(token[j], torch.tensor([i]), categories, caption[j])
                for i in range(len(categories))
            ]
            label_map_list.append(torch.stack(per_label_maps, dim=0).squeeze(1))

        # indices : A list of size batch_size, containing tuples of (index_i, index_j) where:
        #     - index_i is the indices of the selected predictions (in order)
        #     - index_j is the indices of the corresponding selected targets (in order)
        indices, outputs['one_hot'] = self._match_and_encode(
            outputs, targets, label_map_list, one_hot_shape
        )
        final_layer_indices = indices
        indices_list = []

        # Compute the average number of target boxes across all nodes, for normalization purposes
        num_boxes = sum(len(t["labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for idx, aux_outputs in enumerate(outputs['aux_outputs']):
                indices, aux_outputs['one_hot'] = self._match_and_encode(
                    aux_outputs, targets, label_map_list, one_hot_shape
                )
                aux_outputs['text_mask'] = outputs['text_mask']
                if return_indices:
                    indices_list.append(indices)
                for loss in self.losses:
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes)
                    losses.update({k + f'_{idx}': v for k, v in l_dict.items()})

        # interm_outputs loss
        if 'interm_outputs' in outputs:
            interm_outputs = outputs['interm_outputs']
            indices, interm_outputs['one_hot'] = self._match_and_encode(
                interm_outputs, targets, label_map_list, one_hot_shape
            )
            interm_outputs['text_mask'] = outputs['text_mask']
            if return_indices:
                indices_list.append(indices)
            for loss in self.losses:
                l_dict = self.get_loss(loss, interm_outputs, targets, indices, num_boxes)
                losses.update({k + f'_interm': v for k, v in l_dict.items()})

        if return_indices:
            indices_list.append(final_layer_indices)
            return losses, indices_list

        return losses
640
-
641
-
642
class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""

    def __init__(self, num_select=100,text_encoder_type='text_encoder_type', nms_iou_threshold=-1,use_coco_eval=False,args=None) -> None:
        """Build the label -> token positive map used to score detections.

        Parameters:
            num_select: number of top-scoring (query, label) pairs kept per image.
            text_encoder_type: name passed to the tokenizer loader.
            nms_iou_threshold: IoU threshold for per-image NMS; NMS is skipped when <= 0.
            use_coco_eval: use the COCO category names / ids. ``args.use_coco_eval``
                enables the same mode.
            args: config object; must provide ``label_list`` (or ``coco_val_path``
                in COCO mode).

        Raises:
            ValueError: if ``args`` is not provided.
        """
        super().__init__()
        if args is None:
            raise ValueError(
                "PostProcess requires `args` providing `label_list` "
                "(or `coco_val_path` when COCO evaluation is enabled)"
            )
        coco_mode = bool(use_coco_eval) or bool(getattr(args, 'use_coco_eval', False))

        self.num_select = num_select
        self.tokenizer = get_tokenlizer.get_tokenlizer(text_encoder_type)
        if coco_mode:
            from pycocotools.coco import COCO
            coco = COCO(args.coco_val_path)
            category_dict = coco.loadCats(coco.getCatIds())
            cat_list = [item['name'] for item in category_dict]
        else:
            cat_list = args.label_list
        caption = " . ".join(cat_list) + ' .'
        tokenized = self.tokenizer(caption, padding="longest", return_tensors="pt")
        label_list = torch.arange(len(cat_list))
        pos_map = create_positive_map(tokenized, label_list, cat_list, caption)
        # build a mapping from label_id to pos_map
        if coco_mode:
            # Contiguous 0..79 class index -> sparse COCO category id (1..90).
            id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
                      41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
            new_pos_map = torch.zeros((91, 256))
            for k, v in id_map.items():
                new_pos_map[v] = pos_map[k]
            pos_map = new_pos_map

        self.nms_iou_threshold = nms_iou_threshold
        self.positive_map = pos_map

    @torch.no_grad()
    def forward(self, outputs, target_sizes, not_to_xyxy=False, test=False):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
            not_to_xyxy: keep boxes in the model's (cx, cy, w, h) format instead of converting to corners.
            test: unused.

        Returns:
            One dict per image with 'scores', 'labels' and 'boxes' (absolute
            coordinates). When ``nms_iou_threshold > 0`` each image's detections
            are additionally filtered with NMS, so lengths may differ per image.
        """
        num_select = self.num_select
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']

        prob_to_token = out_logits.sigmoid()
        pos_maps = self.positive_map.to(prob_to_token.device)
        # Normalise every non-empty label row to sum to 1. Done out of place so the
        # stored positive_map is not modified by a forward pass.
        row_sums = pos_maps.sum(dim=-1, keepdim=True)
        pos_maps = pos_maps / row_sums.masked_fill(row_sums == 0, 1)

        # Label score = weighted average of the probabilities of its tokens.
        prob_to_label = prob_to_token @ pos_maps.T

        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2

        prob = prob_to_label
        # Top-k over the flattened (query, label) grid, then recover both indices.
        topk_values, topk_indexes = torch.topk(prob.view(prob.shape[0], -1), num_select, dim=1)
        scores = topk_values
        topk_boxes = torch.div(topk_indexes, prob.shape[2], rounding_mode='trunc')
        labels = topk_indexes % prob.shape[2]
        if not_to_xyxy:
            boxes = out_bbox
        else:
            boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)

        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]

        if self.nms_iou_threshold > 0:
            results = []
            for s, l, b in zip(scores, labels, boxes):
                keep = nms(b, s, iou_threshold=self.nms_iou_threshold)
                results.append({'scores': s[keep], 'labels': l[keep], 'boxes': b[keep]})
        else:
            results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)]
        return results
724
-
725
-
726
@MODULE_BUILD_FUNCS.registe_with_name(module_name="groundingdino")
def build_groundingdino(args):
    """Build the model, its training criterion and the box post-processor.

    Parameters:
        args: config object. Reads the model / matcher settings, the loss
            coefficients (``cls_loss_coef``, ``bbox_loss_coef``, ``giou_loss_coef``),
            ``aux_loss``, ``dec_layers``, ``two_stage_type`` and the post-processing
            options. ``no_interm_box_loss`` and ``interm_loss_coef`` are optional and
            default to ``False`` and ``1.0``.

    Returns:
        Tuple ``(model, criterion, postprocessors)`` where ``postprocessors`` is a
        dict with a single ``'bbox'`` entry.
    """
    device = torch.device(args.device)
    backbone = build_backbone(args)
    transformer = build_transformer(args)

    model = GroundingDINO(
        backbone,
        transformer,
        num_queries=args.num_queries,
        aux_loss=args.aux_loss,
        iter_update=True,
        query_dim=4,
        num_feature_levels=args.num_feature_levels,
        nheads=args.nheads,
        dec_pred_bbox_embed_share=args.dec_pred_bbox_embed_share,
        two_stage_type=args.two_stage_type,
        two_stage_bbox_embed_share=args.two_stage_bbox_embed_share,
        two_stage_class_embed_share=args.two_stage_class_embed_share,
        num_patterns=args.num_patterns,
        dn_number=0,
        dn_box_noise_scale=args.dn_box_noise_scale,
        dn_label_noise_ratio=args.dn_label_noise_ratio,
        dn_labelbook_size=args.dn_labelbook_size,
        text_encoder_type=args.text_encoder_type,
        sub_sentence_present=args.sub_sentence_present,
        max_text_len=args.max_text_len,
    )

    matcher = build_matcher(args)

    # prepare weight dict: start from the final-layer weights ...
    base_weights = {
        'loss_ce': args.cls_loss_coef,
        'loss_bbox': args.bbox_loss_coef,
        'loss_giou': args.giou_loss_coef,
    }
    weight_dict = dict(base_weights)

    # ... then mirror them for every auxiliary decoder layer (suffix `_<layer>`),
    # matching the key names produced by SetCriterion.forward.
    if args.aux_loss:
        for i in range(args.dec_layers - 1):
            weight_dict.update({k + f'_{i}': v for k, v in base_weights.items()})

    # ... and for the encoder ("interm") output when two-stage is enabled.
    if args.two_stage_type != 'no':
        no_interm_box_loss = getattr(args, 'no_interm_box_loss', False)
        interm_loss_coef = getattr(args, 'interm_loss_coef', 1.0)
        box_coeff = 0.0 if no_interm_box_loss else 1.0
        _coeff_weight_dict = {
            'loss_ce': 1.0,
            'loss_bbox': box_coeff,
            'loss_giou': box_coeff,
        }
        weight_dict.update({
            k + f'_interm': v * interm_loss_coef * _coeff_weight_dict[k]
            for k, v in base_weights.items()
        })

    # losses = ['labels', 'boxes', 'cardinality']
    losses = ['labels', 'boxes']

    criterion = SetCriterion(
        matcher=matcher,
        weight_dict=weight_dict,
        focal_alpha=args.focal_alpha,
        focal_gamma=args.focal_gamma,
        losses=losses,
    )
    criterion.to(device)
    postprocessors = {
        'bbox': PostProcess(
            num_select=args.num_select,
            text_encoder_type=args.text_encoder_type,
            nms_iou_threshold=args.nms_iou_threshold,
            args=args,
        )
    }

    return model, criterion, postprocessors
807
-
808
- def create_positive_map(tokenized, tokens_positive,cat_list,caption):
809
- """construct a map such that positive_map[i,j] = True iff box i is associated to token j"""
810
- positive_map = torch.zeros((len(tokens_positive), 256), dtype=torch.float)
811
-
812
- for j,label in enumerate(tokens_positive):
813
-
814
- start_ind = caption.find(cat_list[label])
815
- end_ind = start_ind + len(cat_list[label]) - 1
816
- beg_pos = tokenized.char_to_token(start_ind)
817
- try:
818
- end_pos = tokenized.char_to_token(end_ind)
819
- except:
820
- end_pos = None
821
- if end_pos is None:
822
- try:
823
- end_pos = tokenized.char_to_token(end_ind - 1)
824
- if end_pos is None:
825
- end_pos = tokenized.char_to_token(end_ind - 2)
826
- except:
827
- end_pos = None
828
- # except Exception as e:
829
- # print("beg:", beg, "end:", end)
830
- # print("token_positive:", tokens_positive)
831
- # # print("beg_pos:", beg_pos, "end_pos:", end_pos)
832
- # raise e
833
- # if beg_pos is None:
834
- # try:
835
- # beg_pos = tokenized.char_to_token(beg + 1)
836
- # if beg_pos is None:
837
- # beg_pos = tokenized.char_to_token(beg + 2)
838
- # except:
839
- # beg_pos = None
840
- # if end_pos is None:
841
- # try:
842
- # end_pos = tokenized.char_to_token(end - 2)
843
- # if end_pos is None:
844
- # end_pos = tokenized.char_to_token(end - 3)
845
- # except:
846
- # end_pos = None
847
- if beg_pos is None or end_pos is None:
848
- continue
849
- if beg_pos < 0 or end_pos < 0:
850
- continue
851
- if beg_pos > end_pos:
852
- continue
853
- # assert beg_pos is not None and end_pos is not None
854
- positive_map[j,beg_pos: end_pos + 1].fill_(1)
855
- return positive_map
856
-
857
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/matcher-checkpoint.py DELETED
@@ -1,218 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # DINO
3
- # Copyright (c) 2022 IDEA. All Rights Reserved.
4
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
- # ------------------------------------------------------------------------
6
- # Modules to compute the matching cost and solve the corresponding LSAP.
7
- # Copyright (c) 2021 Microsoft. All Rights Reserved.
8
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
9
- # ------------------------------------------------------------------------
10
- # Modified from DETR (https://github.com/facebookresearch/detr)
11
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
12
- # ------------------------------------------------------------------------
13
- # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
14
- # Copyright (c) 2020 SenseTime. All Rights Reserved.
15
- # ------------------------------------------------------------------------
16
-
17
-
18
- import torch, os
19
- from torch import nn
20
- from scipy.optimize import linear_sum_assignment
21
-
22
- from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
23
-
24
-
25
- class HungarianMatcher(nn.Module):
26
- """This class computes an assignment between the targets and the predictions of the network
27
- For efficiency reasons, the targets don't include the no_object. Because of this, in general,
28
- there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
29
- while the others are un-matched (and thus treated as non-objects).
30
- """
31
-
32
- def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, focal_alpha = 0.25):
33
- """Creates the matcher
34
- Params:
35
- cost_class: This is the relative weight of the classification error in the matching cost
36
- cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
37
- cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
38
- """
39
- super().__init__()
40
- self.cost_class = cost_class
41
- self.cost_bbox = cost_bbox
42
- self.cost_giou = cost_giou
43
- assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
44
-
45
- self.focal_alpha = focal_alpha
46
-
47
- @torch.no_grad()
48
- def forward(self, outputs, targets, label_map):
49
- """ Performs the matching
50
- Params:
51
- outputs: This is a dict that contains at least these entries:
52
- "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
53
- "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
54
- targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
55
- "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
56
- objects in the target) containing the class labels
57
- "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
58
- Returns:
59
- A list of size batch_size, containing tuples of (index_i, index_j) where:
60
- - index_i is the indices of the selected predictions (in order)
61
- - index_j is the indices of the corresponding selected targets (in order)
62
- For each batch element, it holds:
63
- len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
64
- """
65
-
66
- bs, num_queries = outputs["pred_logits"].shape[:2]
67
-
68
- # We flatten to compute the cost matrices in a batch
69
- out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes]
70
- out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
71
-
72
- # Also concat the target labels and boxes
73
- tgt_ids = torch.cat([v["labels"] for v in targets])
74
- tgt_bbox = torch.cat([v["boxes"] for v in targets])
75
-
76
- # Compute the classification cost.
77
- alpha = self.focal_alpha
78
- gamma = 2.0
79
-
80
- new_label_map=label_map[tgt_ids.cpu()]
81
-
82
- neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
83
- pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
84
- new_label_map=new_label_map.to(pos_cost_class.device)
85
-
86
- cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
87
-
88
- # cost_class=(pos_cost_class @ new_label_map.T - neg_cost_class@ new_label_map.T)
89
- cost_class=[]
90
- for idx_map in new_label_map:
91
- idx_map = idx_map / idx_map.sum()
92
- cost_class.append(pos_cost_class @ idx_map - neg_cost_class@ idx_map)
93
- if cost_class:
94
- cost_class=torch.stack(cost_class,dim=0).T
95
- else:
96
- cost_class=torch.zeros_like(cost_bbox)
97
- # Compute the L1 cost between boxes
98
-
99
-
100
- # Compute the giou cost betwen boxes
101
- cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
102
- # import pdb;pdb.set_trace()
103
- # Final cost matrix
104
- C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
105
- C = C.view(bs, num_queries, -1).cpu()
106
- C[torch.isnan(C)] = 0.0
107
- C[torch.isinf(C)] = 0.0
108
-
109
- sizes = [len(v["boxes"]) for v in targets]
110
- try:
111
- indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
112
- except:
113
- print("warning: use SimpleMinsumMatcher")
114
- indices = []
115
- device = C.device
116
- for i, (c, _size) in enumerate(zip(C.split(sizes, -1), sizes)):
117
- weight_mat = c[i]
118
- idx_i = weight_mat.min(0)[1]
119
- idx_j = torch.arange(_size).to(device)
120
- indices.append((idx_i, idx_j))
121
- return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
122
-
123
-
124
- class SimpleMinsumMatcher(nn.Module):
125
- """This class computes an assignment between the targets and the predictions of the network
126
- For efficiency reasons, the targets don't include the no_object. Because of this, in general,
127
- there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
128
- while the others are un-matched (and thus treated as non-objects).
129
- """
130
-
131
- def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1, focal_alpha = 0.25):
132
- """Creates the matcher
133
- Params:
134
- cost_class: This is the relative weight of the classification error in the matching cost
135
- cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
136
- cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
137
- """
138
- super().__init__()
139
- self.cost_class = cost_class
140
- self.cost_bbox = cost_bbox
141
- self.cost_giou = cost_giou
142
- assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"
143
-
144
- self.focal_alpha = focal_alpha
145
-
146
- @torch.no_grad()
147
- def forward(self, outputs, targets):
148
- """ Performs the matching
149
- Params:
150
- outputs: This is a dict that contains at least these entries:
151
- "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
152
- "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
153
- targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
154
- "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
155
- objects in the target) containing the class labels
156
- "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
157
- Returns:
158
- A list of size batch_size, containing tuples of (index_i, index_j) where:
159
- - index_i is the indices of the selected predictions (in order)
160
- - index_j is the indices of the corresponding selected targets (in order)
161
- For each batch element, it holds:
162
- len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
163
- """
164
-
165
- bs, num_queries = outputs["pred_logits"].shape[:2]
166
-
167
- # We flatten to compute the cost matrices in a batch
168
- out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes]
169
- out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
170
-
171
- # Also concat the target labels and boxes
172
- tgt_ids = torch.cat([v["labels"] for v in targets])
173
- tgt_bbox = torch.cat([v["boxes"] for v in targets])
174
-
175
- # Compute the classification cost.
176
- alpha = self.focal_alpha
177
- gamma = 2.0
178
- neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())
179
- pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
180
- cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]
181
-
182
- # Compute the L1 cost between boxes
183
- cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
184
-
185
- # Compute the giou cost betwen boxes
186
- cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
187
-
188
- # Final cost matrix
189
-
190
- C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
191
- C = C.view(bs, num_queries, -1)
192
-
193
- sizes = [len(v["boxes"]) for v in targets]
194
- indices = []
195
- device = C.device
196
- for i, (c, _size) in enumerate(zip(C.split(sizes, -1), sizes)):
197
- weight_mat = c[i]
198
- idx_i = weight_mat.min(0)[1]
199
- idx_j = torch.arange(_size).to(device)
200
- indices.append((idx_i, idx_j))
201
-
202
- return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
203
-
204
-
205
- def build_matcher(args):
206
- assert args.matcher_type in ['HungarianMatcher', 'SimpleMinsumMatcher'], "Unknown args.matcher_type: {}".format(args.matcher_type)
207
- if args.matcher_type == 'HungarianMatcher':
208
- return HungarianMatcher(
209
- cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou,
210
- focal_alpha=args.focal_alpha
211
- )
212
- elif args.matcher_type == 'SimpleMinsumMatcher':
213
- return SimpleMinsumMatcher(
214
- cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou,
215
- focal_alpha=args.focal_alpha
216
- )
217
- else:
218
- raise NotImplementedError("Unknown args.matcher_type: {}".format(args.matcher_type))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/ms_deform_attn-checkpoint.py DELETED
@@ -1,416 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # Deformable DETR
8
- # Copyright (c) 2020 SenseTime. All Rights Reserved.
9
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
- # ------------------------------------------------------------------------------------------------
11
- # Modified from:
12
- # https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/functions/ms_deform_attn_func.py
13
- # https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
14
- # https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/multi_scale_deform_attn.py
15
- # ------------------------------------------------------------------------------------------------
16
-
17
- import math
18
- import warnings
19
- from typing import Optional
20
-
21
- import torch
22
- import torch.nn as nn
23
- import torch.nn.functional as F
24
- from torch.autograd import Function
25
- from torch.autograd.function import once_differentiable
26
- from torch.nn.init import constant_, xavier_uniform_
27
- import loralib as lora
28
-
29
- try:
30
- # from groundingdino import _C
31
- import MultiScaleDeformableAttention as _C
32
- except:
33
- warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only!")
34
-
35
-
36
- # helpers
37
- def _is_power_of_2(n):
38
- if (not isinstance(n, int)) or (n < 0):
39
- raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
40
- return (n & (n - 1) == 0) and n != 0
41
-
42
-
43
- class MultiScaleDeformableAttnFunction(Function):
44
- @staticmethod
45
- def forward(
46
- ctx,
47
- value,
48
- value_spatial_shapes,
49
- value_level_start_index,
50
- sampling_locations,
51
- attention_weights,
52
- im2col_step,
53
- ):
54
- ctx.im2col_step = im2col_step
55
- output = _C.ms_deform_attn_forward(
56
- value,
57
- value_spatial_shapes,
58
- value_level_start_index,
59
- sampling_locations,
60
- attention_weights,
61
- ctx.im2col_step,
62
- )
63
- ctx.save_for_backward(
64
- value,
65
- value_spatial_shapes,
66
- value_level_start_index,
67
- sampling_locations,
68
- attention_weights,
69
- )
70
- return output
71
-
72
- @staticmethod
73
- @once_differentiable
74
- def backward(ctx, grad_output):
75
- (
76
- value,
77
- value_spatial_shapes,
78
- value_level_start_index,
79
- sampling_locations,
80
- attention_weights,
81
- ) = ctx.saved_tensors
82
- grad_value, grad_sampling_loc, grad_attn_weight = _C.ms_deform_attn_backward(
83
- value,
84
- value_spatial_shapes,
85
- value_level_start_index,
86
- sampling_locations,
87
- attention_weights,
88
- grad_output,
89
- ctx.im2col_step,
90
- )
91
-
92
- return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
93
-
94
-
95
- def multi_scale_deformable_attn_pytorch(
96
- value: torch.Tensor,
97
- value_spatial_shapes: torch.Tensor,
98
- sampling_locations: torch.Tensor,
99
- attention_weights: torch.Tensor,
100
- ) -> torch.Tensor:
101
-
102
- bs, _, num_heads, embed_dims = value.shape
103
- _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
104
- value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
105
- sampling_grids = 2 * sampling_locations - 1
106
- sampling_value_list = []
107
- for level, (H_, W_) in enumerate(value_spatial_shapes):
108
- # bs, H_*W_, num_heads, embed_dims ->
109
- # bs, H_*W_, num_heads*embed_dims ->
110
- # bs, num_heads*embed_dims, H_*W_ ->
111
- # bs*num_heads, embed_dims, H_, W_
112
- value_l_ = (
113
- value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
114
- )
115
- # bs, num_queries, num_heads, num_points, 2 ->
116
- # bs, num_heads, num_queries, num_points, 2 ->
117
- # bs*num_heads, num_queries, num_points, 2
118
- sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
119
- # bs*num_heads, embed_dims, num_queries, num_points
120
- sampling_value_l_ = F.grid_sample(
121
- value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
122
- )
123
- sampling_value_list.append(sampling_value_l_)
124
- # (bs, num_queries, num_heads, num_levels, num_points) ->
125
- # (bs, num_heads, num_queries, num_levels, num_points) ->
126
- # (bs, num_heads, 1, num_queries, num_levels*num_points)
127
- attention_weights = attention_weights.transpose(1, 2).reshape(
128
- bs * num_heads, 1, num_queries, num_levels * num_points
129
- )
130
- output = (
131
- (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
132
- .sum(-1)
133
- .view(bs, num_heads * embed_dims, num_queries)
134
- )
135
- return output.transpose(1, 2).contiguous()
136
-
137
-
138
- class MultiScaleDeformableAttention(nn.Module):
139
- """Multi-Scale Deformable Attention Module used in Deformable-DETR
140
-
141
- `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
142
- <https://arxiv.org/pdf/2010.04159.pdf>`_.
143
-
144
- Args:
145
- embed_dim (int): The embedding dimension of Attention. Default: 256.
146
- num_heads (int): The number of attention heads. Default: 8.
147
- num_levels (int): The number of feature map used in Attention. Default: 4.
148
- num_points (int): The number of sampling points for each query
149
- in each head. Default: 4.
150
- img2col_steps (int): The step used in image_to_column. Defualt: 64.
151
- dropout (float): Dropout layer used in output. Default: 0.1.
152
- batch_first (bool): if ``True``, then the input and output tensor will be
153
- provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)`
154
- """
155
-
156
- def __init__(
157
- self,
158
- embed_dim: int = 256,
159
- num_heads: int = 8,
160
- num_levels: int = 4,
161
- num_points: int = 4,
162
- img2col_step: int = 64,
163
- batch_first: bool = False,
164
- ):
165
- super().__init__()
166
- if embed_dim % num_heads != 0:
167
- raise ValueError(
168
- "embed_dim must be divisible by num_heads, but got {} and {}".format(
169
- embed_dim, num_heads
170
- )
171
- )
172
- head_dim = embed_dim // num_heads
173
-
174
- self.batch_first = batch_first
175
-
176
- if not _is_power_of_2(head_dim):
177
- warnings.warn(
178
- """
179
- You'd better set d_model in MSDeformAttn to make sure that
180
- each dim of the attention head a power of 2, which is more efficient.
181
- """
182
- )
183
-
184
- self.im2col_step = img2col_step
185
- self.embed_dim = embed_dim
186
- self.num_heads = num_heads
187
- self.num_levels = num_levels
188
- self.num_points = num_points
189
- r = 12
190
- self.sampling_offsets = lora.Linear(embed_dim, num_heads * num_levels * num_points * 2 , r=r )
191
- self.attention_weights = lora.Linear(embed_dim, num_heads * num_levels * num_points , r=r)
192
- self.value_proj = lora.Linear(embed_dim, embed_dim , r=r)
193
- self.output_proj = lora.Linear(embed_dim, embed_dim , r=r)
194
-
195
- self.init_weights()
196
-
197
- def _reset_parameters(self):
198
- return self.init_weights()
199
-
200
- def init_weights(self):
201
- """
202
- Default initialization for Parameters of Module.
203
- """
204
- constant_(self.sampling_offsets.weight.data, 0.0)
205
- thetas = torch.arange(self.num_heads, dtype=torch.float32) * (
206
- 2.0 * math.pi / self.num_heads
207
- )
208
- grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
209
- grid_init = (
210
- (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
211
- .view(self.num_heads, 1, 1, 2)
212
- .repeat(1, self.num_levels, self.num_points, 1)
213
- )
214
- for i in range(self.num_points):
215
- grid_init[:, :, i, :] *= i + 1
216
- with torch.no_grad():
217
- self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
218
- constant_(self.attention_weights.weight.data, 0.0)
219
- constant_(self.attention_weights.bias.data, 0.0)
220
- xavier_uniform_(self.value_proj.weight.data)
221
- constant_(self.value_proj.bias.data, 0.0)
222
- xavier_uniform_(self.output_proj.weight.data)
223
- constant_(self.output_proj.bias.data, 0.0)
224
-
225
- def freeze_sampling_offsets(self):
226
- print("Freeze sampling offsets")
227
- self.sampling_offsets.weight.requires_grad = False
228
- self.sampling_offsets.bias.requires_grad = False
229
-
230
- def freeze_attention_weights(self):
231
- print("Freeze attention weights")
232
- self.attention_weights.weight.requires_grad = False
233
- self.attention_weights.bias.requires_grad = False
234
-
235
- def forward(
236
- self,
237
- query: torch.Tensor,
238
- key: Optional[torch.Tensor] = None,
239
- value: Optional[torch.Tensor] = None,
240
- query_pos: Optional[torch.Tensor] = None,
241
- key_padding_mask: Optional[torch.Tensor] = None,
242
- reference_points: Optional[torch.Tensor] = None,
243
- spatial_shapes: Optional[torch.Tensor] = None,
244
- level_start_index: Optional[torch.Tensor] = None,
245
- **kwargs
246
- ) -> torch.Tensor:
247
-
248
- """Forward Function of MultiScaleDeformableAttention
249
-
250
- Args:
251
- query (torch.Tensor): Query embeddings with shape
252
- `(num_query, bs, embed_dim)`
253
- key (torch.Tensor): Key embeddings with shape
254
- `(num_key, bs, embed_dim)`
255
- value (torch.Tensor): Value embeddings with shape
256
- `(num_key, bs, embed_dim)`
257
- query_pos (torch.Tensor): The position embedding for `query`. Default: None.
258
- key_padding_mask (torch.Tensor): ByteTensor for `query`, with shape `(bs, num_key)`,
259
- indicating which elements within `key` to be ignored in attention.
260
- reference_points (torch.Tensor): The normalized reference points
261
- with shape `(bs, num_query, num_levels, 2)`,
262
- all elements is range in [0, 1], top-left (0, 0),
263
- bottom-right (1, 1), including padding are.
264
- or `(N, Length_{query}, num_levels, 4)`, add additional
265
- two dimensions `(h, w)` to form reference boxes.
266
- spatial_shapes (torch.Tensor): Spatial shape of features in different levels.
267
- With shape `(num_levels, 2)`, last dimension represents `(h, w)`.
268
- level_start_index (torch.Tensor): The start index of each level. A tensor with
269
- shape `(num_levels, )` which can be represented as
270
- `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`.
271
-
272
- Returns:
273
- torch.Tensor: forward results with shape `(num_query, bs, embed_dim)`
274
- """
275
-
276
- if value is None:
277
- value = query
278
-
279
- if query_pos is not None:
280
- query = query + query_pos
281
-
282
- if not self.batch_first:
283
- # change to (bs, num_query ,embed_dims)
284
- query = query.permute(1, 0, 2)
285
- value = value.permute(1, 0, 2)
286
-
287
- bs, num_query, _ = query.shape
288
- bs, num_value, _ = value.shape
289
-
290
- assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
291
-
292
- value = self.value_proj(value)
293
- if key_padding_mask is not None:
294
- value = value.masked_fill(key_padding_mask[..., None], float(0))
295
- value = value.view(bs, num_value, self.num_heads, -1)
296
- sampling_offsets = self.sampling_offsets(query).view(
297
- bs, num_query, self.num_heads, self.num_levels, self.num_points, 2
298
- )
299
- attention_weights = self.attention_weights(query).view(
300
- bs, num_query, self.num_heads, self.num_levels * self.num_points
301
- )
302
- attention_weights = attention_weights.softmax(-1)
303
- attention_weights = attention_weights.view(
304
- bs,
305
- num_query,
306
- self.num_heads,
307
- self.num_levels,
308
- self.num_points,
309
- )
310
-
311
- # bs, num_query, num_heads, num_levels, num_points, 2
312
- if reference_points.shape[-1] == 2:
313
- offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
314
- sampling_locations = (
315
- reference_points[:, :, None, :, None, :]
316
- + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
317
- )
318
- elif reference_points.shape[-1] == 4:
319
- sampling_locations = (
320
- reference_points[:, :, None, :, None, :2]
321
- + sampling_offsets
322
- / self.num_points
323
- * reference_points[:, :, None, :, None, 2:]
324
- * 0.5
325
- )
326
- else:
327
- raise ValueError(
328
- "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
329
- reference_points.shape[-1]
330
- )
331
- )
332
-
333
- if torch.cuda.is_available() and value.is_cuda:
334
- halffloat = False
335
- if value.dtype == torch.float16:
336
- halffloat = True
337
- value = value.float()
338
- sampling_locations = sampling_locations.float()
339
- attention_weights = attention_weights.float()
340
-
341
- output = MultiScaleDeformableAttnFunction.apply(
342
- value,
343
- spatial_shapes,
344
- level_start_index,
345
- sampling_locations,
346
- attention_weights,
347
- self.im2col_step,
348
- )
349
-
350
- if halffloat:
351
- output = output.half()
352
- else:
353
- output = multi_scale_deformable_attn_pytorch(
354
- value, spatial_shapes, sampling_locations, attention_weights
355
- )
356
-
357
- output = self.output_proj(output)
358
-
359
- if not self.batch_first:
360
- output = output.permute(1, 0, 2)
361
-
362
- return output
363
-
364
-
365
- def create_dummy_class(klass, dependency, message=""):
366
- """
367
- When a dependency of a class is not available, create a dummy class which throws ImportError
368
- when used.
369
-
370
- Args:
371
- klass (str): name of the class.
372
- dependency (str): name of the dependency.
373
- message: extra message to print
374
- Returns:
375
- class: a class object
376
- """
377
- err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass)
378
- if message:
379
- err = err + " " + message
380
-
381
- class _DummyMetaClass(type):
382
- # throw error on class attribute access
383
- def __getattr__(_, __): # noqa: B902
384
- raise ImportError(err)
385
-
386
- class _Dummy(object, metaclass=_DummyMetaClass):
387
- # throw error on constructor
388
- def __init__(self, *args, **kwargs):
389
- raise ImportError(err)
390
-
391
- return _Dummy
392
-
393
-
394
- def create_dummy_func(func, dependency, message=""):
395
- """
396
- When a dependency of a function is not available, create a dummy function which throws
397
- ImportError when used.
398
-
399
- Args:
400
- func (str): name of the function.
401
- dependency (str or list[str]): name(s) of the dependency.
402
- message: extra message to print
403
- Returns:
404
- function: a function object
405
- """
406
- err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func)
407
- if message:
408
- err = err + " " + message
409
-
410
- if isinstance(dependency, (list, tuple)):
411
- dependency = ",".join(dependency)
412
-
413
- def _dummy(*args, **kwargs):
414
- raise ImportError(err)
415
-
416
- return _dummy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer-checkpoint.py DELETED
@@ -1,969 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # DINO
8
- # Copyright (c) 2022 IDEA. All Rights Reserved.
9
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
- # ------------------------------------------------------------------------
11
- # Conditional DETR Transformer class.
12
- # Copyright (c) 2021 Microsoft. All Rights Reserved.
13
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
- # ------------------------------------------------------------------------
15
- # Modified from DETR (https://github.com/facebookresearch/detr)
16
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
17
- # ------------------------------------------------------------------------
18
-
19
- from typing import Optional
20
-
21
- import torch
22
- import torch.utils.checkpoint as checkpoint
23
- from torch import Tensor, nn
24
-
25
- from groundingdino.util.misc import inverse_sigmoid
26
- import loralib as lora
27
- from .fuse_modules import BiAttentionBlock
28
- from .ms_deform_attn import MultiScaleDeformableAttention as MSDeformAttn
29
- from .transformer_vanilla import TransformerEncoderLayer
30
- from .utils import (
31
- MLP,
32
- _get_activation_fn,
33
- _get_clones,
34
- gen_encoder_output_proposals,
35
- gen_sineembed_for_position,
36
- get_sine_pos_embed,
37
- )
38
-
39
-
40
class Transformer(nn.Module):
    """Encoder/decoder transformer of Grounding DINO (LoRA-adapted variant).

    Builds a ``TransformerEncoder`` made of deformable encoder layers
    (optionally paired with text-enhancer and image/text fusion layers) and a
    ``TransformerDecoder`` made of deformable decoder layers.

    With ``two_stage_type="standard"`` the decoder queries are selected from
    the encoder output; with ``"no"`` they come from learned embeddings.

    ``enc_out_class_embed`` and ``enc_out_bbox_embed`` are initialised to
    ``None`` here; they have to be assigned by the owner of this module before
    a ``"standard"`` two-stage forward pass, because ``forward`` calls them.
    """

    def __init__(
        self,
        d_model=256,
        nhead=8,
        num_queries=300,
        num_encoder_layers=6,
        num_unicoder_layers=0,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.0,
        activation="relu",
        normalize_before=False,
        return_intermediate_dec=False,
        query_dim=4,
        num_patterns=0,
        # for deformable encoder
        num_feature_levels=1,
        enc_n_points=4,
        dec_n_points=4,
        # init query
        learnable_tgt_init=False,
        # two stage
        two_stage_type="no",  # ['no', 'standard', 'early', 'combine', 'enceachlayer', 'enclayer1']
        embed_init_tgt=False,
        # for text
        use_text_enhancer=False,
        use_fusion_layer=False,
        use_checkpoint=False,
        use_transformer_ckpt=False,
        use_text_cross_attention=False,
        text_dropout=0.1,
        fusion_dropout=0.1,
        fusion_droppath=0.0,
    ):
        super().__init__()
        self.num_feature_levels = num_feature_levels
        self.num_encoder_layers = num_encoder_layers
        self.num_unicoder_layers = num_unicoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.num_queries = num_queries
        assert query_dim == 4

        # choose encoder layer type
        encoder_layer = DeformableTransformerEncoderLayer(
            d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points
        )

        if use_text_enhancer:
            text_enhance_layer = TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead // 2,
                dim_feedforward=dim_feedforward // 2,
                dropout=text_dropout,
            )
        else:
            text_enhance_layer = None

        if use_fusion_layer:
            feature_fusion_layer = BiAttentionBlock(
                v_dim=d_model,
                l_dim=d_model,
                embed_dim=dim_feedforward // 2,
                num_heads=nhead // 2,
                dropout=fusion_dropout,
                drop_path=fusion_droppath,
            )
        else:
            feature_fusion_layer = None

        # Pre-norm is not supported: the assert rejects normalize_before=True.
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        assert encoder_norm is None
        self.encoder = TransformerEncoder(
            encoder_layer,
            num_encoder_layers,
            d_model=d_model,
            num_queries=num_queries,
            text_enhance_layer=text_enhance_layer,
            feature_fusion_layer=feature_fusion_layer,
            use_checkpoint=use_checkpoint,
            use_transformer_ckpt=use_transformer_ckpt,
        )

        # choose decoder layer type
        decoder_layer = DeformableTransformerDecoderLayer(
            d_model,
            dim_feedforward,
            dropout,
            activation,
            num_feature_levels,
            nhead,
            dec_n_points,
            use_text_cross_attention=use_text_cross_attention,
        )

        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec,
            d_model=d_model,
            query_dim=query_dim,
            num_feature_levels=num_feature_levels,
        )

        self.d_model = d_model
        self.nhead = nhead
        self.dec_layers = num_decoder_layers
        self.num_queries = num_queries  # useful for single stage model only
        self.num_patterns = num_patterns
        if not isinstance(num_patterns, int):
            # The original built a Warning object and dropped it, so nothing
            # was ever reported; actually emit the warning.
            import warnings

            warnings.warn("num_patterns should be int but {}".format(type(num_patterns)))
            self.num_patterns = 0

        # Always define level_embed so the attribute exists for every
        # configuration (it was previously missing when num_feature_levels == 1).
        if num_feature_levels > 1 and self.num_encoder_layers > 0:
            # Uninitialised storage; filled by _reset_parameters().
            self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
        else:
            self.level_embed = None

        self.learnable_tgt_init = learnable_tgt_init
        assert learnable_tgt_init, "why not learnable_tgt_init"
        self.embed_init_tgt = embed_init_tgt
        if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"):
            self.tgt_embed = nn.Embedding(self.num_queries, d_model)
            nn.init.normal_(self.tgt_embed.weight.data)
        else:
            self.tgt_embed = None

        # for two stage
        self.two_stage_type = two_stage_type
        assert two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format(
            two_stage_type
        )
        if two_stage_type == "standard":
            # anchor selection at the output of encoder
            r = 24  # LoRA rank of the encoder-output projection
            self.enc_output = lora.Linear(d_model, d_model, r=r)
            self.enc_output_norm = nn.LayerNorm(d_model)
            self.two_stage_wh_embedding = None

        if two_stage_type == "no":
            self.init_ref_points(num_queries)  # init self.refpoint_embed

        self.enc_out_class_embed = None
        self.enc_out_bbox_embed = None

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-init every parameter with more than one dim, then let the
        deformable-attention modules and ``level_embed`` apply their own init."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformAttn):
                m._reset_parameters()
        if self.num_feature_levels > 1 and self.level_embed is not None:
            nn.init.normal_(self.level_embed)

    def get_valid_ratio(self, mask):
        """Return, per batch item, the (width, height) fraction of ``mask`` that is False.

        ``mask`` is indexed as [bs, H, W]; the count is taken along the first
        column (for height) and the first row (for width).
        """
        _, H, W = mask.shape
        valid_H = torch.sum(~mask[:, :, 0], 1)
        valid_W = torch.sum(~mask[:, 0, :], 1)
        valid_ratio_h = valid_H.float() / H
        valid_ratio_w = valid_W.float() / W
        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
        return valid_ratio

    def init_ref_points(self, use_num_queries):
        """Create the learned 4-d reference-point embedding (single-stage mode)."""
        self.refpoint_embed = nn.Embedding(use_num_queries, 4)

    def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None):
        """
        Input:
            - srcs: List of multi features [bs, ci, hi, wi]
            - masks: List of multi masks [bs, hi, wi]
            - refpoint_embed: [bs, num_dn, 4]. None in infer
            - pos_embeds: List of multi pos embeds [bs, ci, hi, wi]
            - tgt: [bs, num_dn, d_model]. None in infer
            - attn_mask: forwarded to the decoder as ``tgt_mask``
            - text_dict: dict read for the keys "encoded_text", "text_token_mask",
              "position_ids" and "text_self_attention_masks". Although the
              default is None, it is subscripted unconditionally, so a dict is
              required. Its "encoded_text" entry is overwritten in place with
              the encoder's text output.

        Returns:
            (hs, references, hs_enc, ref_enc, init_box_proposal)
        """
        # prepare input for encoder
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            # NOTE: ``bs`` from the last level is reused further down.
            bs, c, h, w = src.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)

            src = src.flatten(2).transpose(1, 2)  # bs, hw, c
            mask = mask.flatten(1)  # bs, hw
            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # bs, hw, c
            if self.num_feature_levels > 1 and self.level_embed is not None:
                lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
            else:
                lvl_pos_embed = pos_embed
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)
        src_flatten = torch.cat(src_flatten, 1)  # bs, \sum{hxw}, c
        mask_flatten = torch.cat(mask_flatten, 1)  # bs, \sum{hxw}
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)  # bs, \sum{hxw}, c
        spatial_shapes = torch.as_tensor(
            spatial_shapes, dtype=torch.long, device=src_flatten.device
        )
        # Offset of each level inside the flattened token axis.
        level_start_index = torch.cat(
            (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])
        )
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # two stage
        enc_topk_proposals = enc_refpoint_embed = None

        #########################################################
        # Begin Encoder
        #########################################################
        memory, memory_text = self.encoder(
            src_flatten,
            pos=lvl_pos_embed_flatten,
            level_start_index=level_start_index,
            spatial_shapes=spatial_shapes,
            valid_ratios=valid_ratios,
            key_padding_mask=mask_flatten,
            memory_text=text_dict["encoded_text"],
            text_attention_mask=~text_dict["text_token_mask"],
            # we ~ the mask . False means use the token; True means pad the token
            position_ids=text_dict["position_ids"],
            text_self_attention_masks=text_dict["text_self_attention_masks"],
        )
        #########################################################
        # End Encoder
        # - memory: bs, \sum{hw}, c
        # - mask_flatten: bs, \sum{hw}
        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
        #########################################################
        text_dict["encoded_text"] = memory_text

        if self.two_stage_type == "standard":
            # Use the encoder output as the proposals.
            output_memory, output_proposals = gen_encoder_output_proposals(
                memory, mask_flatten, spatial_shapes
            )
            output_memory = self.enc_output_norm(self.enc_output(output_memory))

            # NOTE(review): text_dict was already subscripted above, so the
            # else branch cannot be reached with text_dict=None.
            if text_dict is not None:
                enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
            else:
                enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)

            topk_logits = enc_outputs_class_unselected.max(-1)[0]
            enc_outputs_coord_unselected = (
                self.enc_out_bbox_embed(output_memory) + output_proposals
            )  # (bs, \sum{hw}, 4) unsigmoid
            topk = self.num_queries

            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq

            # gather boxes
            refpoint_embed_undetach = torch.gather(
                enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
            )  # unsigmoid
            refpoint_embed_ = refpoint_embed_undetach.detach()
            init_box_proposal = torch.gather(
                output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
            ).sigmoid()  # sigmoid

            # gather tgt
            tgt_undetach = torch.gather(
                output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model)
            )
            if self.embed_init_tgt:
                tgt_ = (
                    self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
                )  # bs, nq, d_model (after the transpose)
            else:
                tgt_ = tgt_undetach.detach()

            if refpoint_embed is not None:
                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
                tgt = torch.cat([tgt, tgt_], dim=1)
            else:
                refpoint_embed, tgt = refpoint_embed_, tgt_

        elif self.two_stage_type == "no":
            tgt_ = (
                self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
            )  # bs, nq, d_model (after the transpose)
            refpoint_embed_ = (
                self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
            )  # bs, nq, 4 (after the transpose)

            if refpoint_embed is not None:
                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
                tgt = torch.cat([tgt, tgt_], dim=1)
            else:
                refpoint_embed, tgt = refpoint_embed_, tgt_

            if self.num_patterns > 0:
                # NOTE(review): self.patterns is not created anywhere in this
                # class; this branch needs it to be assigned externally.
                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(
                    self.num_queries, 1
                )  # 1, n_q*n_pat, d_model
                tgt = tgt_embed + tgt_pat

            init_box_proposal = refpoint_embed_.sigmoid()

        else:
            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
        #########################################################
        # End preparing tgt
        # - tgt: bs, NQ, d_model
        # - refpoint_embed(unsigmoid): bs, NQ, 4
        #########################################################

        #########################################################
        # Begin Decoder
        #########################################################
        hs, references = self.decoder(
            tgt=tgt.transpose(0, 1),
            memory=memory.transpose(0, 1),
            memory_key_padding_mask=mask_flatten,
            pos=lvl_pos_embed_flatten.transpose(0, 1),
            refpoints_unsigmoid=refpoint_embed.transpose(0, 1),
            level_start_index=level_start_index,
            spatial_shapes=spatial_shapes,
            valid_ratios=valid_ratios,
            tgt_mask=attn_mask,
            memory_text=text_dict["encoded_text"],
            text_attention_mask=~text_dict["text_token_mask"],
            # we ~ the mask . False means use the token; True means pad the token
        )
        #########################################################
        # End Decoder
        # hs: n_dec, bs, nq, d_model
        # references: n_dec+1, bs, nq, query_dim
        #########################################################

        #########################################################
        # Begin postprocess
        #########################################################
        if self.two_stage_type == "standard":
            hs_enc = tgt_undetach.unsqueeze(0)
            ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0)
        else:
            hs_enc = ref_enc = None
        #########################################################
        # End postprocess
        # hs_enc: (1, bs, nq, d_model) or None
        # ref_enc: (1, bs, nq, query_dim) or None
        #########################################################

        return hs, references, hs_enc, ref_enc, init_box_proposal
        # hs: (n_dec, bs, nq, d_model)
        # references: sigmoid coordinates. (n_dec+1, bs, bq, 4)
        # hs_enc: (1, bs, nq, d_model) or None
        # ref_enc: sigmoid coordinates. (1, bs, nq, query_dim) or None
409
-
410
-
411
class TransformerEncoder(nn.Module):
    """Stack of image encoder layers, optionally interleaved with per-layer
    image/text fusion layers and text-enhancer layers."""

    def __init__(
        self,
        encoder_layer,
        num_layers,
        d_model=256,
        num_queries=300,
        enc_layer_share=False,
        text_enhance_layer=None,
        feature_fusion_layer=None,
        use_checkpoint=False,
        use_transformer_ckpt=False,
    ):
        """Clone the given prototype layers ``num_layers`` times.

        Args:
            encoder_layer: prototype image encoder layer, cloned via ``_get_clones``.
            num_layers (int): number of layers; with 0 no layer is kept at all.
            d_model (int, optional): feature width. Defaults to 256.
            num_queries (int, optional): stored on the module. Defaults to 300.
            enc_layer_share (bool, optional): forwarded to ``_get_clones`` as
                ``layer_share``. Defaults to False.
            text_enhance_layer (optional): prototype text layer, or None.
            feature_fusion_layer (optional): prototype fusion layer, or None.
            use_checkpoint (bool, optional): run fusion layers through
                ``torch.utils.checkpoint``. Defaults to False.
            use_transformer_ckpt (bool, optional): run image encoder layers
                through ``torch.utils.checkpoint``. Defaults to False.
        """
        super().__init__()
        # Empty lists stand for "no layers"; they are replaced by the cloned
        # containers only when num_layers > 0.
        self.layers = []
        self.text_layers = []
        self.fusion_layers = []
        if num_layers > 0:
            self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share)

            if text_enhance_layer is not None:
                self.text_layers = _get_clones(
                    text_enhance_layer, num_layers, layer_share=enc_layer_share
                )
            if feature_fusion_layer is not None:
                self.fusion_layers = _get_clones(
                    feature_fusion_layer, num_layers, layer_share=enc_layer_share
                )

        self.query_scale = None
        self.num_queries = num_queries
        self.num_layers = num_layers
        self.d_model = d_model

        self.use_checkpoint = use_checkpoint
        self.use_transformer_ckpt = use_transformer_ckpt

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Build normalised pixel-centre reference points for every level.

        Returns a tensor indexed as [bs, sum(hi*wi), num_level, 2] with (x, y)
        in the last dimension.
        """
        reference_points_list = []
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            # NOTE: relies on torch.meshgrid's default "ij" indexing.
            ref_y, ref_x = torch.meshgrid(
                torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
                torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device),
            )
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
            ref = torch.stack((ref_x, ref_y), -1)
            reference_points_list.append(ref)
        reference_points = torch.cat(reference_points_list, 1)
        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
        return reference_points

    def forward(
        self,
        # for images
        src: Tensor,
        pos: Tensor,
        spatial_shapes: Tensor,
        level_start_index: Tensor,
        valid_ratios: Tensor,
        key_padding_mask: Tensor,
        # for texts
        memory_text: Tensor = None,
        text_attention_mask: Tensor = None,
        pos_text: Tensor = None,
        text_self_attention_masks: Tensor = None,
        position_ids: Tensor = None,
    ):
        """
        Input:
            - src: [bs, sum(hi*wi), 256]
            - pos: pos embed for src. [bs, sum(hi*wi), 256]
            - spatial_shapes: h,w of each level [num_level, 2]
            - level_start_index: [num_level] start point of level in sum(hi*wi).
            - valid_ratios: [bs, num_level, 2]
            - key_padding_mask: [bs, sum(hi*wi)]

            - memory_text: bs, n_text, 256
            - text_attention_mask: bs, n_text
                False for no padding; True for padding
            - pos_text: bs, n_text, 256

            - position_ids: bs, n_text
        Intermediate:
            - reference_points: [bs, sum(hi*wi), num_level, 2]
        Outputs:
            - output: [bs, sum(hi*wi), 256]
            - memory_text: the (possibly updated) text features
        """

        output = src

        # preparation and reshape
        if self.num_layers > 0:
            reference_points = self.get_reference_points(
                spatial_shapes, valid_ratios, device=src.device
            )

        if self.text_layers:
            # generate pos_text; its width follows d_model (was hard-coded to
            # 256, which is the default d_model).
            bs, n_text, text_dim = memory_text.shape
            if pos_text is None and position_ids is None:
                pos_text = (
                    torch.arange(n_text, device=memory_text.device)
                    .float()
                    .unsqueeze(0)
                    .unsqueeze(-1)
                    .repeat(bs, 1, 1)
                )
                pos_text = get_sine_pos_embed(
                    pos_text, num_pos_feats=self.d_model, exchange_xy=False
                )
            if position_ids is not None:
                # Explicit position ids take precedence over a given pos_text.
                pos_text = get_sine_pos_embed(
                    position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False
                )

        # main process
        for layer_id, layer in enumerate(self.layers):
            if self.fusion_layers:
                if self.use_checkpoint:
                    output, memory_text = checkpoint.checkpoint(
                        self.fusion_layers[layer_id],
                        output,
                        memory_text,
                        key_padding_mask,
                        text_attention_mask,
                    )
                else:
                    output, memory_text = self.fusion_layers[layer_id](
                        v=output,
                        l=memory_text,
                        attention_mask_v=key_padding_mask,
                        attention_mask_l=text_attention_mask,
                    )

            if self.text_layers:
                memory_text = self.text_layers[layer_id](
                    src=memory_text.transpose(0, 1),
                    src_mask=~text_self_attention_masks,  # note we use ~ for mask here
                    src_key_padding_mask=text_attention_mask,
                    pos=(pos_text.transpose(0, 1) if pos_text is not None else None),
                ).transpose(0, 1)

            # main process
            if self.use_transformer_ckpt:
                output = checkpoint.checkpoint(
                    layer,
                    output,
                    pos,
                    reference_points,
                    spatial_shapes,
                    level_start_index,
                    key_padding_mask,
                )
            else:
                output = layer(
                    src=output,
                    pos=pos,
                    reference_points=reference_points,
                    spatial_shapes=spatial_shapes,
                    level_start_index=level_start_index,
                    key_padding_mask=key_padding_mask,
                )

        return output, memory_text
601
-
602
-
603
class TransformerDecoder(nn.Module):
    """Stack of decoder layers with iterative reference-point refinement.

    ``bbox_embed``, ``class_embed``, ``query_scale`` and ``ref_anchor_head``
    start as ``None``; refinement only happens once ``bbox_embed`` has been
    assigned by the owner of this module.
    """

    def __init__(
        self,
        decoder_layer,
        num_layers,
        norm=None,
        return_intermediate=False,
        d_model=256,
        query_dim=4,
        num_feature_levels=1,
    ):
        super().__init__()
        if num_layers > 0:
            self.layers = _get_clones(decoder_layer, num_layers)
        else:
            self.layers = []
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate
        assert return_intermediate, "support return_intermediate only"
        self.query_dim = query_dim
        assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim)
        self.num_feature_levels = num_feature_levels

        # Maps the sine embedding of a reference point to a query position.
        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)
        self.query_pos_sine_scale = None

        self.query_scale = None
        self.bbox_embed = None
        self.class_embed = None

        self.d_model = d_model

        self.ref_anchor_head = None

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        refpoints_unsigmoid: Optional[Tensor] = None,  # num_queries, bs, 2
        # for memory
        level_start_index: Optional[Tensor] = None,  # num_levels
        spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
        valid_ratios: Optional[Tensor] = None,
        # for text
        memory_text: Optional[Tensor] = None,
        text_attention_mask: Optional[Tensor] = None,
    ):
        """
        Input:
            - tgt: nq, bs, d_model
            - memory: hw, bs, d_model
            - pos: hw, bs, d_model
            - refpoints_unsigmoid: nq, bs, 2/4
            - valid_ratios/spatial_shapes: bs, nlevel, 2

        Returns:
            A two-element list: the per-layer (normalised) outputs and the
            reference points (initial ones first), each transposed so that
            the first two dimensions are swapped.
        """
        output = tgt

        intermediate = []
        reference_points = refpoints_unsigmoid.sigmoid()
        ref_points = [reference_points]

        for layer_id, layer in enumerate(self.layers):

            if reference_points.shape[-1] == 4:
                reference_points_input = (
                    reference_points[:, :, None]
                    * torch.cat([valid_ratios, valid_ratios], -1)[None, :]
                )  # nq, bs, nlevel, 4
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
            query_sine_embed = gen_sineembed_for_position(
                reference_points_input[:, :, 0, :]
            )  # nq, bs, 256*2

            # conditional query
            raw_query_pos = self.ref_point_head(query_sine_embed)  # nq, bs, 256
            pos_scale = self.query_scale(output) if self.query_scale is not None else 1
            query_pos = pos_scale * raw_query_pos

            # main process
            output = layer(
                tgt=output,
                tgt_query_pos=query_pos,
                tgt_query_sine_embed=query_sine_embed,
                tgt_key_padding_mask=tgt_key_padding_mask,
                tgt_reference_points=reference_points_input,
                memory_text=memory_text,
                text_attention_mask=text_attention_mask,
                memory=memory,
                memory_key_padding_mask=memory_key_padding_mask,
                memory_level_start_index=level_start_index,
                memory_spatial_shapes=spatial_shapes,
                memory_pos=pos,
                self_attn_mask=tgt_mask,
                cross_attn_mask=memory_mask,
            )
            # Diagnostic only: report non-finite activations, keep going.
            if output.isnan().any() | output.isinf().any():
                print(f"output layer_id {layer_id} is nan")
                try:
                    num_nan = output.isnan().sum().item()
                    num_inf = output.isinf().sum().item()
                    print(f"num_nan {num_nan}, num_inf {num_inf}")
                except Exception as e:
                    # Best effort: never let the diagnostics abort the pass.
                    print(e)

            # iter update
            if self.bbox_embed is not None:
                reference_before_sigmoid = inverse_sigmoid(reference_points)
                delta_unsig = self.bbox_embed[layer_id](output)
                outputs_unsig = delta_unsig + reference_before_sigmoid
                new_reference_points = outputs_unsig.sigmoid()

                # The next layer sees a detached copy; the attached tensor is
                # what gets returned.
                reference_points = new_reference_points.detach()
                ref_points.append(new_reference_points)

            # ``norm`` defaults to None in __init__; fall back to the raw
            # output instead of failing on a call to None.
            intermediate.append(self.norm(output) if self.norm is not None else output)

        return [
            [itm_out.transpose(0, 1) for itm_out in intermediate],
            [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points],
        ]
745
-
746
-
747
class DeformableTransformerEncoderLayer(nn.Module):
    """Encoder layer: multi-scale deformable self-attention followed by a
    feed-forward network whose two linears are LoRA-adapted.

    Args:
        d_model: feature width.
        d_ffn: hidden width of the feed-forward network.
        dropout: dropout probability used after attention and inside the FFN.
        activation: name passed to ``_get_activation_fn``.
        n_levels, n_heads, n_points: forwarded to the deformable attention.
        lora_rank: rank ``r`` of both ``lora.Linear`` layers (was hard-coded
            to 12, which stays the default).
    """

    def __init__(
        self,
        d_model=256,
        d_ffn=1024,
        dropout=0.1,
        activation="relu",
        n_levels=4,
        n_heads=8,
        n_points=4,
        lora_rank=12,
    ):
        super().__init__()

        # self attention
        self.self_attn = MSDeformAttn(
            embed_dim=d_model,
            num_levels=n_levels,
            num_heads=n_heads,
            num_points=n_points,
            batch_first=True,
        )
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = lora.Linear(d_model, d_ffn, r=lora_rank)
        self.activation = _get_activation_fn(activation, d_model=d_ffn)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = lora.Linear(d_ffn, d_model, r=lora_rank)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, src):
        """Feed-forward sub-block with residual connection and post-norm."""
        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
        src = src + self.dropout3(src2)
        src = self.norm2(src)
        return src

    def forward(
        self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None
    ):
        """Apply deformable self-attention (query = src + pos, value = src),
        residual + norm, then the feed-forward sub-block."""
        # self attention
        src2 = self.self_attn(
            query=self.with_pos_embed(src, pos),
            reference_points=reference_points,
            value=src,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            key_padding_mask=key_padding_mask,
        )
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        # ffn
        src = self.forward_ffn(src)

        return src
809
-
810
-
811
class DeformableTransformerDecoderLayer(nn.Module):
    """Decoder layer: query self-attention, optional text cross-attention,
    multi-scale deformable cross-attention over the image memory, and a
    feed-forward network whose two linears are LoRA-adapted.

    Args:
        d_model: feature width.
        d_ffn: hidden width of the feed-forward network.
        dropout: dropout probability; ``nn.Identity`` is used when it is 0.
        activation: name passed to ``_get_activation_fn``.
        n_levels, n_heads, n_points: forwarded to the deformable attention.
        use_text_feat_guide: must be False (asserted).
        use_text_cross_attention: add the text cross-attention sub-block.
        lora_rank: rank ``r`` of both ``lora.Linear`` layers (was hard-coded
            to 12, which stays the default).
    """

    def __init__(
        self,
        d_model=256,
        d_ffn=1024,
        dropout=0.1,
        activation="relu",
        n_levels=4,
        n_heads=8,
        n_points=4,
        use_text_feat_guide=False,
        use_text_cross_attention=False,
        lora_rank=12,
    ):
        super().__init__()

        # cross attention
        self.cross_attn = MSDeformAttn(
            embed_dim=d_model,
            num_levels=n_levels,
            num_heads=n_heads,
            num_points=n_points,
            batch_first=True,
        )
        self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm1 = nn.LayerNorm(d_model)

        # cross attention text
        if use_text_cross_attention:
            self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
            self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
            self.catext_norm = nn.LayerNorm(d_model)

        # self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm2 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = lora.Linear(d_model, d_ffn, r=lora_rank)
        self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1)
        self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.linear2 = lora.Linear(d_ffn, d_model, r=lora_rank)
        self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
        self.norm3 = nn.LayerNorm(d_model)

        self.key_aware_proj = None
        self.use_text_feat_guide = use_text_feat_guide
        assert not use_text_feat_guide
        self.use_text_cross_attention = use_text_cross_attention

    def rm_self_attn_modules(self):
        """Disable the self-attention sub-block by dropping its modules."""
        self.self_attn = None
        self.dropout2 = None
        self.norm2 = None

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        """Feed-forward sub-block with residual connection and post-norm.

        The two linears and the activation run with CUDA autocast disabled.
        """
        with torch.cuda.amp.autocast(enabled=False):
            tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward(
        self,
        # for tgt
        tgt: Optional[Tensor],  # nq, bs, d_model
        tgt_query_pos: Optional[Tensor] = None,  # pos for query. MLP(Sine(pos))
        tgt_query_sine_embed: Optional[Tensor] = None,  # pos for query. Sine(pos)
        tgt_key_padding_mask: Optional[Tensor] = None,
        tgt_reference_points: Optional[Tensor] = None,  # nq, bs, 4
        memory_text: Optional[Tensor] = None,  # bs, num_token, d_model
        text_attention_mask: Optional[Tensor] = None,  # bs, num_token
        # for memory
        memory: Optional[Tensor] = None,  # hw, bs, d_model
        memory_key_padding_mask: Optional[Tensor] = None,
        memory_level_start_index: Optional[Tensor] = None,  # num_levels
        memory_spatial_shapes: Optional[Tensor] = None,  # bs, num_levels, 2
        memory_pos: Optional[Tensor] = None,  # pos for memory
        # sa
        self_attn_mask: Optional[Tensor] = None,  # mask used for self-attention
        cross_attn_mask: Optional[Tensor] = None,  # mask used for cross-attention
    ):
        """
        Input:
            - tgt/tgt_query_pos: nq, bs, d_model
            - cross_attn_mask must be None (asserted).

        ``tgt_query_sine_embed``, ``tgt_key_padding_mask`` and ``memory_pos``
        are accepted but not used by this layer.
        """
        assert cross_attn_mask is None

        # self attention
        if self.self_attn is not None:
            q = k = self.with_pos_embed(tgt, tgt_query_pos)
            tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0]
            tgt = tgt + self.dropout2(tgt2)
            tgt = self.norm2(tgt)

        if self.use_text_cross_attention:
            # Queries attend to the text tokens (key and value are the same).
            tgt2 = self.ca_text(
                self.with_pos_embed(tgt, tgt_query_pos),
                memory_text.transpose(0, 1),
                memory_text.transpose(0, 1),
                key_padding_mask=text_attention_mask,
            )[0]
            tgt = tgt + self.catext_dropout(tgt2)
            tgt = self.catext_norm(tgt)

        # The deformable attention was built with batch_first=True, hence the
        # transposes in and out.
        tgt2 = self.cross_attn(
            query=self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1),
            reference_points=tgt_reference_points.transpose(0, 1).contiguous(),
            value=memory.transpose(0, 1),
            spatial_shapes=memory_spatial_shapes,
            level_start_index=memory_level_start_index,
            key_padding_mask=memory_key_padding_mask,
        ).transpose(0, 1)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # ffn
        tgt = self.forward_ffn(tgt)

        return tgt
938
-
939
-
940
def build_transformer(args):
    """Construct a ``Transformer`` from the options stored on ``args``.

    Most constructor arguments are copied from the attribute of ``args``
    with the matching (or closely related) name; ``return_intermediate_dec``
    and ``learnable_tgt_init`` are always passed as True.
    """
    config = dict(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        num_queries=args.num_queries,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
        query_dim=args.query_dim,
        activation=args.transformer_activation,
        num_patterns=args.num_patterns,
        num_feature_levels=args.num_feature_levels,
        enc_n_points=args.enc_n_points,
        dec_n_points=args.dec_n_points,
        learnable_tgt_init=True,
        # two stage
        two_stage_type=args.two_stage_type,  # ['no', 'standard', 'early']
        embed_init_tgt=args.embed_init_tgt,
        use_text_enhancer=args.use_text_enhancer,
        use_fusion_layer=args.use_fusion_layer,
        use_checkpoint=args.use_checkpoint,
        use_transformer_ckpt=args.use_transformer_ckpt,
        use_text_cross_attention=args.use_text_cross_attention,
        text_dropout=args.text_dropout,
        fusion_dropout=args.fusion_dropout,
        fusion_droppath=args.fusion_droppath,
    )
    return Transformer(**config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/transformer_vanilla-checkpoint.py DELETED
@@ -1,125 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
8
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9
- """
10
- DETR Transformer class.
11
-
12
- Copy-paste from torch.nn.Transformer with modifications:
13
- * positional encodings are passed in MHattention
14
- * extra LN at the end of encoder is removed
15
- * decoder returns a stack of activations from all decoding layers
16
- """
17
- from typing import Optional
18
-
19
- import torch
20
- import torch.nn.functional as F
21
- from torch import Tensor, nn
22
- import loralib as lora
23
-
24
- from .utils import (
25
- MLP,
26
- _get_activation_fn,
27
- _get_clones,
28
- gen_encoder_output_proposals,
29
- gen_sineembed_for_position,
30
- sigmoid_focal_loss,
31
- )
32
-
33
-
34
class TextTransformer(nn.Module):
    """A stack of ``num_layers`` independent ``TransformerEncoderLayer`` copies.

    ``self.norm`` is always None here, so no final normalisation is applied.
    """

    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.nheads = nheads
        self.dim_feedforward = dim_feedforward
        self.norm = None

        # One prototype layer is built and then cloned num_layers times.
        prototype = TransformerEncoderLayer(
            d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout
        )
        self.layers = _get_clones(prototype, num_layers)

    def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor):
        """Encode text features with every layer in turn.

        Args:
            memory_text: bs, num_token, d_model
            text_attention_mask: bs, num_token; handed to each layer as
                ``src_key_padding_mask`` (no ``src_mask`` is supplied).

        Returns:
            output: bs, num_token, d_model
        """
        # The layers work sequence-first, so swap batch and token axes.
        hidden = memory_text.transpose(0, 1)

        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_key_padding_mask=text_attention_mask)

        if self.norm is not None:
            hidden = self.norm(hidden)

        return hidden.transpose(0, 1)
71
-
72
-
73
class TransformerEncoderLayer(nn.Module):
    """Transformer encoder layer whose feed-forward projections are LoRA linears.

    The layer applies self-attention and a two-layer feed-forward network,
    each followed by dropout, a residual add and a LayerNorm (post-norm).
    ``normalize_before`` is stored but not consulted by ``forward``.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        r = 12  # LoRA rank used for both feed-forward projections
        # Implementation of Feedforward model
        self.linear1 = lora.Linear(d_model, dim_feedforward, r=r)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = lora.Linear(dim_feedforward, d_model, r=r)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before
        self.nhead = nhead

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        return tensor if pos is None else tensor + pos

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        """Encode ``src``.

        Args:
            src: input sequence; indexed as (num_token, bs, d_model) by the
                mask handling below.
            src_mask: optional attention mask. A 3-D mask whose first
                dimension equals the batch size is treated as one mask per
                sample and expanded to one mask per (sample, head).
            src_key_padding_mask: accepted for interface compatibility but
                not forwarded to the attention module (the call that used
                it is disabled in this file).
            pos: optional positional embedding added to queries and keys.

        Returns:
            Tensor with the same shape as ``src``.
        """
        # ``src_mask`` defaults to None, so it must be checked before use.
        if src_mask is not None and src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
            # bs, num_q, num_k -> bs * nhead, num_q, num_k
            # nn.MultiheadAttention lays the leading dimension out batch-major
            # (index = sample * nhead + head), so every sample's mask has to be
            # repeated consecutively; a plain ``repeat`` would tile whole
            # batches and hand some heads the mask of a different sample.
            src_mask = src_mask.repeat_interleave(self.nhead, dim=0)

        q = k = self.with_pos_embed(src, pos)

        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]

        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/.ipynb_checkpoints/utils-checkpoint.py DELETED
@@ -1,274 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
-
8
- import copy
9
- import math
10
-
11
- import torch
12
- import torch.nn.functional as F
13
- from torch import Tensor, nn
14
- import loralib as lora
15
-
16
- def _get_clones(module, N, layer_share=False):
17
- # import ipdb; ipdb.set_trace()
18
- if layer_share:
19
- return nn.ModuleList([module for i in range(N)])
20
- else:
21
- return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
22
-
23
-
24
def get_sine_pos_embed(
    pos_tensor: torch.Tensor,
    num_pos_feats: int = 128,
    temperature: int = 10000,
    exchange_xy: bool = True,
):
    """Generate a sine/cosine position embedding from a position tensor.

    Args:
        pos_tensor (torch.Tensor): shape: [..., n].
        num_pos_feats (int): projected size for each float in the tensor.
        temperature (int): temperature in the sine/cosine function.
        exchange_xy (bool, optional): swap the embeddings of the first two
            coordinates, e.g. an input of [x, y] yields [pos(y), pos(x)].
            Defaults to True.

    Returns:
        pos_embed (torch.Tensor): shape: [..., n*num_pos_feats].
    """
    two_pi = 2 * math.pi
    feat_index = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
    # Consecutive feature pairs (2k, 2k+1) share one frequency.
    dim_t = temperature ** (2 * torch.div(feat_index, 2, rounding_mode="floor") / num_pos_feats)

    def _embed(coord: torch.Tensor):
        angles = coord * two_pi / dim_t
        sines = angles[..., 0::2].sin()
        cosines = angles[..., 1::2].cos()
        # Interleave sin/cos along the feature axis.
        return torch.stack((sines, cosines), dim=3).flatten(2)

    coords = pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)
    embedded = [_embed(coord) for coord in coords]
    if exchange_xy:
        embedded[0], embedded[1] = embedded[1], embedded[0]
    return torch.cat(embedded, dim=-1)
54
-
55
-
56
def gen_encoder_output_proposals(
    memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None
):
    """Build one box proposal per memory position and mask unusable positions.

    Input:
        - memory: bs, sum(hw), d_model
        - memory_padding_mask: bs, sum(hw)
        - spatial_shapes: nlevel, 2
        - learnedwh: 2
    Output:
        - output_memory: bs, sum(hw), d_model -- ``memory`` with padded
          positions and positions whose proposal is out of range set to 0
        - output_proposals: bs, sum(hw), 4 -- inverse-sigmoid (logit) of the
          (cx, cy, w, h) proposals, ``inf`` at the masked positions
    """
    batch_size, _, _ = memory.shape
    device = memory.device
    per_level = []
    offset = 0
    for level, (height, width) in enumerate(spatial_shapes):
        level_mask = memory_padding_mask[:, offset : (offset + height * width)].view(
            batch_size, height, width, 1
        )
        # Unpadded extent of each sample, counted along the first column / row.
        valid_h = torch.sum(~level_mask[:, :, 0, 0], 1)
        valid_w = torch.sum(~level_mask[:, 0, :, 0], 1)

        ys = torch.linspace(0, height - 1, height, dtype=torch.float32, device=device)
        xs = torch.linspace(0, width - 1, width, dtype=torch.float32, device=device)
        grid_y, grid_x = torch.meshgrid(ys, xs)
        centers = torch.stack((grid_x, grid_y), -1)  # height, width, 2

        # Cell centres normalised by the valid extent of each sample.
        extent = torch.stack((valid_w, valid_h), 1).view(batch_size, 1, 1, 2)
        centers = (centers.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / extent

        # Box size doubles with every level; the base is learned or 0.05.
        base_wh = 0.05 if learnedwh is None else learnedwh.sigmoid()
        sizes = torch.ones_like(centers) * base_wh * (2.0**level)

        per_level.append(torch.cat((centers, sizes), -1).view(batch_size, -1, 4))
        offset += height * width

    proposals = torch.cat(per_level, 1)
    in_range = ((proposals > 0.01) & (proposals < 0.99)).all(-1, keepdim=True)
    padded = memory_padding_mask.unsqueeze(-1)

    output_proposals = torch.log(proposals / (1 - proposals))  # unsigmoid
    output_proposals = output_proposals.masked_fill(padded, float("inf"))
    output_proposals = output_proposals.masked_fill(~in_range, float("inf"))

    output_memory = memory.masked_fill(padded, float(0))
    output_memory = output_memory.masked_fill(~in_range, float(0))

    return output_memory, output_proposals
117
-
118
-
119
class RandomBoxPerturber:
    """Multiplicatively jitter reference anchors with uniform random noise."""

    def __init__(
        self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2
    ) -> None:
        # One noise scale per box coordinate, in (x, y, w, h) order.
        scales = [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]
        self.noise_scale = torch.Tensor(scales)

    def __call__(self, refanchors: Tensor) -> Tensor:
        """Return a perturbed copy of ``refanchors`` clamped to [0, 1].

        ``refanchors`` is unpacked as (nq, bs, query_dim); only the first
        ``query_dim`` noise scales are applied. Each value is multiplied by
        ``1 + (u - 0.5) * scale`` with ``u`` drawn uniformly from [0, 1).
        """
        _, _, query_dim = refanchors.shape

        uniform = torch.rand_like(refanchors)
        scale = self.noise_scale.to(refanchors.device)[:query_dim]

        perturbed = refanchors * (1 + (uniform - 0.5) * scale)
        return perturbed.clamp_(0, 1)
136
-
137
-
138
def sigmoid_focal_loss(
    inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False
):
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions (logits) for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                 (0 for the negative class and 1 for the positive class).
        num_boxes: Divisor applied to the reduced loss.
        alpha: (optional) Weighting factor to balance positive vs negative
               examples. Default = 0.25; a negative value disables the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
        no_reduction: When true, return the element-wise loss unreduced.
    Returns:
        Loss tensor: element-wise when ``no_reduction`` is set, otherwise the
        mean over dim 1, summed, and divided by ``num_boxes``.
    """
    probabilities = inputs.sigmoid()
    negatives = 1 - targets
    # p_t is the probability the model assigns to the true class.
    p_t = probabilities * targets + (1 - probabilities) * negatives
    cross_entropy = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    focal = cross_entropy * ((1 - p_t) ** gamma)

    if alpha >= 0:
        class_weight = alpha * targets + (1 - alpha) * negatives
        focal = class_weight * focal

    if no_reduction:
        return focal

    return focal.mean(1).sum() / num_boxes
169
-
170
-
171
class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN) built from LoRA linears."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        r = 12  # LoRA rank shared by every layer
        self.num_layers = num_layers
        hidden_sizes = [hidden_dim] * (num_layers - 1)
        in_sizes = [input_dim] + hidden_sizes
        out_sizes = hidden_sizes + [output_dim]
        self.layers = nn.ModuleList(
            lora.Linear(fan_in, fan_out, r=r) for fan_in, fan_out in zip(in_sizes, out_sizes)
        )

    def forward(self, x):
        """Apply every layer in order, with ReLU after all but the last one."""
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x
187
-
188
-
189
- def _get_activation_fn(activation, d_model=256, batch_dim=0):
190
- """Return an activation function given a string"""
191
- if activation == "relu":
192
- return F.relu
193
- if activation == "gelu":
194
- return F.gelu
195
- if activation == "glu":
196
- return F.glu
197
- if activation == "prelu":
198
- return nn.PReLU()
199
- if activation == "selu":
200
- return F.selu
201
-
202
- raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
203
-
204
-
205
def gen_sineembed_for_position(pos_tensor):
    """Sine/cosine-embed each coordinate of a 3-D position tensor.

    Every coordinate is mapped to 128 features (temperature 10000). The last
    dimension of ``pos_tensor`` must be 2 (x, y) or 4 (x, y, w, h); the
    result concatenates the embeddings as (y, x) or (y, x, w, h), giving a
    last dimension of 256 or 512.

    Raises:
        ValueError: if the last dimension is neither 2 nor 4.
    """
    scale = 2 * math.pi
    dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device)
    dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / 128)

    def _encode(channel):
        # sin on even feature slots, cos on odd ones, interleaved.
        scaled = pos_tensor[:, :, channel] * scale
        angles = scaled[:, :, None] / dim_t
        return torch.stack(
            (angles[:, :, 0::2].sin(), angles[:, :, 1::2].cos()), dim=3
        ).flatten(2)

    pos_x = _encode(0)
    pos_y = _encode(1)
    last_dim = pos_tensor.size(-1)
    if last_dim == 2:
        return torch.cat((pos_y, pos_x), dim=2)
    if last_dim == 4:
        pos_w = _encode(2)
        pos_h = _encode(3)
        return torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
    raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
232
-
233
-
234
class ContrastiveEmbed(nn.Module):
    """Score features against text tokens by dot product, padded to a fixed width."""

    def __init__(self, max_text_len=256):
        """
        Args:
            max_text_len: max length of text; the width of the returned scores.
        """
        super().__init__()
        self.max_text_len = max_text_len

    def forward(self, x, text_dict):
        """Compute feature-to-token similarity scores.

        Args:
            x: features whose last dimension matches the text embedding size.
            text_dict: dict with
                'encoded_text': token embeddings, bs, num_token, d_model
                'text_token_mask': bs, num_token; True for used tokens,
                    False for padding tokens

        Returns:
            Scores of shape (*x.shape[:-1], max_text_len). Padding tokens and
            the columns beyond num_token hold -inf.
        """
        assert isinstance(text_dict, dict)

        encoded_text = text_dict["encoded_text"]
        token_mask = text_dict["text_token_mask"]

        similarity = x @ encoded_text.transpose(-1, -2)
        # Set the scores of unused (padding) tokens to -inf so that they do
        # not contribute to the similarity.
        similarity.masked_fill_(~token_mask[:, None, :], float("-inf"))

        # padding to max_text_len
        padded_shape = (*similarity.shape[:-1], self.max_text_len)
        padded = torch.full(padded_shape, float("-inf"), device=similarity.device)
        padded[..., : similarity.shape[-1]] = similarity

        return padded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (243 Bytes)
 
groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc DELETED
Binary file (7.21 kB)
 
groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc DELETED
Binary file (7.88 kB)
 
groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc DELETED
Binary file (25.3 kB)
 
groundingdino/models/GroundingDINO/__pycache__/matcher.cpython-310.pyc DELETED
Binary file (6.69 kB)
 
groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc DELETED
Binary file (11.8 kB)
 
groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc DELETED
Binary file (19.5 kB)
 
groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc DELETED
Binary file (3.51 kB)
 
groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc DELETED
Binary file (9.62 kB)
 
groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/__init__-checkpoint.py DELETED
@@ -1 +0,0 @@
1
- from .backbone import build_backbone
 
 
groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/backbone-checkpoint.py DELETED
@@ -1,221 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # Conditional DETR
8
- # Copyright (c) 2021 Microsoft. All Rights Reserved.
9
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
- # ------------------------------------------------------------------------
11
- # Copied from DETR (https://github.com/facebookresearch/detr)
12
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
13
- # ------------------------------------------------------------------------
14
-
15
- """
16
- Backbone modules.
17
- """
18
-
19
- from typing import Dict, List
20
-
21
- import torch
22
- import torch.nn.functional as F
23
- import torchvision
24
- from torch import nn
25
- from torchvision.models._utils import IntermediateLayerGetter
26
-
27
- from groundingdino.util.misc import NestedTensor, clean_state_dict, is_main_process
28
-
29
- from .position_encoding import build_position_encoding
30
- from .swin_transformer import build_swin_transformer
31
-
32
-
33
class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super().__init__()
        # Everything is a buffer (not a Parameter), so nothing here is trained.
        initial_buffers = (
            ("weight", torch.ones),
            ("bias", torch.zeros),
            ("running_mean", torch.zeros),
            ("running_var", torch.ones),
        )
        for buffer_name, make in initial_buffers:
            self.register_buffer(buffer_name, make(n))

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # This module has no ``num_batches_tracked`` buffer; drop the entry so
        # loading does not see an unexpected key.
        state_dict.pop(prefix + "num_batches_tracked", None)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def forward(self, x):
        """Apply the fixed per-channel affine normalisation to ``x``."""
        # move reshapes to the beginning
        # to make it fuser-friendly
        channel_shape = (1, -1, 1, 1)
        weight = self.weight.reshape(channel_shape)
        offset = self.bias.reshape(channel_shape)
        variance = self.running_var.reshape(channel_shape)
        mean = self.running_mean.reshape(channel_shape)
        eps = 1e-5
        scale = weight * (variance + eps).rsqrt()
        shift = offset - mean * scale
        return x * scale + shift
71
-
72
-
73
class BackboneBase(nn.Module):
    """Wrap a backbone so that it returns selected intermediate feature maps."""

    def __init__(
        self,
        backbone: nn.Module,
        train_backbone: bool,
        num_channels: int,
        return_interm_indices: list,
    ):
        super().__init__()
        # A parameter stays trainable only when the backbone is being trained
        # AND its name mentions layer2, layer3 or layer4; all others are frozen.
        trainable_stages = ("layer2", "layer3", "layer4")
        for name, parameter in backbone.named_parameters():
            keep_trainable = train_backbone and any(stage in name for stage in trainable_stages)
            if not keep_trainable:
                parameter.requires_grad_(False)

        # The requested indices map onto the last len(indices) stages,
        # i.e. the stage names end at "layer4".
        first_stage = 5 - len(return_interm_indices)
        return_layers = {
            "layer{}".format(first_stage + idx): "{}".format(layer_index)
            for idx, layer_index in enumerate(return_interm_indices)
        }

        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels

    def forward(self, tensor_list: NestedTensor):
        """Return a dict of ``NestedTensor`` features keyed by output name.

        The mask of ``tensor_list`` is resized to each feature map's spatial
        size and converted back to bool.
        """
        feature_maps = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, feature in feature_maps.items():
            padding = tensor_list.mask
            assert padding is not None
            resized = F.interpolate(padding[None].float(), size=feature.shape[-2:])
            out[name] = NestedTensor(feature, resized.to(torch.bool)[0])
        return out
117
-
118
-
119
class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""

    def __init__(
        self,
        name: str,
        train_backbone: bool,
        dilation: bool,
        return_interm_indices: list,
        batch_norm=FrozenBatchNorm2d,
    ):
        if name not in ["resnet18", "resnet34", "resnet50", "resnet101"]:
            raise NotImplementedError("Why you can get here with name {}".format(name))

        # Pretrained weights are requested only when is_main_process() is true.
        resnet_factory = getattr(torchvision.models, name)
        backbone = resnet_factory(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(),
            norm_layer=batch_norm,
        )

        assert name not in ("resnet18", "resnet34"), "Only resnet50 and resnet101 are available."
        assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
        # Channel widths of the four stages; keep the last len(indices) of them.
        stage_channels = [256, 512, 1024, 2048]
        num_channels = stage_channels[4 - len(return_interm_indices) :]
        super().__init__(backbone, train_backbone, num_channels, return_interm_indices)
144
-
145
-
146
class Joiner(nn.Sequential):
    """Pair a backbone (index 0) with a position-embedding module (index 1)."""

    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        """Return ``(features, pos)``: two lists aligned by index.

        ``features`` holds the backbone outputs in iteration order; ``pos``
        holds the position encoding of each one, cast to the dtype of that
        feature's ``tensors``.
        """
        backbone, position_embedding = self[0], self[1]
        features: List[NestedTensor] = []
        pos = []
        for feature in backbone(tensor_list).values():
            features.append(feature)
            # position encoding
            pos.append(position_embedding(feature).to(feature.tensors.dtype))

        return features, pos
160
-
161
-
162
def build_backbone(args):
    """Build the backbone + position-encoding ``Joiner`` described by ``args``.

    Useful args:
        - backbone: backbone name
        - lr_backbone:
        - dilation
        - return_interm_indices: available: [0,1,2,3], [1,2,3], [3]
        - backbone_freeze_keywords:
        - use_checkpoint: for swin only for now

    Returns:
        A ``Joiner`` whose ``num_channels`` attribute lists the channel
        count of every returned feature level.
    """
    position_embedding = build_position_encoding(args)
    # Backbone training is hard-wired on in this function.
    train_backbone = True
    return_interm_indices = args.return_interm_indices
    assert return_interm_indices in [[0, 1, 2, 3], [1, 2, 3], [3]]
    # The value is not used, but the access fails if the attribute is missing.
    args.backbone_freeze_keywords
    use_checkpoint = getattr(args, "use_checkpoint", False)

    resnet_names = ["resnet50", "resnet101"]
    swin_names = [
        "swin_T_224_1k",
        "swin_B_224_22k",
        "swin_B_384_22k",
        "swin_L_224_22k",
        "swin_L_384_22k",
    ]

    if args.backbone in resnet_names:
        backbone = Backbone(
            args.backbone,
            train_backbone,
            args.dilation,
            return_interm_indices,
            batch_norm=FrozenBatchNorm2d,
        )
        bb_num_channels = backbone.num_channels
    elif args.backbone in swin_names:
        # The name encodes the pretraining image size as its second-to-last field.
        pretrain_img_size = int(args.backbone.split("_")[-2])
        backbone = build_swin_transformer(
            args.backbone,
            pretrain_img_size=pretrain_img_size,
            out_indices=tuple(return_interm_indices),
            dilation=False,
            use_checkpoint=use_checkpoint,
        )
        bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :]
    else:
        raise NotImplementedError("Unknown backbone {}".format(args.backbone))

    assert len(bb_num_channels) == len(
        return_interm_indices
    ), f"len(bb_num_channels) {len(bb_num_channels)} != len(return_interm_indices) {len(return_interm_indices)}"

    model = Joiner(backbone, position_embedding)
    model.num_channels = bb_num_channels
    assert isinstance(
        bb_num_channels, List
    ), "bb_num_channels is expected to be a List but {}".format(type(bb_num_channels))
    return model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/position_encoding-checkpoint.py DELETED
@@ -1,186 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # DINO
8
- # Copyright (c) 2022 IDEA. All Rights Reserved.
9
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
- # ------------------------------------------------------------------------
11
- # Conditional DETR
12
- # Copyright (c) 2021 Microsoft. All Rights Reserved.
13
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
14
- # ------------------------------------------------------------------------
15
- # Copied from DETR (https://github.com/facebookresearch/detr)
16
- # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
17
- # ------------------------------------------------------------------------
18
-
19
- """
20
- Various positional encodings for the transformer.
21
- """
22
- import math
23
-
24
- import torch
25
- from torch import nn
26
-
27
- from groundingdino.util.misc import NestedTensor
28
-
29
-
30
- class PositionEmbeddingSine(nn.Module):
31
- """
32
- This is a more standard version of the position embedding, very similar to the one
33
- used by the Attention is all you need paper, generalized to work on images.
34
- """
35
-
36
- def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
37
- super().__init__()
38
- self.num_pos_feats = num_pos_feats
39
- self.temperature = temperature
40
- self.normalize = normalize
41
- if scale is not None and normalize is False:
42
- raise ValueError("normalize should be True if scale is passed")
43
- if scale is None:
44
- scale = 2 * math.pi
45
- self.scale = scale
46
-
47
- def forward(self, tensor_list: NestedTensor):
48
- x = tensor_list.tensors
49
- mask = tensor_list.mask
50
- assert mask is not None
51
- not_mask = ~mask
52
- y_embed = not_mask.cumsum(1, dtype=torch.float32)
53
- x_embed = not_mask.cumsum(2, dtype=torch.float32)
54
- if self.normalize:
55
- eps = 1e-6
56
- # if os.environ.get("SHILONG_AMP", None) == '1':
57
- # eps = 1e-4
58
- # else:
59
- # eps = 1e-6
60
- y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
61
- x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
62
-
63
- dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
64
- dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
65
-
66
- pos_x = x_embed[:, :, :, None] / dim_t
67
- pos_y = y_embed[:, :, :, None] / dim_t
68
- pos_x = torch.stack(
69
- (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
70
- ).flatten(3)
71
- pos_y = torch.stack(
72
- (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
73
- ).flatten(3)
74
- pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
75
- return pos
76
-
77
-
78
- class PositionEmbeddingSineHW(nn.Module):
79
- """
80
- This is a more standard version of the position embedding, very similar to the one
81
- used by the Attention is all you need paper, generalized to work on images.
82
- """
83
-
84
- def __init__(
85
- self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None
86
- ):
87
- super().__init__()
88
- self.num_pos_feats = num_pos_feats
89
- self.temperatureH = temperatureH
90
- self.temperatureW = temperatureW
91
- self.normalize = normalize
92
- if scale is not None and normalize is False:
93
- raise ValueError("normalize should be True if scale is passed")
94
- if scale is None:
95
- scale = 2 * math.pi
96
- self.scale = scale
97
-
98
- def forward(self, tensor_list: NestedTensor):
99
- x = tensor_list.tensors
100
- mask = tensor_list.mask
101
- assert mask is not None
102
- not_mask = ~mask
103
- y_embed = not_mask.cumsum(1, dtype=torch.float32)
104
- x_embed = not_mask.cumsum(2, dtype=torch.float32)
105
-
106
- # import ipdb; ipdb.set_trace()
107
-
108
- if self.normalize:
109
- eps = 1e-6
110
- y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
111
- x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
112
-
113
- dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
114
- dim_tx = self.temperatureW ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats)
115
- pos_x = x_embed[:, :, :, None] / dim_tx
116
-
117
- dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
118
- dim_ty = self.temperatureH ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats)
119
- pos_y = y_embed[:, :, :, None] / dim_ty
120
-
121
- pos_x = torch.stack(
122
- (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
123
- ).flatten(3)
124
- pos_y = torch.stack(
125
- (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
126
- ).flatten(3)
127
- pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
128
-
129
- # import ipdb; ipdb.set_trace()
130
-
131
- return pos
132
-
133
-
134
- class PositionEmbeddingLearned(nn.Module):
135
- """
136
- Absolute pos embedding, learned.
137
- """
138
-
139
- def __init__(self, num_pos_feats=256):
140
- super().__init__()
141
- self.row_embed = nn.Embedding(50, num_pos_feats)
142
- self.col_embed = nn.Embedding(50, num_pos_feats)
143
- self.reset_parameters()
144
-
145
- def reset_parameters(self):
146
- nn.init.uniform_(self.row_embed.weight)
147
- nn.init.uniform_(self.col_embed.weight)
148
-
149
- def forward(self, tensor_list: NestedTensor):
150
- x = tensor_list.tensors
151
- h, w = x.shape[-2:]
152
- i = torch.arange(w, device=x.device)
153
- j = torch.arange(h, device=x.device)
154
- x_emb = self.col_embed(i)
155
- y_emb = self.row_embed(j)
156
- pos = (
157
- torch.cat(
158
- [
159
- x_emb.unsqueeze(0).repeat(h, 1, 1),
160
- y_emb.unsqueeze(1).repeat(1, w, 1),
161
- ],
162
- dim=-1,
163
- )
164
- .permute(2, 0, 1)
165
- .unsqueeze(0)
166
- .repeat(x.shape[0], 1, 1, 1)
167
- )
168
- return pos
169
-
170
-
171
- def build_position_encoding(args):
172
- N_steps = args.hidden_dim // 2
173
- if args.position_embedding in ("v2", "sine"):
174
- # TODO find a better way of exposing other arguments
175
- position_embedding = PositionEmbeddingSineHW(
176
- N_steps,
177
- temperatureH=args.pe_temperatureH,
178
- temperatureW=args.pe_temperatureW,
179
- normalize=True,
180
- )
181
- elif args.position_embedding in ("v3", "learned"):
182
- position_embedding = PositionEmbeddingLearned(N_steps)
183
- else:
184
- raise ValueError(f"not supported {args.position_embedding}")
185
-
186
- return position_embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/backbone/.ipynb_checkpoints/swin_transformer-checkpoint.py DELETED
@@ -1,804 +0,0 @@
1
- # ------------------------------------------------------------------------
2
- # Grounding DINO
3
- # url: https://github.com/IDEA-Research/GroundingDINO
4
- # Copyright (c) 2023 IDEA. All Rights Reserved.
5
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
- # ------------------------------------------------------------------------
7
- # DINO
8
- # Copyright (c) 2022 IDEA. All Rights Reserved.
9
- # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
10
- # --------------------------------------------------------
11
- # modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py
12
- # --------------------------------------------------------
13
-
14
- import numpy as np
15
- import torch
16
- import torch.nn as nn
17
- import torch.nn.functional as F
18
- import torch.utils.checkpoint as checkpoint
19
- from timm.models.layers import DropPath, to_2tuple, trunc_normal_
20
- import loralib as lora
21
- from groundingdino.util.misc import NestedTensor
22
-
23
-
24
- class Mlp(nn.Module):
25
- """Multilayer perceptron."""
26
-
27
- def __init__(
28
- self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
29
- ):
30
- super().__init__()
31
- r = 12
32
- out_features = out_features or in_features
33
- hidden_features = hidden_features or in_features
34
- self.fc1 = lora.Linear(in_features, hidden_features , r=r)
35
- self.act = act_layer()
36
- self.fc2 = lora.Linear(hidden_features, out_features , r=r)
37
- self.drop = nn.Dropout(drop)
38
-
39
- def forward(self, x):
40
- x = self.fc1(x)
41
- x = self.act(x)
42
- x = self.drop(x)
43
- x = self.fc2(x)
44
- x = self.drop(x)
45
- return x
46
-
47
-
48
- def window_partition(x, window_size):
49
- """
50
- Args:
51
- x: (B, H, W, C)
52
- window_size (int): window size
53
- Returns:
54
- windows: (num_windows*B, window_size, window_size, C)
55
- """
56
- B, H, W, C = x.shape
57
- x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
58
- windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
59
- return windows
60
-
61
-
62
- def window_reverse(windows, window_size, H, W):
63
- """
64
- Args:
65
- windows: (num_windows*B, window_size, window_size, C)
66
- window_size (int): Window size
67
- H (int): Height of image
68
- W (int): Width of image
69
- Returns:
70
- x: (B, H, W, C)
71
- """
72
- B = int(windows.shape[0] / (H * W / window_size / window_size))
73
- x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
74
- x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
75
- return x
76
-
77
-
78
- class WindowAttention(nn.Module):
79
- """Window based multi-head self attention (W-MSA) module with relative position bias.
80
- It supports both of shifted and non-shifted window.
81
- Args:
82
- dim (int): Number of input channels.
83
- window_size (tuple[int]): The height and width of the window.
84
- num_heads (int): Number of attention heads.
85
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
86
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
87
- attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
88
- proj_drop (float, optional): Dropout ratio of output. Default: 0.0
89
- """
90
-
91
- def __init__(
92
- self,
93
- dim,
94
- window_size,
95
- num_heads,
96
- qkv_bias=True,
97
- qk_scale=None,
98
- attn_drop=0.0,
99
- proj_drop=0.0,
100
- ):
101
-
102
- super().__init__()
103
- self.dim = dim
104
- self.window_size = window_size # Wh, Ww
105
- self.num_heads = num_heads
106
- head_dim = dim // num_heads
107
- self.scale = qk_scale or head_dim**-0.5
108
-
109
- # define a parameter table of relative position bias
110
- self.relative_position_bias_table = nn.Parameter(
111
- torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
112
- ) # 2*Wh-1 * 2*Ww-1, nH
113
-
114
- # get pair-wise relative position index for each token inside the window
115
- coords_h = torch.arange(self.window_size[0])
116
- coords_w = torch.arange(self.window_size[1])
117
- coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
118
- coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
119
- relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
120
- relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
121
- relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
122
- relative_coords[:, :, 1] += self.window_size[1] - 1
123
- relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
124
- relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
125
- self.register_buffer("relative_position_index", relative_position_index)
126
- r = 12
127
- self.qkv = lora.Linear(dim, dim * 3, r=r , bias=qkv_bias)
128
- self.attn_drop = nn.Dropout(attn_drop)
129
- self.proj = lora.Linear(dim, dim , r=r)
130
- self.proj_drop = nn.Dropout(proj_drop)
131
-
132
- trunc_normal_(self.relative_position_bias_table, std=0.02)
133
- self.softmax = nn.Softmax(dim=-1)
134
-
135
- def forward(self, x, mask=None):
136
- """Forward function.
137
- Args:
138
- x: input features with shape of (num_windows*B, N, C)
139
- mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
140
- """
141
- B_, N, C = x.shape
142
- qkv = (
143
- self.qkv(x)
144
- .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
145
- .permute(2, 0, 3, 1, 4)
146
- )
147
- q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
148
-
149
- q = q * self.scale
150
- attn = q @ k.transpose(-2, -1)
151
-
152
- relative_position_bias = self.relative_position_bias_table[
153
- self.relative_position_index.view(-1)
154
- ].view(
155
- self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
156
- ) # Wh*Ww,Wh*Ww,nH
157
- relative_position_bias = relative_position_bias.permute(
158
- 2, 0, 1
159
- ).contiguous() # nH, Wh*Ww, Wh*Ww
160
- attn = attn + relative_position_bias.unsqueeze(0)
161
-
162
- if mask is not None:
163
- nW = mask.shape[0]
164
- attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
165
- attn = attn.view(-1, self.num_heads, N, N)
166
- attn = self.softmax(attn)
167
- else:
168
- attn = self.softmax(attn)
169
-
170
- attn = self.attn_drop(attn)
171
-
172
- x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
173
- x = self.proj(x)
174
- x = self.proj_drop(x)
175
- return x
176
-
177
-
178
- class SwinTransformerBlock(nn.Module):
179
- """Swin Transformer Block.
180
- Args:
181
- dim (int): Number of input channels.
182
- num_heads (int): Number of attention heads.
183
- window_size (int): Window size.
184
- shift_size (int): Shift size for SW-MSA.
185
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
186
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
187
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
188
- drop (float, optional): Dropout rate. Default: 0.0
189
- attn_drop (float, optional): Attention dropout rate. Default: 0.0
190
- drop_path (float, optional): Stochastic depth rate. Default: 0.0
191
- act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
192
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
193
- """
194
-
195
- def __init__(
196
- self,
197
- dim,
198
- num_heads,
199
- window_size=7,
200
- shift_size=0,
201
- mlp_ratio=4.0,
202
- qkv_bias=True,
203
- qk_scale=None,
204
- drop=0.0,
205
- attn_drop=0.0,
206
- drop_path=0.0,
207
- act_layer=nn.GELU,
208
- norm_layer=nn.LayerNorm,
209
- ):
210
- super().__init__()
211
- self.dim = dim
212
- self.num_heads = num_heads
213
- self.window_size = window_size
214
- self.shift_size = shift_size
215
- self.mlp_ratio = mlp_ratio
216
- assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
217
-
218
- self.norm1 = norm_layer(dim)
219
- self.attn = WindowAttention(
220
- dim,
221
- window_size=to_2tuple(self.window_size),
222
- num_heads=num_heads,
223
- qkv_bias=qkv_bias,
224
- qk_scale=qk_scale,
225
- attn_drop=attn_drop,
226
- proj_drop=drop,
227
- )
228
-
229
- self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
230
- self.norm2 = norm_layer(dim)
231
- mlp_hidden_dim = int(dim * mlp_ratio)
232
- self.mlp = Mlp(
233
- in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
234
- )
235
-
236
- self.H = None
237
- self.W = None
238
-
239
- def forward(self, x, mask_matrix):
240
- """Forward function.
241
- Args:
242
- x: Input feature, tensor size (B, H*W, C).
243
- H, W: Spatial resolution of the input feature.
244
- mask_matrix: Attention mask for cyclic shift.
245
- """
246
- B, L, C = x.shape
247
- H, W = self.H, self.W
248
- assert L == H * W, "input feature has wrong size"
249
-
250
- shortcut = x
251
- x = self.norm1(x)
252
- x = x.view(B, H, W, C)
253
-
254
- # pad feature maps to multiples of window size
255
- pad_l = pad_t = 0
256
- pad_r = (self.window_size - W % self.window_size) % self.window_size
257
- pad_b = (self.window_size - H % self.window_size) % self.window_size
258
- x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
259
- _, Hp, Wp, _ = x.shape
260
-
261
- # cyclic shift
262
- if self.shift_size > 0:
263
- shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
264
- attn_mask = mask_matrix
265
- else:
266
- shifted_x = x
267
- attn_mask = None
268
-
269
- # partition windows
270
- x_windows = window_partition(
271
- shifted_x, self.window_size
272
- ) # nW*B, window_size, window_size, C
273
- x_windows = x_windows.view(
274
- -1, self.window_size * self.window_size, C
275
- ) # nW*B, window_size*window_size, C
276
-
277
- # W-MSA/SW-MSA
278
- attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
279
-
280
- # merge windows
281
- attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
282
- shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
283
-
284
- # reverse cyclic shift
285
- if self.shift_size > 0:
286
- x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
287
- else:
288
- x = shifted_x
289
-
290
- if pad_r > 0 or pad_b > 0:
291
- x = x[:, :H, :W, :].contiguous()
292
-
293
- x = x.view(B, H * W, C)
294
-
295
- # FFN
296
- x = shortcut + self.drop_path(x)
297
- x = x + self.drop_path(self.mlp(self.norm2(x)))
298
-
299
- return x
300
-
301
-
302
- class PatchMerging(nn.Module):
303
- """Patch Merging Layer
304
- Args:
305
- dim (int): Number of input channels.
306
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
307
- """
308
-
309
- def __init__(self, dim, norm_layer=nn.LayerNorm):
310
- super().__init__()
311
- r = 24
312
- self.dim = dim
313
- self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
314
- self.norm = norm_layer(4 * dim)
315
-
316
- def forward(self, x, H, W):
317
- """Forward function.
318
- Args:
319
- x: Input feature, tensor size (B, H*W, C).
320
- H, W: Spatial resolution of the input feature.
321
- """
322
- B, L, C = x.shape
323
- assert L == H * W, "input feature has wrong size"
324
-
325
- x = x.view(B, H, W, C)
326
-
327
- # padding
328
- pad_input = (H % 2 == 1) or (W % 2 == 1)
329
- if pad_input:
330
- x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
331
-
332
- x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
333
- x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
334
- x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
335
- x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
336
- x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
337
- x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
338
-
339
- x = self.norm(x)
340
- x = self.reduction(x)
341
-
342
- return x
343
-
344
-
345
- class BasicLayer(nn.Module):
346
- """A basic Swin Transformer layer for one stage.
347
- Args:
348
- dim (int): Number of feature channels
349
- depth (int): Depths of this stage.
350
- num_heads (int): Number of attention head.
351
- window_size (int): Local window size. Default: 7.
352
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
353
- qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
354
- qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
355
- drop (float, optional): Dropout rate. Default: 0.0
356
- attn_drop (float, optional): Attention dropout rate. Default: 0.0
357
- drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
358
- norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
359
- downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
360
- use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
361
- """
362
-
363
- def __init__(
364
- self,
365
- dim,
366
- depth,
367
- num_heads,
368
- window_size=7,
369
- mlp_ratio=4.0,
370
- qkv_bias=True,
371
- qk_scale=None,
372
- drop=0.0,
373
- attn_drop=0.0,
374
- drop_path=0.0,
375
- norm_layer=nn.LayerNorm,
376
- downsample=None,
377
- use_checkpoint=False,
378
- ):
379
- super().__init__()
380
- self.window_size = window_size
381
- self.shift_size = window_size // 2
382
- self.depth = depth
383
- self.use_checkpoint = use_checkpoint
384
-
385
- # build blocks
386
- self.blocks = nn.ModuleList(
387
- [
388
- SwinTransformerBlock(
389
- dim=dim,
390
- num_heads=num_heads,
391
- window_size=window_size,
392
- shift_size=0 if (i % 2 == 0) else window_size // 2,
393
- mlp_ratio=mlp_ratio,
394
- qkv_bias=qkv_bias,
395
- qk_scale=qk_scale,
396
- drop=drop,
397
- attn_drop=attn_drop,
398
- drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
399
- norm_layer=norm_layer,
400
- )
401
- for i in range(depth)
402
- ]
403
- )
404
-
405
- # patch merging layer
406
- if downsample is not None:
407
- self.downsample = downsample(dim=dim, norm_layer=norm_layer)
408
- else:
409
- self.downsample = None
410
-
411
- def forward(self, x, H, W):
412
- """Forward function.
413
- Args:
414
- x: Input feature, tensor size (B, H*W, C).
415
- H, W: Spatial resolution of the input feature.
416
- """
417
-
418
- # calculate attention mask for SW-MSA
419
- Hp = int(np.ceil(H / self.window_size)) * self.window_size
420
- Wp = int(np.ceil(W / self.window_size)) * self.window_size
421
- img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
422
- h_slices = (
423
- slice(0, -self.window_size),
424
- slice(-self.window_size, -self.shift_size),
425
- slice(-self.shift_size, None),
426
- )
427
- w_slices = (
428
- slice(0, -self.window_size),
429
- slice(-self.window_size, -self.shift_size),
430
- slice(-self.shift_size, None),
431
- )
432
- cnt = 0
433
- for h in h_slices:
434
- for w in w_slices:
435
- img_mask[:, h, w, :] = cnt
436
- cnt += 1
437
-
438
- mask_windows = window_partition(
439
- img_mask, self.window_size
440
- ) # nW, window_size, window_size, 1
441
- mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
442
- attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
443
- attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
444
- attn_mask == 0, float(0.0)
445
- )
446
-
447
- for blk in self.blocks:
448
- blk.H, blk.W = H, W
449
- if self.use_checkpoint:
450
- x = checkpoint.checkpoint(blk, x, attn_mask)
451
- else:
452
- x = blk(x, attn_mask)
453
- if self.downsample is not None:
454
- x_down = self.downsample(x, H, W)
455
- Wh, Ww = (H + 1) // 2, (W + 1) // 2
456
- return x, H, W, x_down, Wh, Ww
457
- else:
458
- return x, H, W, x, H, W
459
-
460
-
461
- class PatchEmbed(nn.Module):
462
- """Image to Patch Embedding
463
- Args:
464
- patch_size (int): Patch token size. Default: 4.
465
- in_chans (int): Number of input image channels. Default: 3.
466
- embed_dim (int): Number of linear projection output channels. Default: 96.
467
- norm_layer (nn.Module, optional): Normalization layer. Default: None
468
- """
469
-
470
- def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
471
- super().__init__()
472
- patch_size = to_2tuple(patch_size)
473
- self.patch_size = patch_size
474
-
475
- self.in_chans = in_chans
476
- self.embed_dim = embed_dim
477
-
478
- self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
479
- if norm_layer is not None:
480
- self.norm = norm_layer(embed_dim)
481
- else:
482
- self.norm = None
483
-
484
- def forward(self, x):
485
- """Forward function."""
486
- # padding
487
- _, _, H, W = x.size()
488
- if W % self.patch_size[1] != 0:
489
- x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
490
- if H % self.patch_size[0] != 0:
491
- x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
492
-
493
- x = self.proj(x) # B C Wh Ww
494
- if self.norm is not None:
495
- Wh, Ww = x.size(2), x.size(3)
496
- x = x.flatten(2).transpose(1, 2)
497
- x = self.norm(x)
498
- x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
499
-
500
- return x
501
-
502
-
503
- class SwinTransformer(nn.Module):
504
- """Swin Transformer backbone.
505
- A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
506
- https://arxiv.org/pdf/2103.14030
507
- Args:
508
- pretrain_img_size (int): Input image size for training the pretrained model,
509
- used in absolute postion embedding. Default 224.
510
- patch_size (int | tuple(int)): Patch size. Default: 4.
511
- in_chans (int): Number of input image channels. Default: 3.
512
- embed_dim (int): Number of linear projection output channels. Default: 96.
513
- depths (tuple[int]): Depths of each Swin Transformer stage.
514
- num_heads (tuple[int]): Number of attention head of each stage.
515
- window_size (int): Window size. Default: 7.
516
- mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
517
- qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
518
- qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
519
- drop_rate (float): Dropout rate.
520
- attn_drop_rate (float): Attention dropout rate. Default: 0.
521
- drop_path_rate (float): Stochastic depth rate. Default: 0.2.
522
- norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
523
- ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
524
- patch_norm (bool): If True, add normalization after patch embedding. Default: True.
525
- out_indices (Sequence[int]): Output from which stages.
526
- frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
527
- -1 means not freezing any parameters.
528
- use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
529
- dilation (bool): if True, the output size if 16x downsample, ow 32x downsample.
530
- """
531
-
532
- def __init__(
533
- self,
534
- pretrain_img_size=224,
535
- patch_size=4,
536
- in_chans=3,
537
- embed_dim=96,
538
- depths=[2, 2, 6, 2],
539
- num_heads=[3, 6, 12, 24],
540
- window_size=7,
541
- mlp_ratio=4.0,
542
- qkv_bias=True,
543
- qk_scale=None,
544
- drop_rate=0.0,
545
- attn_drop_rate=0.0,
546
- drop_path_rate=0.2,
547
- norm_layer=nn.LayerNorm,
548
- ape=False,
549
- patch_norm=True,
550
- out_indices=(0, 1, 2, 3),
551
- frozen_stages=-1,
552
- dilation=False,
553
- use_checkpoint=False,
554
- ):
555
- super().__init__()
556
-
557
- self.pretrain_img_size = pretrain_img_size
558
- self.num_layers = len(depths)
559
- self.embed_dim = embed_dim
560
- self.ape = ape
561
- self.patch_norm = patch_norm
562
- self.out_indices = out_indices
563
- self.frozen_stages = frozen_stages
564
- self.dilation = dilation
565
-
566
- # if use_checkpoint:
567
- # print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!")
568
-
569
- # split image into non-overlapping patches
570
- self.patch_embed = PatchEmbed(
571
- patch_size=patch_size,
572
- in_chans=in_chans,
573
- embed_dim=embed_dim,
574
- norm_layer=norm_layer if self.patch_norm else None,
575
- )
576
-
577
- # absolute position embedding
578
- if self.ape:
579
- pretrain_img_size = to_2tuple(pretrain_img_size)
580
- patch_size = to_2tuple(patch_size)
581
- patches_resolution = [
582
- pretrain_img_size[0] // patch_size[0],
583
- pretrain_img_size[1] // patch_size[1],
584
- ]
585
-
586
- self.absolute_pos_embed = nn.Parameter(
587
- torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
588
- )
589
- trunc_normal_(self.absolute_pos_embed, std=0.02)
590
-
591
- self.pos_drop = nn.Dropout(p=drop_rate)
592
-
593
- # stochastic depth
594
- dpr = [
595
- x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
596
- ] # stochastic depth decay rule
597
-
598
- # build layers
599
- self.layers = nn.ModuleList()
600
- # prepare downsample list
601
- downsamplelist = [PatchMerging for i in range(self.num_layers)]
602
- downsamplelist[-1] = None
603
- num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
604
- if self.dilation:
605
- downsamplelist[-2] = None
606
- num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
607
- for i_layer in range(self.num_layers):
608
- layer = BasicLayer(
609
- # dim=int(embed_dim * 2 ** i_layer),
610
- dim=num_features[i_layer],
611
- depth=depths[i_layer],
612
- num_heads=num_heads[i_layer],
613
- window_size=window_size,
614
- mlp_ratio=mlp_ratio,
615
- qkv_bias=qkv_bias,
616
- qk_scale=qk_scale,
617
- drop=drop_rate,
618
- attn_drop=attn_drop_rate,
619
- drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
620
- norm_layer=norm_layer,
621
- # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
622
- downsample=downsamplelist[i_layer],
623
- use_checkpoint=use_checkpoint,
624
- )
625
- self.layers.append(layer)
626
-
627
- # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
628
- self.num_features = num_features
629
-
630
- # add a norm layer for each output
631
- for i_layer in out_indices:
632
- layer = norm_layer(num_features[i_layer])
633
- layer_name = f"norm{i_layer}"
634
- self.add_module(layer_name, layer)
635
-
636
- self._freeze_stages()
637
-
638
- def _freeze_stages(self):
639
- if self.frozen_stages >= 0:
640
- self.patch_embed.eval()
641
- for param in self.patch_embed.parameters():
642
- param.requires_grad = False
643
-
644
- if self.frozen_stages >= 1 and self.ape:
645
- self.absolute_pos_embed.requires_grad = False
646
-
647
- if self.frozen_stages >= 2:
648
- self.pos_drop.eval()
649
- for i in range(0, self.frozen_stages - 1):
650
- m = self.layers[i]
651
- m.eval()
652
- for param in m.parameters():
653
- param.requires_grad = False
654
-
655
- # def init_weights(self, pretrained=None):
656
- # """Initialize the weights in backbone.
657
- # Args:
658
- # pretrained (str, optional): Path to pre-trained weights.
659
- # Defaults to None.
660
- # """
661
-
662
- # def _init_weights(m):
663
- # if isinstance(m, nn.Linear):
664
- # trunc_normal_(m.weight, std=.02)
665
- # if isinstance(m, nn.Linear) and m.bias is not None:
666
- # nn.init.constant_(m.bias, 0)
667
- # elif isinstance(m, nn.LayerNorm):
668
- # nn.init.constant_(m.bias, 0)
669
- # nn.init.constant_(m.weight, 1.0)
670
-
671
- # if isinstance(pretrained, str):
672
- # self.apply(_init_weights)
673
- # logger = get_root_logger()
674
- # load_checkpoint(self, pretrained, strict=False, logger=logger)
675
- # elif pretrained is None:
676
- # self.apply(_init_weights)
677
- # else:
678
- # raise TypeError('pretrained must be a str or None')
679
-
680
- def forward_raw(self, x):
681
- """Forward function."""
682
- x = self.patch_embed(x)
683
-
684
- Wh, Ww = x.size(2), x.size(3)
685
- if self.ape:
686
- # interpolate the position embedding to the corresponding size
687
- absolute_pos_embed = F.interpolate(
688
- self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
689
- )
690
- x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
691
- else:
692
- x = x.flatten(2).transpose(1, 2)
693
- x = self.pos_drop(x)
694
-
695
- outs = []
696
- for i in range(self.num_layers):
697
- layer = self.layers[i]
698
- x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
699
- # import ipdb; ipdb.set_trace()
700
-
701
- if i in self.out_indices:
702
- norm_layer = getattr(self, f"norm{i}")
703
- x_out = norm_layer(x_out)
704
-
705
- out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
706
- outs.append(out)
707
- # in:
708
- # torch.Size([2, 3, 1024, 1024])
709
- # outs:
710
- # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
711
- # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
712
- return tuple(outs)
713
-
714
- def forward(self, tensor_list: NestedTensor):
715
- x = tensor_list.tensors
716
-
717
- """Forward function."""
718
- x = self.patch_embed(x)
719
-
720
- Wh, Ww = x.size(2), x.size(3)
721
- if self.ape:
722
- # interpolate the position embedding to the corresponding size
723
- absolute_pos_embed = F.interpolate(
724
- self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
725
- )
726
- x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
727
- else:
728
- x = x.flatten(2).transpose(1, 2)
729
- x = self.pos_drop(x)
730
-
731
- outs = []
732
- for i in range(self.num_layers):
733
- layer = self.layers[i]
734
- x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
735
-
736
- if i in self.out_indices:
737
- norm_layer = getattr(self, f"norm{i}")
738
- x_out = norm_layer(x_out)
739
-
740
- out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
741
- outs.append(out)
742
- # in:
743
- # torch.Size([2, 3, 1024, 1024])
744
- # out:
745
- # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \
746
- # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])]
747
-
748
- # collect for nesttensors
749
- outs_dict = {}
750
- for idx, out_i in enumerate(outs):
751
- m = tensor_list.mask
752
- assert m is not None
753
- mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0]
754
- outs_dict[idx] = NestedTensor(out_i, mask)
755
-
756
- return outs_dict
757
-
758
- def train(self, mode=True):
759
- """Convert the model into training mode while keep layers freezed."""
760
- super(SwinTransformer, self).train(mode)
761
- self._freeze_stages()
762
-
763
-
764
def build_swin_transformer(modelname, pretrain_img_size, **kw):
    """Instantiate a ``SwinTransformer`` from one of the named presets.

    Args:
        modelname: One of ``"swin_T_224_1k"``, ``"swin_B_224_22k"``,
            ``"swin_B_384_22k"``, ``"swin_L_224_22k"``, ``"swin_L_384_22k"``.
        pretrain_img_size: Forwarded as the ``pretrain_img_size`` keyword.
        **kw: Extra constructor keywords; they override preset values that
            share the same name.

    Returns:
        The constructed ``SwinTransformer`` instance.

    Raises:
        AssertionError: If ``modelname`` is not a known preset.
    """
    presets = {
        "swin_T_224_1k": {
            "embed_dim": 96,
            "depths": [2, 2, 6, 2],
            "num_heads": [3, 6, 12, 24],
            "window_size": 7,
        },
        "swin_B_224_22k": {
            "embed_dim": 128,
            "depths": [2, 2, 18, 2],
            "num_heads": [4, 8, 16, 32],
            "window_size": 7,
        },
        "swin_B_384_22k": {
            "embed_dim": 128,
            "depths": [2, 2, 18, 2],
            "num_heads": [4, 8, 16, 32],
            "window_size": 12,
        },
        "swin_L_224_22k": {
            "embed_dim": 192,
            "depths": [2, 2, 18, 2],
            "num_heads": [6, 12, 24, 48],
            "window_size": 7,
        },
        "swin_L_384_22k": {
            "embed_dim": 192,
            "depths": [2, 2, 18, 2],
            "num_heads": [6, 12, 24, 48],
            "window_size": 12,
        },
    }
    # Membership is checked against the list of preset names (not the dict)
    # so that unhashable arguments also end up as an assertion failure.
    assert modelname in list(presets)

    # Caller-supplied keywords take precedence over the preset values.
    config = {**presets[modelname], **kw}
    return SwinTransformer(pretrain_img_size=pretrain_img_size, **config)
794
-
795
-
796
if __name__ == "__main__":
    # Smoke test: build the largest preset and push two differently sized
    # random batches through the tensor-only forward path. The output shapes
    # are printed instead of dropping into an interactive ipdb breakpoint, so
    # the script runs unattended and does not need a debugger installed.
    model = build_swin_transformer("swin_L_384_22k", 384, dilation=True)
    for side in (1024, 384):
        x = torch.rand(2, 3, side, side)
        y = model.forward_raw(x)
        print(f"input {tuple(x.shape)} -> {[tuple(feat.shape) for feat in y]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (242 Bytes)
 
groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc DELETED
Binary file (6.24 kB)