dianecy committed
Commit 9b855a7 · verified · 1 parent: 5c8ef86

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.

Files changed (50)
  1. .history/datasets/__init___20250113130146.py +38 -0
  2. .history/datasets/ytvos_ref_20250113161625.py +243 -0
  3. .history/datasets/ytvos_ref_20250113161634.py +242 -0
  4. .history/datasets/ytvos_ref_20250113162627.py +242 -0
  5. .history/datasets/ytvos_ref_20250113163106.py +244 -0
  6. .history/datasets/ytvos_ref_20250113163121.py +245 -0
  7. .history/datasets/ytvos_ref_20250113163340.py +249 -0
  8. .history/datasets/ytvos_ref_20250113163347.py +249 -0
  9. .history/datasets/ytvos_ref_20250114202456.py +251 -0
  10. .history/datasets/ytvos_ref_20250114205130.py +250 -0
  11. .history/datasets/ytvos_ref_20250114211235.py +252 -0
  12. .history/datasets/ytvos_ref_20250114211331.py +250 -0
  13. .history/datasets/ytvos_ref_20250114211640.py +242 -0
  14. .history/datasets/ytvos_ref_20250114211841.py +242 -0
  15. .history/datasets/ytvos_ref_20250114212623.py +242 -0
  16. .history/datasets/ytvos_ref_20250116071135.py +240 -0
  17. .history/datasets/ytvos_ref_20250116071255.py +239 -0
  18. .history/datasets/ytvos_ref_20250116071502.py +240 -0
  19. .history/datasets/ytvos_ref_20250116071546.py +240 -0
  20. .history/datasets/ytvos_ref_20250116071553.py +240 -0
  21. .history/datasets/ytvos_ref_20250116071841.py +239 -0
  22. .history/datasets/ytvos_ref_20250116072442.py +241 -0
  23. .history/slurm_script/mbench_ref-ytvos_json_20250113182526.sh +0 -0
  24. LICENSE +201 -0
  25. README.md +214 -0
  26. davis2017/__init__.py +3 -0
  27. davis2017/evaluation.py +110 -0
  28. davis2017/metrics.py +197 -0
  29. docs/A2D-Sentences.md +55 -0
  30. docs/JHMDB-Sentences.md +27 -0
  31. docs/Ref-DAVIS17.md +24 -0
  32. docs/Ref-Youtube-VOS.md +83 -0
  33. docs/data.md +127 -0
  34. engine.py +253 -0
  35. eval_davis.py +68 -0
  36. jptr_chaeyun.txt +179 -0
  37. make_ref-ytvos/annotate_ref_ytvos.py +288 -0
  38. make_ref-ytvos/folder2lmdb.py +109 -0
  39. make_ref-ytvos/manual_selected_frames.jsonl +101 -0
  40. make_ref-ytvos/review_images.ipynb +0 -0
  41. make_ref-ytvos/revised_frames.jsonl +0 -0
  42. make_ref-ytvos/selected_frames.jsonl +0 -0
  43. mbench/result.json +465 -0
  44. models/__init__.py +5 -0
  45. models/backbone.py +132 -0
  46. models/criterion.py +208 -0
  47. models/deformable_transformer.py +444 -0
  48. models/matcher.py +206 -0
  49. models/ops/make.sh +10 -0
  50. models/ops/modules/__init__.py +9 -0
.history/datasets/__init___20250113130146.py ADDED
@@ -0,0 +1,38 @@
+ import torch.utils.data
+ import torchvision
+
+ from .ytvos import build as build_ytvos
+ from .ytvos_ref import build as build_ytvos_ref
+ from .davis import build as build_davis
+ from .a2d import build as build_a2d
+ from .jhmdb import build as build_jhmdb
+ from .refexp import build as build_refexp
+ from .concat_dataset import build as build_joint
+
+
+ def get_coco_api_from_dataset(dataset):
+     for _ in range(10):
+         # if isinstance(dataset, torchvision.datasets.CocoDetection):
+         #     break
+         if isinstance(dataset, torch.utils.data.Subset):
+             dataset = dataset.dataset
+     if isinstance(dataset, torchvision.datasets.CocoDetection):
+         return dataset.coco
+
+
+ def build_dataset(dataset_file: str, image_set: str, args):
+     if dataset_file == 'ytvos':
+         return build_ytvos(image_set, args)
+     if dataset_file == 'davis':
+         return build_davis(image_set, args)
+     if dataset_file == 'a2d':
+         return build_a2d(image_set, args)
+     if dataset_file == 'jhmdb':
+         return build_jhmdb(image_set, args)
+     # for pretraining
+     if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog":
+         return build_refexp(dataset_file, image_set, args)
+     # for joint training of refcoco and ytvos
+     if dataset_file == 'joint':
+         return build_joint(image_set, args)
+     raise ValueError(f'dataset {dataset_file} not supported')
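For orientation, a minimal sketch (not part of this commit) of how the `build_dataset` dispatch above is typically driven. The attribute names on `args` mirror the ones the loaders in this commit read (`ytvos_path`, `masks`, `num_frames`, `max_skip`); the path and values are placeholders.

```python
# Sketch only: building the Ref-Youtube-VOS loader through datasets/__init__.py.
from argparse import Namespace

from datasets import build_dataset  # package added in this commit

args = Namespace(
    ytvos_path='/data/ref-youtube-vos',  # hypothetical local dataset root
    masks=True,
    num_frames=4,
    max_skip=3,
)

train_set = build_dataset('ytvos', image_set='train', args=args)
print(len(train_set))
```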
.history/datasets/ytvos_ref_20250113161625.py ADDED
@@ -0,0 +1,243 @@
+ """
+ Ref-YoutubeVOS data loader
+ """
+ from pathlib import Path
+
+ import torch
+ from torch.autograd.grad_mode import F
+ from torch.utils.data import Dataset
+ import datasets.transforms_video as T
+
+ import os
+ from PIL import Image
+ import json
+ import numpy as np
+ import random
+
+ from datasets.categories import ytvos_category_dict as category_dict
+
+
+ class YTVOSDataset(Dataset):
+     """
+     A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+     "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+     (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+     The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+     'first-frame' subset is no longer publicly available; only the harder 'full-video' subset can be downloaded
+     through the Youtube-VOS referring video object segmentation competition page at:
+     https://competitions.codalab.org/competitions/29139
+     Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+     two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+     currently only be done on the competition 'validation' subset using the competition's server, as
+     annotations were publicly released only for the 'train' subset of the competition.
+
+     """
+     def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                  num_frames: int, max_skip: int):
+         self.img_folder = img_folder
+         self.ann_file = ann_file
+         self._transforms = transforms
+         self.return_masks = return_masks  # not used
+         self.num_frames = num_frames
+         self.max_skip = max_skip
+         # create video meta data
+         self.prepare_metas()
+
+         print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+         print('\n')
+
+     def prepare_metas(self):
+         # read object information
+         with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+             subset_metas_by_video = json.load(f)['videos']
+
+         # read expression data
+         with open(str(self.ann_file), 'r') as f:
+             subset_expressions_by_video = json.load(f)['videos']
+         self.videos = list(subset_expressions_by_video.keys())
+
+         self.metas = []
+         for vid in self.videos:
+             vid_meta = subset_metas_by_video[vid]
+             vid_data = subset_expressions_by_video[vid]
+             vid_frames = sorted(vid_data['frames'])
+             vid_len = len(vid_frames)
+
+
+             for exp_id, exp_dict in vid_data['expressions'].items():
+                 # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                 start_idx, end_idx = 2, vid_len - 2
+                 bin_size = (end_idx - start_idx) // 4
+
+                 bins = []
+                 for i in range(4):
+                     bin_start = start_idx + i * bin_size
+                     bin_end = bin_start + bin_size if i < 3 else end_idx
+
+                     bins.append((bin_start, bin_end))
+
+                 for bin_id in range(len(bins)):
+                     start_idx, end_idx = bins[bin_id]
+                     frame_id = random.randint(start_idx, end_idx - 1)
+
+                     meta = {
+                         'video': vid,
+                         'exp': exp_dict['exp'],
+                         'obj_id': int(exp_dict['obj_id']),
+                         'frames': vid_frames,
+                         'frame_id': frame_id,
+                         'bins': bins,
+                         'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                     }
+                     self.metas.append(meta)
+
+
+     @staticmethod
+     def bounding_box(img):
+         rows = np.any(img, axis=1)
+         cols = np.any(img, axis=0)
+         rmin, rmax = np.where(rows)[0][[0, -1]]
+         cmin, cmax = np.where(cols)[0][[0, -1]]
+         return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+     def __len__(self):
+         return len(self.metas)
+
+     def __getitem__(self, idx):
+         instance_check = False
+         while not instance_check:
+             meta = self.metas[idx]  # dict
+
+
+             video, exp, obj_id, category, frames, bins = \
+                 meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
+
+
+             # clean up the caption
+             exp = " ".join(exp.lower().split())
+             category_id = category_dict[category]
+             vid_len = len(frames)
+
+             # num_frames = self.num_frames
+
+             # Random sample one frame from each bin
+             sample_indx = []
+             for start_idx, end_idx in bins:
+                 sample_indx.append(random.randint(start_idx, end_idx - 1))
+             sample_indx.sort()  # Ensure indices are in order
+
+             # read frames and masks
+             imgs, labels, boxes, masks, valid = [], [], [], [], []
+             for frame_indx in sample_indx:
+                 frame_name = frames[frame_indx]
+                 img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                 mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                 img = Image.open(img_path).convert('RGB')
+                 mask = Image.open(mask_path).convert('P')
+
+                 # create the target
+                 label = torch.tensor(category_id)
+                 mask = np.array(mask)
+                 mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
+                 if (mask > 0).any():
+                     y1, y2, x1, x2 = self.bounding_box(mask)
+                     box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                     valid.append(1)
+                 else:  # some frame didn't contain the instance
+                     box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                     valid.append(0)
+                 mask = torch.from_numpy(mask)
+
+                 # append
+                 imgs.append(img)
+                 labels.append(label)
+                 masks.append(mask)
+                 boxes.append(box)
+
+             # transform
+             w, h = img.size
+             labels = torch.stack(labels, dim=0)
+             boxes = torch.stack(boxes, dim=0)
+             boxes[:, 0::2].clamp_(min=0, max=w)
+             boxes[:, 1::2].clamp_(min=0, max=h)
+             masks = torch.stack(masks, dim=0)
+             target = {
+                 'frames_idx': torch.tensor(sample_indx),  # [T,]
+                 'labels': labels,  # [T,]
+                 'boxes': boxes,  # [T, 4], xyxy
+                 'masks': masks,  # [T, H, W]
+                 'valid': torch.tensor(valid),  # [T,]
+                 'caption': exp,
+                 'orig_size': torch.as_tensor([int(h), int(w)]),
+                 'size': torch.as_tensor([int(h), int(w)])
+             }
+
+             # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+             if self._transforms:
+                 imgs, target = self._transforms(imgs, target)
+                 imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+             else:
+                 imgs = np.array(imgs)
+                 imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+
+             # FIXME: handle "valid", since some box may be removed due to random crop
+             if torch.any(target['valid'] == 1):  # at least one instance
+                 instance_check = True
+             else:
+                 idx = random.randint(0, self.__len__() - 1)
+
+         return imgs, target
+
+
+ def make_coco_transforms(image_set, max_size=640):
+     normalize = T.Compose([
+         T.ToTensor(),
+         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+     if image_set == 'train':
+         return T.Compose([
+             T.RandomHorizontalFlip(),
+             T.PhotometricDistort(),
+             T.RandomSelect(
+                 T.Compose([
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ]),
+                 T.Compose([
+                     T.RandomResize([400, 500, 600]),
+                     T.RandomSizeCrop(384, 600),
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ])
+             ),
+             normalize,
+         ])
+
+     # we do not use the 'val' set since the annotations are inaccessible
+     if image_set == 'val':
+         return T.Compose([
+             T.RandomResize([360], max_size=640),
+             normalize,
+         ])
+
+     raise ValueError(f'unknown {image_set}')
+
+
+ def build(image_set, args):
+     root = Path(args.ytvos_path)
+     assert root.exists(), f'provided YTVOS path {root} does not exist'
+     PATHS = {
+         "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+         "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
+     }
+     img_folder, ann_file = PATHS[image_set]
+     # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+     #                        num_frames=args.num_frames, max_skip=args.max_skip)
+     dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                            num_frames=args.num_frames, max_skip=args.max_skip)
+     return dataset
+
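The sampling strategy in this loader drops the first and last two frames, splits the remaining indices into four equal bins, and draws one frame per bin. A self-contained sketch of just that logic (not from the commit), so it can be sanity-checked without the dataset on disk; the assert mirrors the "skip videos shorter than 11 frames" guard added in the later snapshots below.

```python
import random

def sample_four_frames(vid_len, num_bins=4):
    """Mirror of the bin-based sampling in prepare_metas/__getitem__: drop the
    first and last two frames, split the rest into num_bins equal bins, and
    draw one random frame index from each bin."""
    assert vid_len >= 11, "later snapshots skip videos shorter than 11 frames"
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // num_bins

    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))

    # randint is inclusive on both ends, so end - 1 keeps each bin end exclusive
    sample_indx = [random.randint(b_start, b_end - 1) for b_start, b_end in bins]
    sample_indx.sort()
    return sample_indx

print(sample_four_frames(36))  # e.g. [5, 13, 20, 29]
```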
.history/datasets/ytvos_ref_20250113161634.py ADDED
@@ -0,0 +1,242 @@
+ """
+ Ref-YoutubeVOS data loader
+ """
+ from pathlib import Path
+
+ import torch
+ from torch.autograd.grad_mode import F
+ from torch.utils.data import Dataset
+ import datasets.transforms_video as T
+
+ import os
+ from PIL import Image
+ import json
+ import numpy as np
+ import random
+
+ from datasets.categories import ytvos_category_dict as category_dict
+
+
+ class YTVOSDataset(Dataset):
+     """
+     A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+     "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+     (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+     The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+     'first-frame' subset is no longer publicly available; only the harder 'full-video' subset can be downloaded
+     through the Youtube-VOS referring video object segmentation competition page at:
+     https://competitions.codalab.org/competitions/29139
+     Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+     two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+     currently only be done on the competition 'validation' subset using the competition's server, as
+     annotations were publicly released only for the 'train' subset of the competition.
+
+     """
+     def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                  num_frames: int, max_skip: int):
+         self.img_folder = img_folder
+         self.ann_file = ann_file
+         self._transforms = transforms
+         self.return_masks = return_masks  # not used
+         self.num_frames = num_frames
+         self.max_skip = max_skip
+         # create video meta data
+         self.prepare_metas()
+
+         print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+         print('\n')
+
+     def prepare_metas(self):
+         # read object information
+         with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+             subset_metas_by_video = json.load(f)['videos']
+
+         # read expression data
+         with open(str(self.ann_file), 'r') as f:
+             subset_expressions_by_video = json.load(f)['videos']
+         self.videos = list(subset_expressions_by_video.keys())
+
+         self.metas = []
+         for vid in self.videos:
+             vid_meta = subset_metas_by_video[vid]
+             vid_data = subset_expressions_by_video[vid]
+             vid_frames = sorted(vid_data['frames'])
+             vid_len = len(vid_frames)
+
+             for exp_id, exp_dict in vid_data['expressions'].items():
+                 # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                 start_idx, end_idx = 2, vid_len - 2
+                 bin_size = (end_idx - start_idx) // 4
+
+                 bins = []
+                 for i in range(4):
+                     bin_start = start_idx + i * bin_size
+                     bin_end = bin_start + bin_size if i < 3 else end_idx
+
+                     bins.append((bin_start, bin_end))
+
+                 for bin_id in range(len(bins)):
+                     start_idx, end_idx = bins[bin_id]
+                     frame_id = random.randint(start_idx, end_idx - 1)
+
+                     meta = {
+                         'video': vid,
+                         'exp': exp_dict['exp'],
+                         'obj_id': int(exp_dict['obj_id']),
+                         'frames': vid_frames,
+                         'frame_id': frame_id,
+                         'bins': bins,
+                         'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                     }
+                     self.metas.append(meta)
+
+
+     @staticmethod
+     def bounding_box(img):
+         rows = np.any(img, axis=1)
+         cols = np.any(img, axis=0)
+         rmin, rmax = np.where(rows)[0][[0, -1]]
+         cmin, cmax = np.where(cols)[0][[0, -1]]
+         return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+     def __len__(self):
+         return len(self.metas)
+
+     def __getitem__(self, idx):
+         instance_check = False
+         while not instance_check:
+             meta = self.metas[idx]  # dict
+
+
+             video, exp, obj_id, category, frames, bins = \
+                 meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
+
+
+             # clean up the caption
+             exp = " ".join(exp.lower().split())
+             category_id = category_dict[category]
+             vid_len = len(frames)
+
+             # num_frames = self.num_frames
+
+             # Random sample one frame from each bin
+             sample_indx = []
+             for start_idx, end_idx in bins:
+                 sample_indx.append(random.randint(start_idx, end_idx - 1))
+             sample_indx.sort()  # Ensure indices are in order
+
+             # read frames and masks
+             imgs, labels, boxes, masks, valid = [], [], [], [], []
+             for frame_indx in sample_indx:
+                 frame_name = frames[frame_indx]
+                 img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                 mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                 img = Image.open(img_path).convert('RGB')
+                 mask = Image.open(mask_path).convert('P')
+
+                 # create the target
+                 label = torch.tensor(category_id)
+                 mask = np.array(mask)
+                 mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
+                 if (mask > 0).any():
+                     y1, y2, x1, x2 = self.bounding_box(mask)
+                     box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                     valid.append(1)
+                 else:  # some frame didn't contain the instance
+                     box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                     valid.append(0)
+                 mask = torch.from_numpy(mask)
+
+                 # append
+                 imgs.append(img)
+                 labels.append(label)
+                 masks.append(mask)
+                 boxes.append(box)
+
+             # transform
+             w, h = img.size
+             labels = torch.stack(labels, dim=0)
+             boxes = torch.stack(boxes, dim=0)
+             boxes[:, 0::2].clamp_(min=0, max=w)
+             boxes[:, 1::2].clamp_(min=0, max=h)
+             masks = torch.stack(masks, dim=0)
+             target = {
+                 'frames_idx': torch.tensor(sample_indx),  # [T,]
+                 'labels': labels,  # [T,]
+                 'boxes': boxes,  # [T, 4], xyxy
+                 'masks': masks,  # [T, H, W]
+                 'valid': torch.tensor(valid),  # [T,]
+                 'caption': exp,
+                 'orig_size': torch.as_tensor([int(h), int(w)]),
+                 'size': torch.as_tensor([int(h), int(w)])
+             }
+
+             # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+             if self._transforms:
+                 imgs, target = self._transforms(imgs, target)
+                 imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+             else:
+                 imgs = np.array(imgs)
+                 imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+
+             # FIXME: handle "valid", since some box may be removed due to random crop
+             if torch.any(target['valid'] == 1):  # at least one instance
+                 instance_check = True
+             else:
+                 idx = random.randint(0, self.__len__() - 1)
+
+         return imgs, target
+
+
+ def make_coco_transforms(image_set, max_size=640):
+     normalize = T.Compose([
+         T.ToTensor(),
+         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+     if image_set == 'train':
+         return T.Compose([
+             T.RandomHorizontalFlip(),
+             T.PhotometricDistort(),
+             T.RandomSelect(
+                 T.Compose([
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ]),
+                 T.Compose([
+                     T.RandomResize([400, 500, 600]),
+                     T.RandomSizeCrop(384, 600),
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ])
+             ),
+             normalize,
+         ])
+
+     # we do not use the 'val' set since the annotations are inaccessible
+     if image_set == 'val':
+         return T.Compose([
+             T.RandomResize([360], max_size=640),
+             normalize,
+         ])
+
+     raise ValueError(f'unknown {image_set}')
+
+
+ def build(image_set, args):
+     root = Path(args.ytvos_path)
+     assert root.exists(), f'provided YTVOS path {root} does not exist'
+     PATHS = {
+         "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+         "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
+     }
+     img_folder, ann_file = PATHS[image_set]
+     # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+     #                        num_frames=args.num_frames, max_skip=args.max_skip)
+     dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                            num_frames=args.num_frames, max_skip=args.max_skip)
+     return dataset
+
.history/datasets/ytvos_ref_20250113162627.py ADDED
@@ -0,0 +1,242 @@
+ """
+ Ref-YoutubeVOS data loader
+ """
+ from pathlib import Path
+
+ import torch
+ from torch.autograd.grad_mode import F
+ from torch.utils.data import Dataset
+ import datasets.transforms_video as T
+
+ import os
+ from PIL import Image
+ import json
+ import numpy as np
+ import random
+
+ from datasets.categories import ytvos_category_dict as category_dict
+
+
+ class YTVOSDataset(Dataset):
+     """
+     A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+     "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+     (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+     The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+     'first-frame' subset is no longer publicly available; only the harder 'full-video' subset can be downloaded
+     through the Youtube-VOS referring video object segmentation competition page at:
+     https://competitions.codalab.org/competitions/29139
+     Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+     two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+     currently only be done on the competition 'validation' subset using the competition's server, as
+     annotations were publicly released only for the 'train' subset of the competition.
+
+     """
+     def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                  num_frames: int, max_skip: int):
+         self.img_folder = img_folder
+         self.ann_file = ann_file
+         self._transforms = transforms
+         self.return_masks = return_masks  # not used
+         self.num_frames = num_frames
+         self.max_skip = max_skip
+         # create video meta data
+         self.prepare_metas()
+
+         print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+         print('\n')
+
+     def prepare_metas(self):
+         # read object information
+         with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+             subset_metas_by_video = json.load(f)['videos']
+
+         # read expression data
+         with open(str(self.ann_file), 'r') as f:
+             subset_expressions_by_video = json.load(f)['videos']
+         self.videos = list(subset_expressions_by_video.keys())
+
+         self.metas = []
+         for vid in self.videos:
+             vid_meta = subset_metas_by_video[vid]
+             vid_data = subset_expressions_by_video[vid]
+             vid_frames = sorted(vid_data['frames'])
+             vid_len = len(vid_frames)
+
+             for exp_id, exp_dict in vid_data['expressions'].items():
+                 # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                 start_idx, end_idx = 2, vid_len - 2
+                 bin_size = (end_idx - start_idx) // 4
+
+                 bins = []
+                 for i in range(4):
+                     bin_start = start_idx + i * bin_size
+                     bin_end = bin_start + bin_size if i < 3 else end_idx
+
+                     bins.append((bin_start, bin_end))
+
+                 # Random sample one frame from each bin
+                 sample_indx = []
+                 for start_idx, end_idx in bins:
+                     print(start_idx, end_idx)
+                     sample_indx.append(random.randint(start_idx, end_idx - 1))
+                 sample_indx.sort()  # Ensure indices are in order
+
+
+                 for frame_id in sample_indx:
+                     meta = {
+                         'video': vid,
+                         'exp': exp_dict['exp'],
+                         'obj_id': int(exp_dict['obj_id']),
+                         'frames': vid_frames,
+                         'frame_id': frame_id,
+                         'sample_frames_id': sample_indx,
+                         'bins': bins,
+                         'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                     }
+                     self.metas.append(meta)
+
+
+     @staticmethod
+     def bounding_box(img):
+         rows = np.any(img, axis=1)
+         cols = np.any(img, axis=0)
+         rmin, rmax = np.where(rows)[0][[0, -1]]
+         cmin, cmax = np.where(cols)[0][[0, -1]]
+         return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+     def __len__(self):
+         return len(self.metas)
+
+     def __getitem__(self, idx):
+         instance_check = False
+         while not instance_check:
+             meta = self.metas[idx]  # dict
+
+
+             video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
+                 meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
+
+
+             # clean up the caption
+             exp = " ".join(exp.lower().split())
+             category_id = category_dict[category]
+             vid_len = len(frames)
+
+             # num_frames = self.num_frames
+
+             # read frames and masks
+             imgs, labels, boxes, masks, valid = [], [], [], [], []
+             for frame_indx in sample_frames_id:
+                 frame_name = frames[frame_indx]
+                 img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                 mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                 img = Image.open(img_path).convert('RGB')
+                 mask = Image.open(mask_path).convert('P')
+
+                 # create the target
+                 label = torch.tensor(category_id)
+                 mask = np.array(mask)
+                 mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
+                 if (mask > 0).any():
+                     y1, y2, x1, x2 = self.bounding_box(mask)
+                     box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                     valid.append(1)
+                 else:  # some frame didn't contain the instance
+                     box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                     valid.append(0)
+                 mask = torch.from_numpy(mask)
+
+                 # append
+                 imgs.append(img)
+                 labels.append(label)
+                 masks.append(mask)
+                 boxes.append(box)
+
+             # transform
+             w, h = img.size
+             labels = torch.stack(labels, dim=0)
+             boxes = torch.stack(boxes, dim=0)
+             boxes[:, 0::2].clamp_(min=0, max=w)
+             boxes[:, 1::2].clamp_(min=0, max=h)
+             masks = torch.stack(masks, dim=0)
+             target = {
+                 'frames_idx': torch.tensor(sample_frames_id),  # [T,]
+                 'labels': labels,  # [T,]
+                 'boxes': boxes,  # [T, 4], xyxy
+                 'masks': masks,  # [T, H, W]
+                 'valid': torch.tensor(valid),  # [T,]
+                 'caption': exp,
+                 'orig_size': torch.as_tensor([int(h), int(w)]),
+                 'size': torch.as_tensor([int(h), int(w)])
+             }
+
+             # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+             if self._transforms:
+                 imgs, target = self._transforms(imgs, target)
+                 imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+             else:
+                 imgs = np.array(imgs)
+                 imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+
+             # FIXME: handle "valid", since some box may be removed due to random crop
+             if torch.any(target['valid'] == 1):  # at least one instance
+                 instance_check = True
+             else:
+                 idx = random.randint(0, self.__len__() - 1)
+
+         return imgs, target
+
+
+ def make_coco_transforms(image_set, max_size=640):
+     normalize = T.Compose([
+         T.ToTensor(),
+         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+     if image_set == 'train':
+         return T.Compose([
+             T.RandomHorizontalFlip(),
+             T.PhotometricDistort(),
+             T.RandomSelect(
+                 T.Compose([
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ]),
+                 T.Compose([
+                     T.RandomResize([400, 500, 600]),
+                     T.RandomSizeCrop(384, 600),
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ])
+             ),
+             normalize,
+         ])
+
+     # we do not use the 'val' set since the annotations are inaccessible
+     if image_set == 'val':
+         return T.Compose([
+             T.RandomResize([360], max_size=640),
+             normalize,
+         ])
+
+     raise ValueError(f'unknown {image_set}')
+
+
+ def build(image_set, args):
+     root = Path(args.ytvos_path)
+     assert root.exists(), f'provided YTVOS path {root} does not exist'
+     PATHS = {
+         "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+         "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
+     }
+     img_folder, ann_file = PATHS[image_set]
+     # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+     #                        num_frames=args.num_frames, max_skip=args.max_skip)
+     dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                            num_frames=args.num_frames, max_skip=args.max_skip)
+     return dataset
+
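The `bounding_box` helper above reduces a binary mask to a tight box with two `np.any` projections; note that `__getitem__` only calls it when the mask is non-empty, since indexing `np.where` on an all-zero mask would fail. A tiny standalone check (not from the commit, toy values):

```python
import numpy as np

def bounding_box(img):
    """Same logic as YTVOSDataset.bounding_box: tight box around the
    non-zero region of a binary mask, returned as (y1, y2, x1, x2)."""
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return rmin, rmax, cmin, cmax

# toy 6x8 mask with a 2x3 blob of ones
mask = np.zeros((6, 8), dtype=np.float32)
mask[2:4, 3:6] = 1.0
y1, y2, x1, x2 = bounding_box(mask)
print(y1, y2, x1, x2)      # 2 3 3 5
box = [x1, y1, x2, y2]     # reordered to xyxy, as in __getitem__
```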
.history/datasets/ytvos_ref_20250113163106.py ADDED
@@ -0,0 +1,244 @@
+ """
+ Ref-YoutubeVOS data loader
+ """
+ from pathlib import Path
+
+ import torch
+ from torch.autograd.grad_mode import F
+ from torch.utils.data import Dataset
+ import datasets.transforms_video as T
+
+ import os
+ from PIL import Image
+ import json
+ import numpy as np
+ import random
+
+ from datasets.categories import ytvos_category_dict as category_dict
+
+
+ class YTVOSDataset(Dataset):
+     """
+     A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+     "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+     (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+     The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+     'first-frame' subset is no longer publicly available; only the harder 'full-video' subset can be downloaded
+     through the Youtube-VOS referring video object segmentation competition page at:
+     https://competitions.codalab.org/competitions/29139
+     Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+     two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+     currently only be done on the competition 'validation' subset using the competition's server, as
+     annotations were publicly released only for the 'train' subset of the competition.
+
+     """
+     def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                  num_frames: int, max_skip: int):
+         self.img_folder = img_folder
+         self.ann_file = ann_file
+         self._transforms = transforms
+         self.return_masks = return_masks  # not used
+         self.num_frames = num_frames
+         self.max_skip = max_skip
+         # create video meta data
+         self.prepare_metas()
+
+         print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+         print('\n')
+
+     def prepare_metas(self):
+         # read object information
+         with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+             subset_metas_by_video = json.load(f)['videos']
+
+         # read expression data
+         with open(str(self.ann_file), 'r') as f:
+             subset_expressions_by_video = json.load(f)['videos']
+         self.videos = list(subset_expressions_by_video.keys())
+
+         self.metas = []
+         for vid in self.videos:
+             vid_meta = subset_metas_by_video[vid]
+             vid_data = subset_expressions_by_video[vid]
+             vid_frames = sorted(vid_data['frames'])
+             vid_len = len(vid_frames)
+
+             if vid_len < 11:
+                 continue
+
+             for exp_id, exp_dict in vid_data['expressions'].items():
+                 # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                 start_idx, end_idx = 2, vid_len - 2
+                 bin_size = (end_idx - start_idx) // 4
+
+                 bins = []
+                 for i in range(4):
+                     bin_start = start_idx + i * bin_size
+                     bin_end = bin_start + bin_size if i < 3 else end_idx
+
+                     bins.append((bin_start, bin_end))
+
+                 # Random sample one frame from each bin
+                 sample_indx = []
+                 for start_idx, end_idx in bins:
+                     sample_indx.append(random.randint(start_idx, end_idx - 1))
+                 sample_indx.sort()  # Ensure indices are in order
+
+
+                 for frame_id in sample_indx:
+                     meta = {
+                         'video': vid,
+                         'exp': exp_dict['exp'],
+                         'obj_id': int(exp_dict['obj_id']),
+                         'frames': vid_frames,
+                         'frame_id': frame_id,
+                         'sample_frames_id': sample_indx,
+                         'bins': bins,
+                         'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                     }
+                     self.metas.append(meta)
+
+
+     @staticmethod
+     def bounding_box(img):
+         rows = np.any(img, axis=1)
+         cols = np.any(img, axis=0)
+         rmin, rmax = np.where(rows)[0][[0, -1]]
+         cmin, cmax = np.where(cols)[0][[0, -1]]
+         return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+     def __len__(self):
+         return len(self.metas)
+
+     def __getitem__(self, idx):
+         instance_check = False
+         while not instance_check:
+             meta = self.metas[idx]  # dict
+
+
+             video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
+                 meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
+
+
+             # clean up the caption
+             exp = " ".join(exp.lower().split())
+             category_id = category_dict[category]
+             vid_len = len(frames)
+
+             # num_frames = self.num_frames
+
+             # read frames and masks
+             imgs, labels, boxes, masks, valid = [], [], [], [], []
+             for frame_indx in sample_frames_id:
+                 frame_name = frames[frame_indx]
+                 img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                 mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                 img = Image.open(img_path).convert('RGB')
+                 mask = Image.open(mask_path).convert('P')
+
+                 # create the target
+                 label = torch.tensor(category_id)
+                 mask = np.array(mask)
+                 mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
+                 if (mask > 0).any():
+                     y1, y2, x1, x2 = self.bounding_box(mask)
+                     box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                     valid.append(1)
+                 else:  # some frame didn't contain the instance
+                     box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                     valid.append(0)
+                 mask = torch.from_numpy(mask)
+
+                 # append
+                 imgs.append(img)
+                 labels.append(label)
+                 masks.append(mask)
+                 boxes.append(box)
+
+             # transform
+             w, h = img.size
+             labels = torch.stack(labels, dim=0)
+             boxes = torch.stack(boxes, dim=0)
+             boxes[:, 0::2].clamp_(min=0, max=w)
+             boxes[:, 1::2].clamp_(min=0, max=h)
+             masks = torch.stack(masks, dim=0)
+             target = {
+                 'frames_idx': torch.tensor(sample_frames_id),  # [T,]
+                 'labels': labels,  # [T,]
+                 'boxes': boxes,  # [T, 4], xyxy
+                 'masks': masks,  # [T, H, W]
+                 'valid': torch.tensor(valid),  # [T,]
+                 'caption': exp,
+                 'orig_size': torch.as_tensor([int(h), int(w)]),
+                 'size': torch.as_tensor([int(h), int(w)])
+             }
+
+             # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+             if self._transforms:
+                 imgs, target = self._transforms(imgs, target)
+                 imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+             else:
+                 imgs = np.array(imgs)
+                 imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+
+             # FIXME: handle "valid", since some box may be removed due to random crop
+             if torch.any(target['valid'] == 1):  # at least one instance
+                 instance_check = True
+             else:
+                 idx = random.randint(0, self.__len__() - 1)
+
+         return imgs, target
+
+
+ def make_coco_transforms(image_set, max_size=640):
+     normalize = T.Compose([
+         T.ToTensor(),
+         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+     if image_set == 'train':
+         return T.Compose([
+             T.RandomHorizontalFlip(),
+             T.PhotometricDistort(),
+             T.RandomSelect(
+                 T.Compose([
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ]),
+                 T.Compose([
+                     T.RandomResize([400, 500, 600]),
+                     T.RandomSizeCrop(384, 600),
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ])
+             ),
+             normalize,
+         ])
+
+     # we do not use the 'val' set since the annotations are inaccessible
+     if image_set == 'val':
+         return T.Compose([
+             T.RandomResize([360], max_size=640),
+             normalize,
+         ])
+
+     raise ValueError(f'unknown {image_set}')
+
+
+ def build(image_set, args):
+     root = Path(args.ytvos_path)
+     assert root.exists(), f'provided YTVOS path {root} does not exist'
+     PATHS = {
+         "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+         "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
+     }
+     img_folder, ann_file = PATHS[image_set]
+     # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+     #                        num_frames=args.num_frames, max_skip=args.max_skip)
+     dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                            num_frames=args.num_frames, max_skip=args.max_skip)
+     return dataset
+
.history/datasets/ytvos_ref_20250113163121.py ADDED
@@ -0,0 +1,245 @@
+ """
+ Ref-YoutubeVOS data loader
+ """
+ from pathlib import Path
+
+ import torch
+ from torch.autograd.grad_mode import F
+ from torch.utils.data import Dataset
+ import datasets.transforms_video as T
+
+ import os
+ from PIL import Image
+ import json
+ import numpy as np
+ import random
+
+ from datasets.categories import ytvos_category_dict as category_dict
+
+
+ class YTVOSDataset(Dataset):
+     """
+     A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+     "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+     (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+     The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+     'first-frame' subset is no longer publicly available; only the harder 'full-video' subset can be downloaded
+     through the Youtube-VOS referring video object segmentation competition page at:
+     https://competitions.codalab.org/competitions/29139
+     Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+     two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+     currently only be done on the competition 'validation' subset using the competition's server, as
+     annotations were publicly released only for the 'train' subset of the competition.
+
+     """
+     def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                  num_frames: int, max_skip: int):
+         self.img_folder = img_folder
+         self.ann_file = ann_file
+         self._transforms = transforms
+         self.return_masks = return_masks  # not used
+         self.num_frames = num_frames
+         self.max_skip = max_skip
+         # create video meta data
+         self.prepare_metas()
+
+         print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+         print('\n')
+
+     def prepare_metas(self):
+         # read object information
+         with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+             subset_metas_by_video = json.load(f)['videos']
+
+         # read expression data
+         with open(str(self.ann_file), 'r') as f:
+             subset_expressions_by_video = json.load(f)['videos']
+         self.videos = list(subset_expressions_by_video.keys())
+
+         self.metas = []
+         for vid in self.videos:
+             vid_meta = subset_metas_by_video[vid]
+             vid_data = subset_expressions_by_video[vid]
+             vid_frames = sorted(vid_data['frames'])
+             vid_len = len(vid_frames)
+
+             if vid_len < 11:
+                 print(f"Too short video: {vid}")
+                 continue
+
+             for exp_id, exp_dict in vid_data['expressions'].items():
+                 # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                 start_idx, end_idx = 2, vid_len - 2
+                 bin_size = (end_idx - start_idx) // 4
+
+                 bins = []
+                 for i in range(4):
+                     bin_start = start_idx + i * bin_size
+                     bin_end = bin_start + bin_size if i < 3 else end_idx
+
+                     bins.append((bin_start, bin_end))
+
+                 # Random sample one frame from each bin
+                 sample_indx = []
+                 for start_idx, end_idx in bins:
+                     sample_indx.append(random.randint(start_idx, end_idx - 1))
+                 sample_indx.sort()  # Ensure indices are in order
+
+
+                 for frame_id in sample_indx:
+                     meta = {
+                         'video': vid,
+                         'exp': exp_dict['exp'],
+                         'obj_id': int(exp_dict['obj_id']),
+                         'frames': vid_frames,
+                         'frame_id': frame_id,
+                         'sample_frames_id': sample_indx,
+                         'bins': bins,
+                         'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                     }
+                     self.metas.append(meta)
+
+
+     @staticmethod
+     def bounding_box(img):
+         rows = np.any(img, axis=1)
+         cols = np.any(img, axis=0)
+         rmin, rmax = np.where(rows)[0][[0, -1]]
+         cmin, cmax = np.where(cols)[0][[0, -1]]
+         return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+     def __len__(self):
+         return len(self.metas)
+
+     def __getitem__(self, idx):
+         instance_check = False
+         while not instance_check:
+             meta = self.metas[idx]  # dict
+
+
+             video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
+                 meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
+
+
+             # clean up the caption
+             exp = " ".join(exp.lower().split())
+             category_id = category_dict[category]
+             vid_len = len(frames)
+
+             # num_frames = self.num_frames
+
+             # read frames and masks
+             imgs, labels, boxes, masks, valid = [], [], [], [], []
+             for frame_indx in sample_frames_id:
+                 frame_name = frames[frame_indx]
+                 img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                 mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                 img = Image.open(img_path).convert('RGB')
+                 mask = Image.open(mask_path).convert('P')
+
+                 # create the target
+                 label = torch.tensor(category_id)
+                 mask = np.array(mask)
+                 mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
+                 if (mask > 0).any():
+                     y1, y2, x1, x2 = self.bounding_box(mask)
+                     box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                     valid.append(1)
+                 else:  # some frame didn't contain the instance
+                     box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                     valid.append(0)
+                 mask = torch.from_numpy(mask)
+
+                 # append
+                 imgs.append(img)
+                 labels.append(label)
+                 masks.append(mask)
+                 boxes.append(box)
+
+             # transform
+             w, h = img.size
+             labels = torch.stack(labels, dim=0)
+             boxes = torch.stack(boxes, dim=0)
+             boxes[:, 0::2].clamp_(min=0, max=w)
+             boxes[:, 1::2].clamp_(min=0, max=h)
+             masks = torch.stack(masks, dim=0)
+             target = {
+                 'frames_idx': torch.tensor(sample_frames_id),  # [T,]
+                 'labels': labels,  # [T,]
+                 'boxes': boxes,  # [T, 4], xyxy
+                 'masks': masks,  # [T, H, W]
+                 'valid': torch.tensor(valid),  # [T,]
+                 'caption': exp,
+                 'orig_size': torch.as_tensor([int(h), int(w)]),
+                 'size': torch.as_tensor([int(h), int(w)])
+             }
+
+             # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+             if self._transforms:
+                 imgs, target = self._transforms(imgs, target)
+                 imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+             else:
+                 imgs = np.array(imgs)
+                 imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+
+             # FIXME: handle "valid", since some box may be removed due to random crop
+             if torch.any(target['valid'] == 1):  # at least one instance
+                 instance_check = True
+             else:
+                 idx = random.randint(0, self.__len__() - 1)
+
+         return imgs, target
+
+
+ def make_coco_transforms(image_set, max_size=640):
+     normalize = T.Compose([
+         T.ToTensor(),
+         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+
+     scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+     if image_set == 'train':
+         return T.Compose([
+             T.RandomHorizontalFlip(),
+             T.PhotometricDistort(),
+             T.RandomSelect(
+                 T.Compose([
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ]),
+                 T.Compose([
+                     T.RandomResize([400, 500, 600]),
+                     T.RandomSizeCrop(384, 600),
+                     T.RandomResize(scales, max_size=max_size),
+                     T.Check(),
+                 ])
+             ),
+             normalize,
+         ])
+
+     # we do not use the 'val' set since the annotations are inaccessible
+     if image_set == 'val':
+         return T.Compose([
+             T.RandomResize([360], max_size=640),
+             normalize,
+         ])
+
+     raise ValueError(f'unknown {image_set}')
+
+
+ def build(image_set, args):
+     root = Path(args.ytvos_path)
+     assert root.exists(), f'provided YTVOS path {root} does not exist'
+     PATHS = {
+         "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+         "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
+     }
+     img_folder, ann_file = PATHS[image_set]
+     # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+     #                        num_frames=args.num_frames, max_skip=args.max_skip)
+     dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                            num_frames=args.num_frames, max_skip=args.max_skip)
+     return dataset
+
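Since `__getitem__` returns an `(imgs, target)` pair whose tensors vary in size per clip, batching needs a collate that keeps samples separate. A hedged sketch (not from the commit; the repository's `engine.py` may collate differently), with a stand-in dataset so it runs without Ref-Youtube-VOS on disk:

```python
# Sketch only: batching (imgs, target) pairs shaped like YTVOSDataset outputs.
import torch
from torch.utils.data import DataLoader, Dataset

class FakeClips(Dataset):
    """Stand-in producing tensors shaped like the loader's outputs."""
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        imgs = torch.rand(4, 3, 360, 640)  # [T, 3, H, W]
        target = {'boxes': torch.rand(4, 4),
                  'valid': torch.ones(4, dtype=torch.long),
                  'caption': 'a person on the left'}
        return imgs, target

def trivial_collate(batch):
    # keep per-sample tensors and target dicts intact as parallel lists
    imgs, targets = zip(*batch)
    return list(imgs), list(targets)

loader = DataLoader(FakeClips(), batch_size=2, collate_fn=trivial_collate)
for imgs, targets in loader:
    print(imgs[0].shape, targets[0]['caption'])
```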
.history/datasets/ytvos_ref_20250113163340.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+ print(skip_vid_count)
105
+
106
+
107
+ @staticmethod
108
+ def bounding_box(img):
109
+ rows = np.any(img, axis=1)
110
+ cols = np.any(img, axis=0)
111
+ rmin, rmax = np.where(rows)[0][[0, -1]]
112
+ cmin, cmax = np.where(cols)[0][[0, -1]]
113
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
114
+
115
+ def __len__(self):
116
+ return len(self.metas)
117
+
118
+ def __getitem__(self, idx):
119
+ instance_check = False
120
+ while not instance_check:
121
+ meta = self.metas[idx] # dict
122
+
123
+
124
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
125
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
126
+
127
+
128
+ # clean up the caption
129
+ exp = " ".join(exp.lower().split())
130
+ category_id = category_dict[category]
131
+ vid_len = len(frames)
132
+
133
+ # num_frames = self.num_frames
134
+
135
+ # read frames and masks
136
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
137
+ for frame_indx in sample_frames_id:
138
+ frame_name = frames[frame_indx]
139
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
140
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
141
+ img = Image.open(img_path).convert('RGB')
142
+ mask = Image.open(mask_path).convert('P')
143
+
144
+ # create the target
145
+ label = torch.tensor(category_id)
146
+ mask = np.array(mask)
147
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
148
+ if (mask > 0).any():
149
+ y1, y2, x1, x2 = self.bounding_box(mask)
150
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
151
+ valid.append(1)
152
+ else: # some frame didn't contain the instance
153
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
154
+ valid.append(0)
155
+ mask = torch.from_numpy(mask)
156
+
157
+ # append
158
+ imgs.append(img)
159
+ labels.append(label)
160
+ masks.append(mask)
161
+ boxes.append(box)
162
+
163
+ # transform
164
+ w, h = img.size
165
+ labels = torch.stack(labels, dim=0)
166
+ boxes = torch.stack(boxes, dim=0)
167
+ boxes[:, 0::2].clamp_(min=0, max=w)
168
+ boxes[:, 1::2].clamp_(min=0, max=h)
169
+ masks = torch.stack(masks, dim=0)
170
+ target = {
171
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
172
+ 'labels': labels, # [T,]
173
+ 'boxes': boxes, # [T, 4], xyxy
174
+ 'masks': masks, # [T, H, W]
175
+ 'valid': torch.tensor(valid), # [T,]
176
+ 'caption': exp,
177
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
178
+ 'size': torch.as_tensor([int(h), int(w)])
179
+ }
180
+
181
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
182
+ if self._transforms:
183
+ imgs, target = self._transforms(imgs, target)
184
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
185
+ else:
186
+ imgs = np.array(imgs)
187
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
188
+
189
+
190
+ # FIXME: handle "valid", since some box may be removed due to random crop
191
+ if torch.any(target['valid'] == 1): # at least one instance
192
+ instance_check = True
193
+ else:
194
+ idx = random.randint(0, self.__len__() - 1)
195
+
196
+ return imgs, target
197
+
198
+
199
+ def make_coco_transforms(image_set, max_size=640):
200
+ normalize = T.Compose([
201
+ T.ToTensor(),
202
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
203
+ ])
204
+
205
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
206
+
207
+ if image_set == 'train':
208
+ return T.Compose([
209
+ T.RandomHorizontalFlip(),
210
+ T.PhotometricDistort(),
211
+ T.RandomSelect(
212
+ T.Compose([
213
+ T.RandomResize(scales, max_size=max_size),
214
+ T.Check(),
215
+ ]),
216
+ T.Compose([
217
+ T.RandomResize([400, 500, 600]),
218
+ T.RandomSizeCrop(384, 600),
219
+ T.RandomResize(scales, max_size=max_size),
220
+ T.Check(),
221
+ ])
222
+ ),
223
+ normalize,
224
+ ])
225
+
226
+ # we do not use the 'val' set since the annotations are inaccessible
227
+ if image_set == 'val':
228
+ return T.Compose([
229
+ T.RandomResize([360], max_size=640),
230
+ normalize,
231
+ ])
232
+
233
+ raise ValueError(f'unknown {image_set}')
234
+
235
+
236
+ def build(image_set, args):
237
+ root = Path(args.ytvos_path)
238
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
239
+ PATHS = {
240
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
241
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
242
+ }
243
+ img_folder, ann_file = PATHS[image_set]
244
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
245
+ # num_frames=args.num_frames, max_skip=args.max_skip)
246
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
247
+ num_frames=args.num_frames, max_skip=args.max_skip)
248
+ return dataset
249
+
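All of the loader snapshots in this commit share the same frame-sampling rule inside prepare_metas: drop the first and last two frames, split the remaining range into four bins, and draw one random frame per bin. A minimal, self-contained sketch of that rule; the helper name sample_four_frames and the toy video length are illustrative, not part of the repository.

import random

def sample_four_frames(vid_len, seed=None):
    """Split [2, vid_len-2) into 4 bins and draw one frame index per bin."""
    if seed is not None:
        random.seed(seed)
    start_idx, end_idx = 2, vid_len - 2          # exclude first/last two frames
    bin_size = (end_idx - start_idx) // 4
    bins = [(start_idx + i * bin_size,
             start_idx + (i + 1) * bin_size if i < 3 else end_idx)
            for i in range(4)]
    # random.randint is inclusive on both ends, so hi - 1 keeps the draw inside the bin
    sample_indx = sorted(random.randint(lo, hi - 1) for lo, hi in bins)
    return bins, sample_indx

# e.g. a 40-frame video: bins cover [2, 38) and one index is drawn from each
print(sample_four_frames(40, seed=0))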
.history/datasets/ytvos_ref_20250113163347.py ADDED
@@ -0,0 +1,249 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+ print(f"skipped {skip_vid_count} short videos")
105
+
106
+
107
+ @staticmethod
108
+ def bounding_box(img):
109
+ rows = np.any(img, axis=1)
110
+ cols = np.any(img, axis=0)
111
+ rmin, rmax = np.where(rows)[0][[0, -1]]
112
+ cmin, cmax = np.where(cols)[0][[0, -1]]
113
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
114
+
115
+ def __len__(self):
116
+ return len(self.metas)
117
+
118
+ def __getitem__(self, idx):
119
+ instance_check = False
120
+ while not instance_check:
121
+ meta = self.metas[idx] # dict
122
+
123
+
124
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
125
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
126
+
127
+
128
+ # clean up the caption
129
+ exp = " ".join(exp.lower().split())
130
+ category_id = category_dict[category]
131
+ vid_len = len(frames)
132
+
133
+ # num_frames = self.num_frames
134
+
135
+ # read frames and masks
136
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
137
+ for frame_indx in sample_frames_id:
138
+ frame_name = frames[frame_indx]
139
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
140
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
141
+ img = Image.open(img_path).convert('RGB')
142
+ mask = Image.open(mask_path).convert('P')
143
+
144
+ # create the target
145
+ label = torch.tensor(category_id)
146
+ mask = np.array(mask)
147
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
148
+ if (mask > 0).any():
149
+ y1, y2, x1, x2 = self.bounding_box(mask)
150
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
151
+ valid.append(1)
152
+ else: # some frame didn't contain the instance
153
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
154
+ valid.append(0)
155
+ mask = torch.from_numpy(mask)
156
+
157
+ # append
158
+ imgs.append(img)
159
+ labels.append(label)
160
+ masks.append(mask)
161
+ boxes.append(box)
162
+
163
+ # transform
164
+ w, h = img.size
165
+ labels = torch.stack(labels, dim=0)
166
+ boxes = torch.stack(boxes, dim=0)
167
+ boxes[:, 0::2].clamp_(min=0, max=w)
168
+ boxes[:, 1::2].clamp_(min=0, max=h)
169
+ masks = torch.stack(masks, dim=0)
170
+ target = {
171
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
172
+ 'labels': labels, # [T,]
173
+ 'boxes': boxes, # [T, 4], xyxy
174
+ 'masks': masks, # [T, H, W]
175
+ 'valid': torch.tensor(valid), # [T,]
176
+ 'caption': exp,
177
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
178
+ 'size': torch.as_tensor([int(h), int(w)])
179
+ }
180
+
181
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
182
+ if self._transforms:
183
+ imgs, target = self._transforms(imgs, target)
184
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
185
+ else:
186
+ imgs = np.array(imgs)
187
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
188
+
189
+
190
+ # FIXME: handle "valid", since some box may be removed due to random crop
191
+ if torch.any(target['valid'] == 1): # at least one instance
192
+ instance_check = True
193
+ else:
194
+ idx = random.randint(0, self.__len__() - 1)
195
+
196
+ return imgs, target
197
+
198
+
199
+ def make_coco_transforms(image_set, max_size=640):
200
+ normalize = T.Compose([
201
+ T.ToTensor(),
202
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
203
+ ])
204
+
205
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
206
+
207
+ if image_set == 'train':
208
+ return T.Compose([
209
+ T.RandomHorizontalFlip(),
210
+ T.PhotometricDistort(),
211
+ T.RandomSelect(
212
+ T.Compose([
213
+ T.RandomResize(scales, max_size=max_size),
214
+ T.Check(),
215
+ ]),
216
+ T.Compose([
217
+ T.RandomResize([400, 500, 600]),
218
+ T.RandomSizeCrop(384, 600),
219
+ T.RandomResize(scales, max_size=max_size),
220
+ T.Check(),
221
+ ])
222
+ ),
223
+ normalize,
224
+ ])
225
+
226
+ # we do not use the 'val' set since the annotations are inaccessible
227
+ if image_set == 'val':
228
+ return T.Compose([
229
+ T.RandomResize([360], max_size=640),
230
+ normalize,
231
+ ])
232
+
233
+ raise ValueError(f'unknown {image_set}')
234
+
235
+
236
+ def build(image_set, args):
237
+ root = Path(args.ytvos_path)
238
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
239
+ PATHS = {
240
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
241
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
242
+ }
243
+ img_folder, ann_file = PATHS[image_set]
244
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
245
+ # num_frames=args.num_frames, max_skip=args.max_skip)
246
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
247
+ num_frames=args.num_frames, max_skip=args.max_skip)
248
+ return dataset
249
+
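The bounding_box helper above derives a y1, y2, x1, x2 box from a binary mask by finding the first and last non-empty row and column. A small usage sketch on a synthetic mask; the array values are made up.

import numpy as np

# synthetic 6x8 binary mask with a 2x3 blob of ones
mask = np.zeros((6, 8), dtype=np.float32)
mask[2:4, 3:6] = 1.0

rows = np.any(mask, axis=1)
cols = np.any(mask, axis=0)
rmin, rmax = np.where(rows)[0][[0, -1]]   # first/last non-empty row
cmin, cmax = np.where(cols)[0][[0, -1]]   # first/last non-empty column
print(rmin, rmax, cmin, cmax)             # 2 3 3 5  -> y1, y2, x1, x2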
.history/datasets/ytvos_ref_20250114202456.py ADDED
@@ -0,0 +1,251 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ for exp_id, exp_dict in vid_data['expressions'].items():
75
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
76
+ start_idx , end_idx = 2, vid_len-2
77
+ bin_size = (end_idx - start_idx) // 4
78
+
79
+ bins = []
80
+ for i in range(4):
81
+ bin_start = start_idx + i * bin_size
82
+ bin_end = bin_start + bin_size if i < 3 else end_idx
83
+
84
+ bins.append((bin_start, bin_end))
85
+
86
+ # Random sample one frame from each bin
87
+ sample_indx = []
88
+ for start_idx, end_idx in bins:
89
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
90
+ sample_indx.sort() # Ensure indices are in order
91
+
92
+
93
+ for sample_id in sample_indx:
94
+ meta = {
95
+ 'video': vid,
96
+ 'exp': exp_dict['exp'],
97
+ 'obj_id': int(exp_dict['obj_id']),
98
+ 'frames': vid_frames,
99
+ 'sample_id' : sample_id,
100
+ 'sample_frames_id' : sample_indx,
101
+ 'bins': bins,
102
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
103
+ }
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+
126
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
127
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
128
+
129
+
130
+ # clean up the caption
131
+ exp = " ".join(exp.lower().split())
132
+ category_id = category_dict[category]
133
+ vid_len = len(frames)
134
+
135
+ # num_frames = self.num_frames
136
+
137
+ # read frames and masks
138
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
139
+ for frame_indx in sample_frames_id:
140
+ frame_name = frames[frame_indx]
141
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
142
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
143
+ img = Image.open(img_path).convert('RGB')
144
+ mask = Image.open(mask_path).convert('P')
145
+
146
+ # create the target
147
+ label = torch.tensor(category_id)
148
+ mask = np.array(mask)
149
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
150
+ if (mask > 0).any():
151
+ y1, y2, x1, x2 = self.bounding_box(mask)
152
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
153
+ valid.append(1)
154
+ else: # some frame didn't contain the instance
155
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
156
+ valid.append(0)
157
+ mask = torch.from_numpy(mask)
158
+
159
+ # append
160
+ imgs.append(img)
161
+ labels.append(label)
162
+ masks.append(mask)
163
+ boxes.append(box)
164
+
165
+ # transform
166
+ w, h = img.size
167
+ labels = torch.stack(labels, dim=0)
168
+ boxes = torch.stack(boxes, dim=0)
169
+ boxes[:, 0::2].clamp_(min=0, max=w)
170
+ boxes[:, 1::2].clamp_(min=0, max=h)
171
+ masks = torch.stack(masks, dim=0)
172
+ target = {
173
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
174
+ 'labels': labels, # [T,]
175
+ 'boxes': boxes, # [T, 4], xyxy
176
+ 'masks': masks, # [T, H, W]
177
+ 'valid': torch.tensor(valid), # [T,]
178
+ 'caption': exp,
179
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
180
+ 'size': torch.as_tensor([int(h), int(w)])
181
+ }
182
+
183
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
184
+ if self._transforms:
185
+ imgs, target = self._transforms(imgs, target)
186
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
187
+ else:
188
+ imgs = np.array(imgs)
189
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
190
+
191
+
192
+ # FIXME: handle "valid", since some box may be removed due to random crop
193
+ if torch.any(target['valid'] == 1): # at least one instance
194
+ instance_check = True
195
+ else:
196
+ idx = random.randint(0, self.__len__() - 1)
197
+
198
+ return imgs, target
199
+
200
+
201
+ def make_coco_transforms(image_set, max_size=640):
202
+ normalize = T.Compose([
203
+ T.ToTensor(),
204
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
205
+ ])
206
+
207
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
208
+
209
+ if image_set == 'train':
210
+ return T.Compose([
211
+ T.RandomHorizontalFlip(),
212
+ T.PhotometricDistort(),
213
+ T.RandomSelect(
214
+ T.Compose([
215
+ T.RandomResize(scales, max_size=max_size),
216
+ T.Check(),
217
+ ]),
218
+ T.Compose([
219
+ T.RandomResize([400, 500, 600]),
220
+ T.RandomSizeCrop(384, 600),
221
+ T.RandomResize(scales, max_size=max_size),
222
+ T.Check(),
223
+ ])
224
+ ),
225
+ normalize,
226
+ ])
227
+
228
+ # we do not use the 'val' set since the annotations are inaccessible
229
+ if image_set == 'val':
230
+ return T.Compose([
231
+ T.RandomResize([360], max_size=640),
232
+ normalize,
233
+ ])
234
+
235
+ raise ValueError(f'unknown {image_set}')
236
+
237
+
238
+ def build(image_set, args):
239
+ root = Path(args.ytvos_path)
240
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
241
+ PATHS = {
242
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
243
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
244
+ }
245
+ img_folder, ann_file = PATHS[image_set]
246
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
247
+ # num_frames=args.num_frames, max_skip=args.max_skip)
248
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
249
+ num_frames=args.num_frames, max_skip=args.max_skip)
250
+ return dataset
251
+
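__getitem__ accumulates per-frame labels, boxes, masks and valid flags in Python lists and then stacks them into [T, ...] tensors. A toy illustration of that stacking step; the shapes, category id and valid pattern below are invented.

import torch

# Toy per-frame pieces for T = 4 sampled frames, mirroring the lists built in __getitem__
T, H, W = 4, 480, 640
labels = [torch.tensor(21) for _ in range(T)]                       # one category id per frame
boxes  = [torch.tensor([10., 20., 110., 220.]) for _ in range(T)]   # xyxy per frame
masks  = [torch.zeros(H, W) for _ in range(T)]
valid  = [1, 1, 0, 1]                                               # third frame misses the object

target = {
    'labels': torch.stack(labels),   # [T]
    'boxes':  torch.stack(boxes),    # [T, 4]
    'masks':  torch.stack(masks),    # [T, H, W]
    'valid':  torch.tensor(valid),   # [T]
}
print({k: tuple(v.shape) for k, v in target.items()})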
.history/datasets/ytvos_ref_20250114205130.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for sample_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'sample_id' : sample_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at leatst one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
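build() only reads a handful of fields from args. A hypothetical way to instantiate the dataset outside the training script, assuming the file above is importable as datasets.ytvos_ref; the path and hyper-parameter values are placeholders.

from argparse import Namespace
from datasets.ytvos_ref import build   # assumption: the snapshot above is installed under this name

# Hypothetical arguments: only the fields build()/YTVOSDataset read are filled in,
# and ytvos_path is a placeholder for a local Ref-Youtube-VOS root.
args = Namespace(ytvos_path='/data/ref-youtube-vos', masks=True, num_frames=4, max_skip=3)

dataset = build('train', args)          # YTVOSDataset over the train split
imgs, target = dataset[0]               # imgs: [T, 3, H, W]; target: per-clip dict
print(len(dataset), target['caption'])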
.history/datasets/ytvos_ref_20250114211235.py ADDED
@@ -0,0 +1,252 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins
96
+ }
97
+ obj_id_cat = {}
98
+ for exp_id, exp_dict in vid_data['expressions'].items():
99
+ obj_id = exp_dict['obj_id']
100
+
101
+
102
+ if obj_id not in obj_id_cat:
103
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
104
+ meta['obj_id_cat'] = obj_id_cat
105
+ self.metas.append(meta)
106
+
107
+ print(f"skipped {skip_vid_count} short videos")
108
+
109
+
110
+ @staticmethod
111
+ def bounding_box(img):
112
+ rows = np.any(img, axis=1)
113
+ cols = np.any(img, axis=0)
114
+ rmin, rmax = np.where(rows)[0][[0, -1]]
115
+ cmin, cmax = np.where(cols)[0][[0, -1]]
116
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
117
+
118
+ def __len__(self):
119
+ return len(self.metas)
120
+
121
+ def __getitem__(self, idx):
122
+ instance_check = False
123
+ while not instance_check:
124
+ meta = self.metas[idx] # dict
125
+
126
+
127
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
128
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
129
+
130
+
131
+ # clean up the caption
132
+ exp = " ".join(exp.lower().split())
133
+ category_id = category_dict[category]
134
+ vid_len = len(frames)
135
+
136
+ # num_frames = self.num_frames
137
+
138
+ # read frames and masks
139
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
140
+ for frame_indx in sample_frames_id:
141
+ frame_name = frames[frame_indx]
142
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
143
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
144
+ img = Image.open(img_path).convert('RGB')
145
+ mask = Image.open(mask_path).convert('P')
146
+
147
+ # create the target
148
+ label = torch.tensor(category_id)
149
+ mask = np.array(mask)
150
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
151
+ if (mask > 0).any():
152
+ y1, y2, x1, x2 = self.bounding_box(mask)
153
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
154
+ valid.append(1)
155
+ else: # some frame didn't contain the instance
156
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
157
+ valid.append(0)
158
+ mask = torch.from_numpy(mask)
159
+
160
+ # append
161
+ imgs.append(img)
162
+ labels.append(label)
163
+ masks.append(mask)
164
+ boxes.append(box)
165
+
166
+ # transform
167
+ w, h = img.size
168
+ labels = torch.stack(labels, dim=0)
169
+ boxes = torch.stack(boxes, dim=0)
170
+ boxes[:, 0::2].clamp_(min=0, max=w)
171
+ boxes[:, 1::2].clamp_(min=0, max=h)
172
+ masks = torch.stack(masks, dim=0)
173
+ target = {
174
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
175
+ 'labels': labels, # [T,]
176
+ 'boxes': boxes, # [T, 4], xyxy
177
+ 'masks': masks, # [T, H, W]
178
+ 'valid': torch.tensor(valid), # [T,]
179
+ 'caption': exp,
180
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
181
+ 'size': torch.as_tensor([int(h), int(w)])
182
+ }
183
+
184
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
185
+ if self._transforms:
186
+ imgs, target = self._transforms(imgs, target)
187
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
188
+ else:
189
+ imgs = np.array(imgs)
190
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
191
+
192
+
193
+ # FIXME: handle "valid", since some box may be removed due to random crop
194
+ if torch.any(target['valid'] == 1): # at least one instance
195
+ instance_check = True
196
+ else:
197
+ idx = random.randint(0, self.__len__() - 1)
198
+
199
+ return imgs, target
200
+
201
+
202
+ def make_coco_transforms(image_set, max_size=640):
203
+ normalize = T.Compose([
204
+ T.ToTensor(),
205
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
206
+ ])
207
+
208
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
209
+
210
+ if image_set == 'train':
211
+ return T.Compose([
212
+ T.RandomHorizontalFlip(),
213
+ T.PhotometricDistort(),
214
+ T.RandomSelect(
215
+ T.Compose([
216
+ T.RandomResize(scales, max_size=max_size),
217
+ T.Check(),
218
+ ]),
219
+ T.Compose([
220
+ T.RandomResize([400, 500, 600]),
221
+ T.RandomSizeCrop(384, 600),
222
+ T.RandomResize(scales, max_size=max_size),
223
+ T.Check(),
224
+ ])
225
+ ),
226
+ normalize,
227
+ ])
228
+
229
+ # we do not use the 'val' set since the annotations are inaccessible
230
+ if image_set == 'val':
231
+ return T.Compose([
232
+ T.RandomResize([360], max_size=640),
233
+ normalize,
234
+ ])
235
+
236
+ raise ValueError(f'unknown {image_set}')
237
+
238
+
239
+ def build(image_set, args):
240
+ root = Path(args.ytvos_path)
241
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
242
+ PATHS = {
243
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
244
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
245
+ }
246
+ img_folder, ann_file = PATHS[image_set]
247
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
248
+ # num_frames=args.num_frames, max_skip=args.max_skip)
249
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
250
+ num_frames=args.num_frames, max_skip=args.max_skip)
251
+ return dataset
252
+
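In this snapshot each video yields a single meta entry plus an obj_id_cat map from object id to category, read from the objects table of meta.json. A toy version of that mapping step; the ids, expressions and categories are made up (note the object ids are string keys).

# Toy slice of a meta.json 'videos' entry, shaped like what prepare_metas reads
vid_meta = {'objects': {'1': {'category': 'person'}, '2': {'category': 'dog'}}}
expressions = {'0': {'exp': 'a man in red', 'obj_id': '1'},
               '1': {'exp': 'the dog on the left', 'obj_id': '2'},
               '2': {'exp': 'person walking', 'obj_id': '1'}}

obj_id_cat = {}
for exp_id, exp_dict in expressions.items():
    obj_id = exp_dict['obj_id']                       # string key into the objects table
    if obj_id not in obj_id_cat:
        obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
print(obj_id_cat)                                     # {'1': 'person', '2': 'dog'}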
.history/datasets/ytvos_ref_20250114211331.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins
96
+ }
97
+ obj_id_cat = {}
98
+ for exp_id, exp_dict in vid_data['expressions'].items():
99
+ obj_id = exp_dict['obj_id']
100
+ if obj_id not in obj_id_cat:
101
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
102
+ meta['obj_id_cat'] = obj_id_cat
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at leatst one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
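The comment in __getitem__ notes that boxes are clamped to the image and later normalized to [0, 1] cxcywh inside self._transforms. A standalone sketch of that conversion with an illustrative helper name; the real version lives in datasets.transforms_video.

import torch

def box_xyxy_to_cxcywh_norm(boxes, w, h):
    """Clamp xyxy boxes to the image and convert to normalized cxcywh."""
    boxes = boxes.clone()
    boxes[:, 0::2].clamp_(min=0, max=w)
    boxes[:, 1::2].clamp_(min=0, max=h)
    x1, y1, x2, y2 = boxes.unbind(-1)
    cxcywh = torch.stack([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)
    return cxcywh / torch.tensor([w, h, w, h], dtype=torch.float)

print(box_xyxy_to_cxcywh_norm(torch.tensor([[10., 20., 110., 220.]]), w=640, h=480))
# tensor([[0.0938, 0.2500, 0.1562, 0.4167]])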
.history/datasets/ytvos_ref_20250114211640.py ADDED
@@ -0,0 +1,242 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+
137
+ # create the target
138
+ label = torch.tensor(category_id)
139
+ mask = np.array(mask)
140
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ mask = torch.from_numpy(mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ labels.append(label)
153
+ masks.append(mask)
154
+ boxes.append(box)
155
+
156
+ # transform
157
+ w, h = img.size
158
+ labels = torch.stack(labels, dim=0)
159
+ boxes = torch.stack(boxes, dim=0)
160
+ boxes[:, 0::2].clamp_(min=0, max=w)
161
+ boxes[:, 1::2].clamp_(min=0, max=h)
162
+ masks = torch.stack(masks, dim=0)
163
+ target = {
164
+ 'frames_idx': torch.tensor(sample_indx), # [T,]
165
+ 'labels': labels, # [T,]
166
+ 'boxes': boxes, # [T, 4], xyxy
167
+ 'masks': masks, # [T, H, W]
168
+ 'valid': torch.tensor(valid), # [T,]
169
+ 'caption': exp,
170
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
171
+ 'size': torch.as_tensor([int(h), int(w)])
172
+ }
173
+
174
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
175
+ if self._transforms:
176
+ imgs, target = self._transforms(imgs, target)
177
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
178
+ else:
179
+ imgs = np.array(imgs)
180
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
181
+
182
+
183
+ # FIXME: handle "valid", since some box may be removed due to random crop
184
+ if torch.any(target['valid'] == 1): # at least one instance
185
+ instance_check = True
186
+ else:
187
+ idx = random.randint(0, self.__len__() - 1)
188
+
189
+ return imgs, target
190
+
191
+
192
+ def make_coco_transforms(image_set, max_size=640):
193
+ normalize = T.Compose([
194
+ T.ToTensor(),
195
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
196
+ ])
197
+
198
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
199
+
200
+ if image_set == 'train':
201
+ return T.Compose([
202
+ T.RandomHorizontalFlip(),
203
+ T.PhotometricDistort(),
204
+ T.RandomSelect(
205
+ T.Compose([
206
+ T.RandomResize(scales, max_size=max_size),
207
+ T.Check(),
208
+ ]),
209
+ T.Compose([
210
+ T.RandomResize([400, 500, 600]),
211
+ T.RandomSizeCrop(384, 600),
212
+ T.RandomResize(scales, max_size=max_size),
213
+ T.Check(),
214
+ ])
215
+ ),
216
+ normalize,
217
+ ])
218
+
219
+ # we do not use the 'val' set since the annotations are inaccessible
220
+ if image_set == 'val':
221
+ return T.Compose([
222
+ T.RandomResize([360], max_size=640),
223
+ normalize,
224
+ ])
225
+
226
+ raise ValueError(f'unknown {image_set}')
227
+
228
+
229
+ def build(image_set, args):
230
+ root = Path(args.ytvos_path)
231
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
232
+ PATHS = {
233
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
234
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
235
+ }
236
+ img_folder, ann_file = PATHS[image_set]
237
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
238
+ # num_frames=args.num_frames, max_skip=args.max_skip)
239
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
240
+ num_frames=args.num_frames, max_skip=args.max_skip)
241
+ return dataset
242
+
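For quick reference, a minimal standalone sketch of the bin-based frame sampling that prepare_metas implements in these snapshots (the helper name sample_four_frames and the 30-frame example are illustrative only, not part of the repository):

import random

def sample_four_frames(vid_len: int, num_bins: int = 4):
    # Mirror the snapshot's logic: skip the first/last two frames, split the
    # remaining index range into num_bins bins, and draw one index per bin.
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    # one frame index per bin, kept in temporal order
    sample_indx = sorted(random.randint(b_start, b_end - 1) for (b_start, b_end) in bins)
    return bins, sample_indx

# example: a 30-frame video gives one index from each quarter of the range [2, 28)
print(sample_four_frames(30))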
.history/datasets/ytvos_ref_20250114211841.py ADDED
@@ -0,0 +1,242 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+
137
+ # create the target
138
+ label = torch.tensor(category_id)
139
+ mask = np.array(mask)
140
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ mask = torch.from_numpy(mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ labels.append(label)
153
+ masks.append(mask)
154
+ boxes.append(box)
155
+
156
+ # transform
157
+ w, h = img.size
158
+ labels = torch.stack(labels, dim=0)
159
+ boxes = torch.stack(boxes, dim=0)
160
+ boxes[:, 0::2].clamp_(min=0, max=w)
161
+ boxes[:, 1::2].clamp_(min=0, max=h)
162
+ masks = torch.stack(masks, dim=0)
163
+ target = {
164
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
165
+ 'labels': labels, # [T,]
166
+ 'boxes': boxes, # [T, 4], xyxy
167
+ 'masks': masks, # [T, H, W]
168
+ 'valid': torch.tensor(valid), # [T,]
169
+ 'caption': exp,
170
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
171
+ 'size': torch.as_tensor([int(h), int(w)])
172
+ }
173
+
174
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
175
+ if self._transforms:
176
+ imgs, target = self._transforms(imgs, target)
177
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
178
+ else:
179
+ imgs = np.array(imgs)
180
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
181
+
182
+
183
+ # FIXME: handle "valid", since some box may be removed due to random crop
184
+ if torch.any(target['valid'] == 1): # at least one instance
185
+ instance_check = True
186
+ else:
187
+ idx = random.randint(0, self.__len__() - 1)
188
+
189
+ return imgs, target
190
+
191
+
192
+ def make_coco_transforms(image_set, max_size=640):
193
+ normalize = T.Compose([
194
+ T.ToTensor(),
195
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
196
+ ])
197
+
198
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
199
+
200
+ if image_set == 'train':
201
+ return T.Compose([
202
+ T.RandomHorizontalFlip(),
203
+ T.PhotometricDistort(),
204
+ T.RandomSelect(
205
+ T.Compose([
206
+ T.RandomResize(scales, max_size=max_size),
207
+ T.Check(),
208
+ ]),
209
+ T.Compose([
210
+ T.RandomResize([400, 500, 600]),
211
+ T.RandomSizeCrop(384, 600),
212
+ T.RandomResize(scales, max_size=max_size),
213
+ T.Check(),
214
+ ])
215
+ ),
216
+ normalize,
217
+ ])
218
+
219
+ # we do not use the 'val' set since the annotations are inaccessible
220
+ if image_set == 'val':
221
+ return T.Compose([
222
+ T.RandomResize([360], max_size=640),
223
+ normalize,
224
+ ])
225
+
226
+ raise ValueError(f'unknown {image_set}')
227
+
228
+
229
+ def build(image_set, args):
230
+ root = Path(args.ytvos_path)
231
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
232
+ PATHS = {
233
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
234
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
235
+ }
236
+ img_folder, ann_file = PATHS[image_set]
237
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
238
+ # num_frames=args.num_frames, max_skip=args.max_skip)
239
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
240
+ num_frames=args.num_frames, max_skip=args.max_skip)
241
+ return dataset
242
+
.history/datasets/ytvos_ref_20250114212623.py ADDED
@@ -0,0 +1,242 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ labels = torch.stack(labels, dim=0)
159
+ boxes = torch.stack(boxes, dim=0)
160
+ boxes[:, 0::2].clamp_(min=0, max=w)
161
+ boxes[:, 1::2].clamp_(min=0, max=h)
162
+ masks = torch.stack(masks, dim=0)
163
+ target = {
164
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
165
+ 'labels': labels, # [T,]
166
+ 'boxes': boxes, # [T, 4], xyxy
167
+ 'masks': masks, # [T, H, W]
168
+ 'valid': torch.tensor(valid), # [T,]
169
+ 'caption': exp,
170
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
171
+ 'size': torch.as_tensor([int(h), int(w)])
172
+ }
173
+
174
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
175
+ if self._transforms:
176
+ imgs, target = self._transforms(imgs, target)
177
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
178
+ else:
179
+ imgs = np.array(imgs)
180
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
181
+
182
+
183
+ # FIXME: handle "valid", since some box may be removed due to random crop
184
+ if torch.any(target['valid'] == 1): # at least one instance
185
+ instance_check = True
186
+ else:
187
+ idx = random.randint(0, self.__len__() - 1)
188
+
189
+ return imgs, target
190
+
191
+
192
+ def make_coco_transforms(image_set, max_size=640):
193
+ normalize = T.Compose([
194
+ T.ToTensor(),
195
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
196
+ ])
197
+
198
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
199
+
200
+ if image_set == 'train':
201
+ return T.Compose([
202
+ T.RandomHorizontalFlip(),
203
+ T.PhotometricDistort(),
204
+ T.RandomSelect(
205
+ T.Compose([
206
+ T.RandomResize(scales, max_size=max_size),
207
+ T.Check(),
208
+ ]),
209
+ T.Compose([
210
+ T.RandomResize([400, 500, 600]),
211
+ T.RandomSizeCrop(384, 600),
212
+ T.RandomResize(scales, max_size=max_size),
213
+ T.Check(),
214
+ ])
215
+ ),
216
+ normalize,
217
+ ])
218
+
219
+ # we do not use the 'val' set since the annotations are inaccessible
220
+ if image_set == 'val':
221
+ return T.Compose([
222
+ T.RandomResize([360], max_size=640),
223
+ normalize,
224
+ ])
225
+
226
+ raise ValueError(f'unknown {image_set}')
227
+
228
+
229
+ def build(image_set, args):
230
+ root = Path(args.ytvos_path)
231
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
232
+ PATHS = {
233
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
234
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
235
+ }
236
+ img_folder, ann_file = PATHS[image_set]
237
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
238
+ # num_frames=args.num_frames, max_skip=args.max_skip)
239
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
240
+ num_frames=args.num_frames, max_skip=args.max_skip)
241
+ return dataset
242
+
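As an aside, a small numpy sketch of what the bounding_box helper in these snapshots computes from a binary mask (the toy 6x8 mask is illustrative only):

import numpy as np

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:4, 3:6] = 1.0                     # a 2x3 blob of foreground pixels

rows = np.any(mask, axis=1)              # which rows contain foreground
cols = np.any(mask, axis=0)              # which columns contain foreground
y1, y2 = np.where(rows)[0][[0, -1]]      # first and last foreground row
x1, x2 = np.where(cols)[0][[0, -1]]      # first and last foreground column
print(y1, y2, x1, x2)                    # 2 3 3 5, later stored as box [x1, y1, x2, y2]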
.history/datasets/ytvos_ref_20250116071135.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ labels = torch.stack(labels, dim=0)
159
+ boxes = torch.stack(boxes, dim=0)
160
+ boxes[:, 0::2].clamp_(min=0, max=w)
161
+ boxes[:, 1::2].clamp_(min=0, max=h)
162
+ masks = torch.stack(masks, dim=0)
163
+ target = {
164
+ 'frames_idx': sample_indx, # [T,]
165
+ 'boxes': boxes, # [T, 4], xyxy
166
+ 'masks': masks, # [T, H, W]
167
+ 'valid': torch.tensor(valid), # [T,]
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # FIXME: handle "valid", since some box may be removed due to random crop
182
+ if torch.any(target['valid'] == 1): # at least one instance
183
+ instance_check = True
184
+ else:
185
+ idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
.history/datasets/ytvos_ref_20250116071255.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
168
+ 'size': torch.as_tensor([int(h), int(w)])
169
+ }
170
+
171
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
172
+ if self._transforms:
173
+ imgs, target = self._transforms(imgs, target)
174
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
175
+ else:
176
+ imgs = np.array(imgs)
177
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
178
+
179
+
180
+ # FIXME: handle "valid", since some box may be removed due to random crop
181
+ if torch.any(target['valid'] == 1): # at least one instance
182
+ instance_check = True
183
+ else:
184
+ idx = random.randint(0, self.__len__() - 1)
185
+
186
+ return imgs, target
187
+
188
+
189
+ def make_coco_transforms(image_set, max_size=640):
190
+ normalize = T.Compose([
191
+ T.ToTensor(),
192
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
193
+ ])
194
+
195
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
196
+
197
+ if image_set == 'train':
198
+ return T.Compose([
199
+ T.RandomHorizontalFlip(),
200
+ T.PhotometricDistort(),
201
+ T.RandomSelect(
202
+ T.Compose([
203
+ T.RandomResize(scales, max_size=max_size),
204
+ T.Check(),
205
+ ]),
206
+ T.Compose([
207
+ T.RandomResize([400, 500, 600]),
208
+ T.RandomSizeCrop(384, 600),
209
+ T.RandomResize(scales, max_size=max_size),
210
+ T.Check(),
211
+ ])
212
+ ),
213
+ normalize,
214
+ ])
215
+
216
+ # we do not use the 'val' set since the annotations are inaccessible
217
+ if image_set == 'val':
218
+ return T.Compose([
219
+ T.RandomResize([360], max_size=640),
220
+ normalize,
221
+ ])
222
+
223
+ raise ValueError(f'unknown {image_set}')
224
+
225
+
226
+ def build(image_set, args):
227
+ root = Path(args.ytvos_path)
228
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
229
+ PATHS = {
230
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
231
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
232
+ }
233
+ img_folder, ann_file = PATHS[image_set]
234
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
235
+ # num_frames=args.num_frames, max_skip=args.max_skip)
236
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
237
+ num_frames=args.num_frames, max_skip=args.max_skip)
238
+ return dataset
239
+
.history/datasets/ytvos_ref_20250116071502.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # FIXME: handle "valid", since some box may be removed due to random crop
182
+ if torch.any(target['valid'] == 1): # at least one instance
183
+ instance_check = True
184
+ else:
185
+ idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
.history/datasets/ytvos_ref_20250116071546.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # FIXME: handle "valid", since some box may be removed due to random crop
182
+ if torch.any(target['valid'] == 1): # at least one instance
183
+ instance_check = True
184
+ else:
185
+ idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
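A hedged usage sketch for the build entry point in these snapshots; the attribute names (ytvos_path, masks, num_frames, max_skip) follow the code above, while the SimpleNamespace values and the dataset root are placeholders:

from types import SimpleNamespace
from datasets.ytvos_ref import build     # assumed module path for the file above

args = SimpleNamespace(
    ytvos_path='/data/ref-youtube-vos',  # hypothetical dataset root
    masks=True,
    num_frames=4,
    max_skip=3,
)
dataset = build('train', args)           # YTVOSDataset over the train split
imgs, target = dataset[0]                # imgs: [T, 3, H, W]; target holds per-frame boxes/masks/valid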
.history/datasets/ytvos_ref_20250116071553.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ imgs.append(img)
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # FIXME: handle "valid", since some box may be removed due to random crop
182
+ if torch.any(target['valid'] == 1): # at least one instance
183
+ instance_check = True
184
+ else:
185
+ idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
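The frame selection in `prepare_metas` above can be read in isolation: the first and last two frames are excluded, the remaining range is split into four bins, and one index is drawn per bin. A self-contained sketch with an illustrative video length:

```
# Self-contained sketch of the 4-bin frame sampling used in prepare_metas above.
import random

def sample_four_frames(vid_len):
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // 4

    bins = []
    for i in range(4):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < 3 else end_idx
        bins.append((bin_start, bin_end))

    # one random index per bin, kept in temporal order
    sample_indx = sorted(random.randint(s, e - 1) for s, e in bins)
    return bins, sample_indx

print(sample_four_frames(36))  # e.g. ([(2, 10), (10, 18), (18, 26), (26, 34)], [5, 13, 20, 29])
```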
.history/datasets/ytvos_ref_20250116071841.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ masks.append(obj_mask)
152
+ boxes.append(box)
153
+
154
+
155
+ # transform
156
+ w, h = img.size
157
+ boxes = torch.stack(boxes, dim=0)
158
+ boxes[:, 0::2].clamp_(min=0, max=w)
159
+ boxes[:, 1::2].clamp_(min=0, max=h)
160
+ masks = torch.stack(masks, dim=0)
161
+ target = {
162
+ 'frames_idx': sample_indx, # [T,]
163
+ 'boxes': boxes, # [T, 4], xyxy
164
+ 'masks': masks, # [T, H, W]
165
+ 'valid': torch.tensor(valid), # [T,]
166
+ 'obj_ids' : list(obj_id_cat.keys()),
167
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
168
+ 'size': torch.as_tensor([int(h), int(w)])
169
+ }
170
+
171
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
172
+ if self._transforms:
173
+ imgs, target = self._transforms(imgs, target)
174
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
175
+ else:
176
+ imgs = np.array(imgs)
177
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
178
+
179
+
180
+ # FIXME: handle "valid", since some box may be removed due to random crop
181
+ if torch.any(target['valid'] == 1): # at least one instance
182
+ instance_check = True
183
+ else:
184
+ idx = random.randint(0, self.__len__() - 1)
185
+
186
+ return imgs, target
187
+
188
+
189
+ def make_coco_transforms(image_set, max_size=640):
190
+ normalize = T.Compose([
191
+ T.ToTensor(),
192
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
193
+ ])
194
+
195
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
196
+
197
+ if image_set == 'train':
198
+ return T.Compose([
199
+ T.RandomHorizontalFlip(),
200
+ T.PhotometricDistort(),
201
+ T.RandomSelect(
202
+ T.Compose([
203
+ T.RandomResize(scales, max_size=max_size),
204
+ T.Check(),
205
+ ]),
206
+ T.Compose([
207
+ T.RandomResize([400, 500, 600]),
208
+ T.RandomSizeCrop(384, 600),
209
+ T.RandomResize(scales, max_size=max_size),
210
+ T.Check(),
211
+ ])
212
+ ),
213
+ normalize,
214
+ ])
215
+
216
+ # we do not use the 'val' set since the annotations are inaccessible
217
+ if image_set == 'val':
218
+ return T.Compose([
219
+ T.RandomResize([360], max_size=640),
220
+ normalize,
221
+ ])
222
+
223
+ raise ValueError(f'unknown {image_set}')
224
+
225
+
226
+ def build(image_set, args):
227
+ root = Path(args.ytvos_path)
228
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
229
+ PATHS = {
230
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
231
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
232
+ }
233
+ img_folder, ann_file = PATHS[image_set]
234
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
235
+ # num_frames=args.num_frames, max_skip=args.max_skip)
236
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
237
+ num_frames=args.num_frames, max_skip=args.max_skip)
238
+ return dataset
239
+
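The `bounding_box` helper shared by these snapshots is easy to sanity-check on a toy mask; it returns the tight (y1, y2, x1, x2) box around the non-zero region:

```
# Toy check of the bounding_box helper defined above.
import numpy as np

def bounding_box(img):
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

mask = np.zeros((6, 8), dtype=np.uint8)
mask[2:5, 3:7] = 1            # a 3x4 blob of foreground pixels
print(bounding_box(mask))     # (2, 4, 3, 6)
```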
.history/datasets/ytvos_ref_20250116072442.py ADDED
@@ -0,0 +1,241 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ imgs.append(img)
136
+
137
+ mask = Image.open(mask_path).convert('P')
138
+ mask = np.array(mask)
139
+
140
+ # create the target
141
+ for obj_id in list(obj_id_cat.keys()):
142
+ obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
143
+ if (obj_mask > 0).any():
144
+ y1, y2, x1, x2 = self.bounding_box(mask)
145
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
146
+ valid.append(1)
147
+ else: # some frame didn't contain the instance
148
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
149
+ valid.append(0)
150
+ obj_mask = torch.from_numpy(obj_mask)
151
+
152
+ # append
153
+ masks.append(obj_mask)
154
+ boxes.append(box)
155
+
156
+
157
+ # transform
158
+ w, h = img.size
159
+ boxes = torch.stack(boxes, dim=0)
160
+ boxes[:, 0::2].clamp_(min=0, max=w)
161
+ boxes[:, 1::2].clamp_(min=0, max=h)
162
+ masks = torch.stack(masks, dim=0)
163
+ target = {
164
+ 'frames_idx': sample_indx, # [T,]
165
+ 'boxes': boxes, # [T, 4], xyxy
166
+ 'masks': masks, # [T, H, W]
167
+ 'valid': torch.tensor(valid), # [T,]
168
+ 'obj_ids' : list(obj_id_cat.keys()),
169
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
170
+ 'size': torch.as_tensor([int(h), int(w)])
171
+ }
172
+
173
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
174
+ if self._transforms:
175
+ imgs, target = self._transforms(imgs, target)
176
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
177
+ else:
178
+ imgs = np.array(imgs)
179
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
180
+
181
+
182
+ # FIXME: handle "valid", since some box may be removed due to random crop
183
+ if torch.any(target['valid'] == 1): # at least one instance
184
+ instance_check = True
185
+ else:
186
+ idx = random.randint(0, self.__len__() - 1)
187
+
188
+ return imgs, target
189
+
190
+
191
+ def make_coco_transforms(image_set, max_size=640):
192
+ normalize = T.Compose([
193
+ T.ToTensor(),
194
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
195
+ ])
196
+
197
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
198
+
199
+ if image_set == 'train':
200
+ return T.Compose([
201
+ T.RandomHorizontalFlip(),
202
+ T.PhotometricDistort(),
203
+ T.RandomSelect(
204
+ T.Compose([
205
+ T.RandomResize(scales, max_size=max_size),
206
+ T.Check(),
207
+ ]),
208
+ T.Compose([
209
+ T.RandomResize([400, 500, 600]),
210
+ T.RandomSizeCrop(384, 600),
211
+ T.RandomResize(scales, max_size=max_size),
212
+ T.Check(),
213
+ ])
214
+ ),
215
+ normalize,
216
+ ])
217
+
218
+ # we do not use the 'val' set since the annotations are inaccessible
219
+ if image_set == 'val':
220
+ return T.Compose([
221
+ T.RandomResize([360], max_size=640),
222
+ normalize,
223
+ ])
224
+
225
+ raise ValueError(f'unknown {image_set}')
226
+
227
+
228
+ def build(image_set, args):
229
+ root = Path(args.ytvos_path)
230
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
231
+ PATHS = {
232
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
233
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
234
+ }
235
+ img_folder, ann_file = PATHS[image_set]
236
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
237
+ # num_frames=args.num_frames, max_skip=args.max_skip)
238
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
239
+ num_frames=args.num_frames, max_skip=args.max_skip)
240
+ return dataset
241
+
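The per-object binary masks built in `__getitem__` above come from comparing a single palettised annotation frame against each object id. A toy illustration (ids shown as ints for simplicity):

```
# Toy illustration of the (mask == obj_id) decoding used in __getitem__ above.
import numpy as np

ann = np.array([[0, 0, 1, 1],
                [0, 2, 2, 1],
                [0, 2, 2, 0]])          # 0 = background, 1/2 = object ids

for obj_id in (1, 2):
    obj_mask = (ann == obj_id).astype(np.float32)
    print(obj_id, int(obj_mask.sum()))  # object 1 covers 3 pixels, object 2 covers 4
```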
.history/slurm_script/mbench_ref-ytvos_json_20250113182526.sh ADDED
File without changes
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2020 - present, Facebook, Inc
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,214 @@
1
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
2
+ [![Framework](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?&logo=PyTorch&logoColor=white)](https://pytorch.org/)
3
+
4
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/language-as-queries-for-referring-video/referring-expression-segmentation-on-refer-1)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refer-1?p=language-as-queries-for-referring-video)
5
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/language-as-queries-for-referring-video/referring-expression-segmentation-on-a2d)](https://paperswithcode.com/sota/referring-expression-segmentation-on-a2d?p=language-as-queries-for-referring-video)
6
+
7
+ The official implementation of the **CVPR2022** paper:
8
+
9
+ <div align="center">
10
+ <h1>
11
+ <b>
12
+ Language as Queries for Referring <br> Video Object Segmentation
13
+ </b>
14
+ </h1>
15
+ </div>
16
+
17
+ <p align="center"><img src="docs/network.png" width="800"/></p>
18
+
19
+ > [**Language as Queries for Referring Video Object Segmentation**](https://arxiv.org/abs/2201.00487)
20
+ >
21
+ > Jiannan Wu, Yi Jiang, Peize Sun, Zehuan Yuan, Ping Luo
22
+
23
+ ### Abstract
24
+
25
+ In this work, we propose a simple and unified framework built upon Transformer, termed ReferFormer. It views the language as queries and directly attends to the most relevant regions in the video frames. Concretely, we introduce a small set of object queries conditioned on the language as the input to the Transformer. In this manner, all the queries are obligated to find the referred objects only. They are eventually transformed into dynamic kernels which capture the crucial object-level information, and play the role of convolution filters to generate the segmentation masks from feature maps. The object tracking is achieved naturally by linking the corresponding queries across frames. This mechanism greatly simplifies the pipeline and the end-to-end framework is significantly different from the previous methods. Extensive experiments on Ref-Youtube-VOS, Ref-DAVIS17, A2D-Sentences and JHMDB-Sentences show the effectiveness of ReferFormer.
26
+
27
+ ## Update
28
+ - **(2022/12/19)** We add the results on RefCOCO/+/g validation set.
29
+ - **(2022/07/31)** We upload the files for joint-training.
30
+ - **(2022/04/04)** We upload the data conversion and main files for pre-training.
31
+ - **(2022/03/11)** We upload the model on Ref-Youtube-VOS by jointly training Ref-Youtube-VOS and Ref-COCO/+/g, which leads to higher performance.
32
+ - **(2022/03/03)** ReferFormer is accepted by CVPR2022. 👏
33
+
34
+ ## Demo
35
+
36
+ - Ref-DAVIS17
37
+
38
+ <img src="docs/davis_demo1.gif" width="400"/><img src="docs/davis_demo2.gif" width="400"/>
39
+
40
+ - Ref-Youtube-VOS
41
+
42
+ <img src="docs/ytvos_demo1.gif" width="400"/><img src="docs/ytvos_demo2.gif" width="400"/>
43
+
44
+
45
+
46
+ ## Requirements
47
+
48
+ We test the code in the following environments; other versions may also be compatible:
49
+
50
+ - CUDA 11.1
51
+ - Python 3.7
52
+ - Pytorch 1.8.1
53
+
54
+
55
+ ## Installation
56
+
57
+ Please refer to [install.md](docs/install.md) for installation.
58
+
59
+ ## Data Preparation
60
+
61
+ Please refer to [data.md](docs/data.md) for data preparation.
62
+
63
+ We provide the pretrained model for different visual backbones. You may download them [here]([https://drive.google.com/drive/u/0/folders/11_qps3q75aH41IYHlXToyeIBUKkfdqso](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/wjn922_connect_hku_hk/Et657S8tgGRNguj2hf4azsUBn1UVbMNLAmyjcRWGobs2_A?e=xobQFH)) and put them in the directory `pretrained_weights`.
64
+
65
+ <!-- For the Swin Transformer and Video Swin Transformer backbones, the weights are initialized using the pretrained models provided in the repos [Swin-Transformer](https://github.com/microsoft/Swin-Transformer) and [Video-Swin-Transformer](https://github.com/SwinTransformer/Video-Swin-Transformer). For your convenience, we upload the pretrained models to the Google Drive folders [swin_pretrained](https://drive.google.com/drive/u/0/folders/1QWLayukDJYAxTFk7NPwerfso3Lrx35NL) and [video_swin_pretrained](https://drive.google.com/drive/u/0/folders/19qb9VbKSjuwgxsiPI3uv06XzQkB5brYM). -->
66
+
67
+
68
+ After the organization, we expect the directory structure to be the following:
69
+
70
+ ```
71
+ ReferFormer/
72
+ ├── data/
73
+ │ ├── ref-youtube-vos/
74
+ │ ├── ref-davis/
75
+ │ ├── a2d_sentences/
76
+ │ ├── jhmdb_sentences/
77
+ ├── davis2017/
78
+ ├── datasets/
79
+ ├── models/
80
+ ├── scipts/
81
+ ├── tools/
82
+ ├── util/
83
+ ├── pretrained_weights/
84
+ ├── eval_davis.py
85
+ ├── main.py
86
+ ├── engine.py
87
+ ├── inference_ytvos.py
88
+ ├── inference_davis.py
89
+ ├── opts.py
90
+ ...
91
+ ```
92
+
93
+ ## Model Zoo
94
+
95
+ All the models are trained using 8 NVIDIA Tesla V100 GPUs. You may change the `--backbone` parameter to use different backbones (see [here](https://github.com/wjn922/ReferFormer/blob/232b4066fb7d10845e4083e6a5a2cc0af5d1757e/opts.py#L31)).
96
+
97
+ **Note:** If you encounter the `OOM` error, please add the command `--use_checkpoint` (we add this command for Swin-L, Video-Swin-S and Video-Swin-B models).
98
+
99
+
100
+ ### Ref-Youtube-VOS
101
+
102
+ To evaluate the results, please upload the zip file to the [competition server](https://competitions.codalab.org/competitions/29139#participate-submit_results).
103
+
104
+ | Backbone| J&F | CFBI J&F | Pretrain | Model | Submission | CFBI Submission |
105
+ | :----: | :----: | :----: | :----: | :----: | :----: | :----: |
106
+ | ResNet-50 | 55.6 | 59.4 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EepGibYBfyRGt_QedfE9SywBLF3v-bjoxo2R9E9YDqmIcw?e=7J7k1J) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EVRsV76e78lKuekbMLHgwlsBdG09pRVafEuBPN_wKXjJ1Q?e=SMeZlS) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZ8tt46rv4xIjoiUkHGGPjwB1Yi6w2H-9BBVTyINOINmgQ?e=yWbDjp) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZp0dd70UCNGvla2g25lTawB2AZyCDPN7QMl_KeESI5dkQ?e=1BfD2W) |
107
+ | ResNet-101 | 57.3 | 60.3 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESTAK4QCkMdNkVlQz1dd7GoBo3n_i9K4_FK4YLFBAFvBrg?e=Y3PlD5) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EaHNEx5MWR9HjTNh__W3IlYBIfhGd-nHKrshJ-MOyvofdw?e=shM4Ok) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbrNhmt-wiNIv2tmQ-gOupgBrSBzhM1OJlNvid0J_8cPJg?e=8Fgets) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EWSPiUjgmORMuyaL91ueY1oBl159pO4k7RQYF-9eWrSJ-A?e=81hzDF) |
108
+ | Swin-T | 58.7 | 61.2 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESdasB6JLydDrs6mf68FrLMBuQBLBF7y_uxdveWl9oK68w?e=H5zeqk) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUxJmp6QYR5LoUK12Wj55E0Bm0o6_9zl3OvOBN5KE9kJkg?e=SRS0qL) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUMveO7cX1VAq48IAk9c6zoBc_Zy5f1kwa5h6C9q4LYt0A?e=iz9uMg) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcnHrx4S5KVPqFYhr9CCARoBftAxdtldaWyGQAougBFnig?e=KG1LDq) |
109
+ | Swin-L | 62.4 | 63.3 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESngRLeZfV1LtrlZ7x5cVo4BR5_deWfov4Igt28LZGoDew?e=AVAsws) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcCfv66Vl0xDl-rFukByXyQBEFNRTyLeVEKoeWrIvXmjNg?e=GcVTIr) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdI15ujU4UpBilI4wt5lUQQB98JOq6KnMV5GHh77QiAn-w?e=o91ITz) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ETmJUpRGgyFHlGdEhcXqzekBDAfbFTExfHtmA4wHKCOkLw?e=l951Ea) |
110
+ | Video-Swin-T* | 56.0 | - | - | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EYXL3SKYOsRGtfSN-Wr9JCUBDvcXbbp67Sa4hs5dEDplxw?e=g2hGWo) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUosvwAGikhGsyTPEOELMjEBQM-HZOaJ3fqcJjG2SV-5YA?e=vSUD12) | - |
111
+ | Video-Swin-T | 59.4 | - | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUNTvEGXlsdLv3gicAbHfN0Ba23kcyy2-Z15IJTDLXKx_A?e=GqAYxT) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZI2zogC5mtDu3KL5MVIaXIBzG3_3yTthoqyxjfTsGrvzA?e=lT5sVp) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EVKtr-5ZK5NIhhTvaUXGdRcBcHEGahAevUh1YCO2nvFfaQ?e=9Am7dc) | - |
112
+ | Video-Swin-S | 60.1 | - | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Eb015DXX1LhDpiDoojxJTu8BBQ8ACicpVS8gwFStRJDK1w?e=NC368q) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZI2zogC5mtDu3KL5MVIaXIBzG3_3yTthoqyxjfTsGrvzA?e=QEAdwh) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUUJn8Zu7mlCnxLP8eNSbpIBvoEqz88EOg3y9ftQHhAhCw?e=RnSwxX) | - |
113
+ | Video-Swin-B | 62.9 | - |[weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ETDj4aGm_pRMuz8hLBi9Jy0BEFnsco0Uoz5qQEhWrxdNKQ?e=kKImMX) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EduJ_zS-Vd5Hn1qexxv5_mYBKX_8kRBOeX6dlfhED_GSwg?e=TxTWHb) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZKduAM1fLpJrLK7l762xZ8BesK7zWKBjR0b9dFbCWhbfQ?e=SlAdyg) | - |
114
+
115
+ \* indicates the model is trained from scratch.
116
+
117
+ Joint training with Ref-COCO/+/g datasets.
118
+ | Backbone| J&F | J | F | Model | Submission |
119
+ | :----: | :----: | :----: | :----: | :----: | :----: |
120
+ | ResNet-50 | 58.7 | 57.4 | 60.1 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcxDd8USU4BGo_HlgukKiG4BXLvetkjLdi3_-N-3SpjMvw?e=tAPNFv) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EYmORJYVsUJLp8NnbtfnZigBCM-IJ5oomZZrXEbNPhIyww?e=Bh5eYx) |
121
+ | ResNet-101 | 59.3 | 58.1 | 60.4 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EShgDd650nBBsfoNEiUbybcB84Ma5NydxOucISeCrZmzHw?e=YOSszd) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcW6Lt67k0RCjr_FT2XOxVcBUcrFSlFJo19-YdFZpBxOsg?e=avszXt) |
122
+ | Swin-L | 64.2 | 62.3 | 66.2 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Ec_qxgvukuBPr-GQ_3gNcX0B8VCHCqIUvXX-0ydtk1s7HQ?e=7X99M1) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbNV0kBQ7ZVDrfRafG6B3CwBbpM-yMJtQ9jI01HwEgWXBQ?e=FzoSrT) |
123
+ | Video-Swin-T | 62.6 | 59.9 | 63.3 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdCVQzM4HxxIvdZUBLiNpBwBrcPTLlFEqxHVxOzx0geF3A?e=1ZSZvK) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdAT37_CDDZKkbC1U9MDxTYBkR1DVwTn0zxzqEvgrG-5ig?e=6P065H) |
124
+ | Video-Swin-S | 63.3 | 61.4 | 65.2 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdYbp2xp-xFFuolQopvILNMBYRq88ksNjpcv-zKfGzHxbA?e=NqRzTf) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EU6d1rGtkfBFkIoA-xUH2koBwdKW2fPCghYTzzd49KvFLQ?e=FMsJLT) |
125
+ | Video-Swin-B | 64.9 | 62.8 | 67.0 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EY3-adM5LptFj--klo5gWgsBhpSDOps91j-C81sBI8i9Hw?e=n19q0w) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcSdF-jsBmZLn7iUzc3zXTUBnlfnXDFxPP7mtRbC1ttJwg?e=0wzR0t) |
126
+
127
+
128
+
129
+
130
+ ### Ref-DAVIS17
131
+
132
+ As described in the paper, we report the results using the model trained on Ref-Youtube-VOS without finetuning.
133
+
134
+ | Backbone| J&F | J | F | Model |
135
+ | :----: | :----: | :----: | :----: | :----: |
136
+ | ResNet-50 | 58.5 | 55.8 | 61.3 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EVRsV76e78lKuekbMLHgwlsBdG09pRVafEuBPN_wKXjJ1Q?e=SMeZlS) |
137
+ | Swin-L | 60.5 | 57.6 | 63.4 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcCfv66Vl0xDl-rFukByXyQBEFNRTyLeVEKoeWrIvXmjNg?e=GcVTIr) |
138
+ | Video-Swin-B | 61.1 | 58.1 | 64.1 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EduJ_zS-Vd5Hn1qexxv5_mYBKX_8kRBOeX6dlfhED_GSwg?e=TxTWHb) |
139
+
140
+
141
+ ### A2D-Sentences
142
+
143
+ The pretrained models are the same as those provided for Ref-Youtube-VOS.
144
+
145
+ | Backbone| Overall IoU | Mean IoU | mAP | Pretrain | Model |
146
+ | :----: | :----: | :----: | :----: | :----: | :----: |
147
+ | Video-Swin-T* | 72.3 | 64.1 | 48.6 | - | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EfJs5WPRKfxEvifnIO3impABNgydbiO5qqI_uCF6LYKlCQ?e=mSRLCQ) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EVJyHq6zy6ZGuxE--K9nECwB333gFkP9vjXKjh9Mt0otcA?e=Kwnngd) |
148
+ | Video-Swin-T | 77.6 | 69.6 | 52.8 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUNTvEGXlsdLv3gicAbHfN0Ba23kcyy2-Z15IJTDLXKx_A?e=GqAYxT) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Ed3po2mJGQZHivGwMJJg8oMBumXm3Ye94oPH6wfRFK1d8A?e=NG2E9c) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EfO50qMduZNGvFcYJdRVKzABIJ8ZHhMiKWWvmDM14K9mnw?e=dgInSK) |
149
+ | Video-Swin-S | 77.7 | 69.8 | 53.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Eb015DXX1LhDpiDoojxJTu8BBQ8ACicpVS8gwFStRJDK1w?e=NC368q) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbAiydTvu41KsMYBEFzy_d8B0Nyy1fIf2tWG7Ao-FYD0Ug?e=tmaVAu) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EZl6sHhFDTBMgVGKVp18sqwBouTTnwPdirWId4PR6klTfg?e=17lDVV) |
150
+ | Video-Swin-B | 78.6 | 70.3 | 55.0 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ETDj4aGm_pRMuz8hLBi9Jy0BEFnsco0Uoz5qQEhWrxdNKQ?e=kKImMX) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EeP1aneDDbBCo9HnMTbNjsgBpMqrgfIzJzF_jVROpZ2GWQ?e=YmkNHC) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EUnV-O_IAe5Mkyupsd7NosMBxUg8OjqepmQbpbV0PFB4gQ?e=W14suT) |
151
+
152
+ \* the model is trained from scratch and set `--num_frames 6`.
153
+
154
+
155
+ ### JHMDB-Sentences
156
+
157
+ As described in the paper, we report the results using the model trained on A2D-Sentences without finetune.
158
+
159
+ | Backbone| Overall IoU | Mean IoU | mAP | Model |
160
+ | :----: | :----: | :----: | :----: | :----: |
161
+ | Video-Swin-T* | 70.0 | 69.3 | 39.1 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EfJs5WPRKfxEvifnIO3impABNgydbiO5qqI_uCF6LYKlCQ?e=mSRLCQ) |
162
+ | Video-Swin-T | 71.9 | 71.0 | 42.2 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Ed3po2mJGQZHivGwMJJg8oMBumXm3Ye94oPH6wfRFK1d8A?e=NG2E9c) |
163
+ | Video-Swin-S | 72.8 | 71.5 | 42.4 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbAiydTvu41KsMYBEFzy_d8B0Nyy1fIf2tWG7Ao-FYD0Ug?e=tmaVAu) |
164
+ | Video-Swin-B | 73.0 | 71.8 | 43.7 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EeP1aneDDbBCo9HnMTbNjsgBpMqrgfIzJzF_jVROpZ2GWQ?e=YmkNHC) |
165
+
166
+ \* the model is trained from scratch and set `--num_frames 6`.
167
+
168
+
169
+ ### RefCOCO/+/g
170
+
171
+ We also support evaluation on the RefCOCO/+/g validation sets using the pretrained weights (num_frames=1).
172
+ Specifically, we measure the precision@0.5 and overall IoU (oIoU) for the REC and RIS tasks, respectively.
173
+
174
+ REC (referring expression understanding):
175
+
176
+ | Backbone| RefCOCO | RefCOCO+ | RefCOCOg | Model |
177
+ | :----: | :----: | :----: | :----: | :----: |
178
+ | ResNet-50 | 85.0 | 79.2 | 79.0 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EepGibYBfyRGt_QedfE9SywBLF3v-bjoxo2R9E9YDqmIcw?e=7J7k1J) |
179
+ | ResNet-101 | 85.4 | 75.8 | 79.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESTAK4QCkMdNkVlQz1dd7GoBo3n_i9K4_FK4YLFBAFvBrg?e=Y3PlD5) |
180
+ | Swin-T | 86.7 | 77.2 | 80.6 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESdasB6JLydDrs6mf68FrLMBuQBLBF7y_uxdveWl9oK68w?e=H5zeqk) |
181
+ | Swin-L | 89.8 | 80.0 | 83.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESngRLeZfV1LtrlZ7x5cVo4BR5_deWfov4Igt28LZGoDew?e=AVAsws) |
182
+
183
+ RIS (referring image segmentation):
184
+
185
+ | Backbone| RefCOCO | RefCOCO+ | RefCOCOg | Model |
186
+ | :----: | :----: | :----: | :----: | :----: |
187
+ | ResNet-50 | 71.1 | 64.1 | 64.1 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EepGibYBfyRGt_QedfE9SywBLF3v-bjoxo2R9E9YDqmIcw?e=7J7k1J) |
188
+ | ResNet-101 | 71.8 | 61.1 | 64.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESTAK4QCkMdNkVlQz1dd7GoBo3n_i9K4_FK4YLFBAFvBrg?e=Y3PlD5) |
189
+ | Swin-T | 72.9 | 62.4 | 66.1 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESdasB6JLydDrs6mf68FrLMBuQBLBF7y_uxdveWl9oK68w?e=H5zeqk) |
190
+ | Swin-L | 77.1 | 65.8 | 69.3 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESngRLeZfV1LtrlZ7x5cVo4BR5_deWfov4Igt28LZGoDew?e=AVAsws) |
191
+
192
+
193
+ ## Get Started
194
+
195
+ Please see [Ref-Youtube-VOS](docs/Ref-Youtube-VOS.md), [Ref-DAVIS17](docs/Ref-DAVIS17.md), [A2D-Sentences](docs/A2D-Sentences.md) and [JHMDB-Sentences](docs/JHMDB-Sentences.md) for details.
196
+
197
+
198
+
199
+ ## Acknowledgement
200
+
201
+ This repo is based on [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR) and [VisTR](https://github.com/Epiphqny/VisTR). We also refer to the repositories [MDETR](https://github.com/ashkamath/mdetr) and [MTTR](https://github.com/mttr2021/MTTR). Thanks for their wonderful works.
202
+
203
+
204
+ ## Citation
205
+
206
+ ```
207
+ @article{wu2022referformer,
208
+ title={Language as Queries for Referring Video Object Segmentation},
209
+ author={Jiannan Wu and Yi Jiang and Peize Sun and Zehuan Yuan and Ping Luo},
210
+ journal={arXiv preprint arXiv:2201.00487},
211
+ year={2022},
212
+ }
213
+ ```
214
+
davis2017/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from __future__ import absolute_import
2
+
3
+ __version__ = '0.1.0'
davis2017/evaluation.py ADDED
@@ -0,0 +1,110 @@
1
+ import sys
2
+ from tqdm import tqdm
3
+ import warnings
4
+ warnings.filterwarnings("ignore", category=RuntimeWarning)
5
+
6
+ import numpy as np
7
+ from davis2017.davis import DAVIS
8
+ from davis2017.metrics import db_eval_boundary, db_eval_iou
9
+ from davis2017 import utils
10
+ from davis2017.results import Results
11
+ from scipy.optimize import linear_sum_assignment
12
+
13
+
14
+ class DAVISEvaluation(object):
15
+ def __init__(self, davis_root, task, gt_set, sequences='all', codalab=False):
16
+ """
17
+ Class to evaluate DAVIS sequences from a certain set and for a certain task
18
+ :param davis_root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders.
19
+ :param task: Task to compute the evaluation, choose between 'semi-supervised' and 'unsupervised'.
20
+ :param gt_set: Set to compute the evaluation
21
+ :param sequences: Sequences to consider for the evaluation, 'all' to use all the sequences in a set.
22
+ """
23
+ self.davis_root = davis_root
24
+ self.task = task
25
+ self.dataset = DAVIS(root=davis_root, task=task, subset=gt_set, sequences=sequences, codalab=codalab)
26
+
27
+ @staticmethod
28
+ def _evaluate_semisupervised(all_gt_masks, all_res_masks, all_void_masks, metric):
29
+ if all_res_masks.shape[0] > all_gt_masks.shape[0]:
30
+ sys.stdout.write("\nIn your PNG files there is an index higher than the number of objects in the sequence!")
31
+ sys.exit()
32
+ elif all_res_masks.shape[0] < all_gt_masks.shape[0]:
33
+ zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:]))
34
+ all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0)
35
+ j_metrics_res, f_metrics_res = np.zeros(all_gt_masks.shape[:2]), np.zeros(all_gt_masks.shape[:2])
36
+ for ii in range(all_gt_masks.shape[0]):
37
+ if 'J' in metric:
38
+ j_metrics_res[ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks)
39
+ if 'F' in metric:
40
+ f_metrics_res[ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks)
41
+ return j_metrics_res, f_metrics_res
42
+
43
+ @staticmethod
44
+ def _evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric, max_n_proposals=20):
45
+ if all_res_masks.shape[0] > max_n_proposals:
46
+ sys.stdout.write(f"\nIn your PNG files there is an index higher than the maximum number ({max_n_proposals}) of proposals allowed!")
47
+ sys.exit()
48
+ elif all_res_masks.shape[0] < all_gt_masks.shape[0]:
49
+ zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:]))
50
+ all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0)
51
+ j_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1]))
52
+ f_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1]))
53
+ for ii in range(all_gt_masks.shape[0]):
54
+ for jj in range(all_res_masks.shape[0]):
55
+ if 'J' in metric:
56
+ j_metrics_res[jj, ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks)
57
+ if 'F' in metric:
58
+ f_metrics_res[jj, ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks)
59
+ if 'J' in metric and 'F' in metric:
60
+ all_metrics = (np.mean(j_metrics_res, axis=2) + np.mean(f_metrics_res, axis=2)) / 2
61
+ else:
62
+ all_metrics = np.mean(j_metrics_res, axis=2) if 'J' in metric else np.mean(f_metrics_res, axis=2)
63
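+ # Hungarian matching between predicted proposals and ground-truth objects; the metric matrix is negated because linear_sum_assignment minimizes cost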
+ row_ind, col_ind = linear_sum_assignment(-all_metrics)
64
+ return j_metrics_res[row_ind, col_ind, :], f_metrics_res[row_ind, col_ind, :]
65
+
66
+ def evaluate(self, res_path, metric=('J', 'F'), debug=False):
67
+ metric = metric if isinstance(metric, tuple) or isinstance(metric, list) else [metric]
68
+ if 'T' in metric:
69
+ raise ValueError('Temporal metric not supported!')
70
+ if 'J' not in metric and 'F' not in metric:
71
+ raise ValueError('Metric possible values are J for IoU or F for Boundary')
72
+
73
+ # Containers
74
+ metrics_res = {}
75
+ if 'J' in metric:
76
+ metrics_res['J'] = {"M": [], "R": [], "D": [], "M_per_object": {}}
77
+ if 'F' in metric:
78
+ metrics_res['F'] = {"M": [], "R": [], "D": [], "M_per_object": {}}
79
+
80
+ # Sweep all sequences
81
+ results = Results(root_dir=res_path)
82
+ for seq in tqdm(list(self.dataset.get_sequences())):
83
+ all_gt_masks, all_void_masks, all_masks_id = self.dataset.get_all_masks(seq, True)
84
+ if self.task == 'semi-supervised':
85
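+ # the semi-supervised protocol excludes the first and last frames from scoring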
+ all_gt_masks, all_masks_id = all_gt_masks[:, 1:-1, :, :], all_masks_id[1:-1]
86
+ all_res_masks = results.read_masks(seq, all_masks_id)
87
+ if self.task == 'unsupervised':
88
+ j_metrics_res, f_metrics_res = self._evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric)
89
+ elif self.task == 'semi-supervised':
90
+ j_metrics_res, f_metrics_res = self._evaluate_semisupervised(all_gt_masks, all_res_masks, None, metric)
91
+ for ii in range(all_gt_masks.shape[0]):
92
+ seq_name = f'{seq}_{ii+1}'
93
+ if 'J' in metric:
94
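+ # db_statistics summarizes the per-frame scores into mean (M), recall (R) and decay (D)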
+ [JM, JR, JD] = utils.db_statistics(j_metrics_res[ii])
95
+ metrics_res['J']["M"].append(JM)
96
+ metrics_res['J']["R"].append(JR)
97
+ metrics_res['J']["D"].append(JD)
98
+ metrics_res['J']["M_per_object"][seq_name] = JM
99
+ if 'F' in metric:
100
+ [FM, FR, FD] = utils.db_statistics(f_metrics_res[ii])
101
+ metrics_res['F']["M"].append(FM)
102
+ metrics_res['F']["R"].append(FR)
103
+ metrics_res['F']["D"].append(FD)
104
+ metrics_res['F']["M_per_object"][seq_name] = FM
105
+
106
+ # Show progress
107
+ if debug:
108
+ sys.stdout.write(seq + '\n')
109
+ sys.stdout.flush()
110
+ return metrics_res
davis2017/metrics.py ADDED
@@ -0,0 +1,197 @@
1
+ import math
2
+ import numpy as np
3
+ import cv2
4
+
5
+
6
+ def db_eval_iou(annotation, segmentation, void_pixels=None):
7
+ """ Compute region similarity as the Jaccard Index.
8
+ Arguments:
9
+ annotation (ndarray): binary annotation map.
10
+ segmentation (ndarray): binary segmentation map.
11
+ void_pixels (ndarray): optional mask with void pixels
12
+
13
+ Return:
14
+ jaccard (float): region similarity
15
+ """
16
+ assert annotation.shape == segmentation.shape, \
17
+ f'Annotation({annotation.shape}) and segmentation:{segmentation.shape} dimensions do not match.'
18
+ annotation = annotation.astype(np.bool)
19
+ segmentation = segmentation.astype(np.bool)
20
+
21
+ if void_pixels is not None:
22
+ assert annotation.shape == void_pixels.shape, \
23
+ f'Annotation({annotation.shape}) and void pixels:{void_pixels.shape} dimensions do not match.'
24
+ void_pixels = void_pixels.astype(np.bool)
25
+ else:
26
+ void_pixels = np.zeros_like(segmentation)
27
+
28
+ # Intersection between all sets
29
+ inters = np.sum((segmentation & annotation) & np.logical_not(void_pixels), axis=(-2, -1))
30
+ union = np.sum((segmentation | annotation) & np.logical_not(void_pixels), axis=(-2, -1))
31
+
32
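+ # an empty union (both masks empty) is treated as a perfect match (J = 1) below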
+ j = inters / union
33
+ if j.ndim == 0:
34
+ j = 1 if np.isclose(union, 0) else j
35
+ else:
36
+ j[np.isclose(union, 0)] = 1
37
+ return j
38
+
39
+
40
+ def db_eval_boundary(annotation, segmentation, void_pixels=None, bound_th=0.008):
41
+ assert annotation.shape == segmentation.shape
42
+ if void_pixels is not None:
43
+ assert annotation.shape == void_pixels.shape
44
+ if annotation.ndim == 3:
45
+ n_frames = annotation.shape[0]
46
+ f_res = np.zeros(n_frames)
47
+ for frame_id in range(n_frames):
48
+ void_pixels_frame = None if void_pixels is None else void_pixels[frame_id, :, :, ]
49
+ f_res[frame_id] = f_measure(segmentation[frame_id, :, :, ], annotation[frame_id, :, :], void_pixels_frame, bound_th=bound_th)
50
+ elif annotation.ndim == 2:
51
+ f_res = f_measure(segmentation, annotation, void_pixels, bound_th=bound_th)
52
+ else:
53
+ raise ValueError(f'db_eval_boundary does not support tensors with {annotation.ndim} dimensions')
54
+ return f_res
55
+
56
+
57
+ def f_measure(foreground_mask, gt_mask, void_pixels=None, bound_th=0.008):
58
+ """
59
+ Compute the boundary F-measure for a single frame.
60
+ Calculates precision/recall for boundaries between foreground_mask and
61
+ gt_mask using morphological operators to speed it up.
62
+
63
+ Arguments:
64
+ foreground_mask (ndarray): binary segmentation image.
65
+ gt_mask (ndarray): binary annotated image.
66
+ void_pixels (ndarray): optional mask with void pixels
67
+
68
+ Returns:
69
+ F (float): boundaries F-measure
70
+ """
71
+ assert np.atleast_3d(foreground_mask).shape[2] == 1
72
+ if void_pixels is not None:
73
+ void_pixels = void_pixels.astype(np.bool)
74
+ else:
75
+ void_pixels = np.zeros_like(foreground_mask).astype(np.bool)
76
+
77
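+ # boundary tolerance in pixels: bound_th is either an absolute count (>= 1) or a fraction of the image diagonal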
+ bound_pix = bound_th if bound_th >= 1 else \
78
+ np.ceil(bound_th * np.linalg.norm(foreground_mask.shape))
79
+
80
+ # Get the pixel boundaries of both masks
81
+ fg_boundary = _seg2bmap(foreground_mask * np.logical_not(void_pixels))
82
+ gt_boundary = _seg2bmap(gt_mask * np.logical_not(void_pixels))
83
+
84
+ from skimage.morphology import disk
85
+
86
+ # fg_dil = binary_dilation(fg_boundary, disk(bound_pix))
87
+ fg_dil = cv2.dilate(fg_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8))
88
+ # gt_dil = binary_dilation(gt_boundary, disk(bound_pix))
89
+ gt_dil = cv2.dilate(gt_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8))
90
+
91
+ # Get the intersection
92
+ gt_match = gt_boundary * fg_dil
93
+ fg_match = fg_boundary * gt_dil
94
+
95
+ # Area of the intersection
96
+ n_fg = np.sum(fg_boundary)
97
+ n_gt = np.sum(gt_boundary)
98
+
99
+ # % Compute precision and recall
100
+ if n_fg == 0 and n_gt > 0:
101
+ precision = 1
102
+ recall = 0
103
+ elif n_fg > 0 and n_gt == 0:
104
+ precision = 0
105
+ recall = 1
106
+ elif n_fg == 0 and n_gt == 0:
107
+ precision = 1
108
+ recall = 1
109
+ else:
110
+ precision = np.sum(fg_match) / float(n_fg)
111
+ recall = np.sum(gt_match) / float(n_gt)
112
+
113
+ # Compute F measure
114
+ if precision + recall == 0:
115
+ F = 0
116
+ else:
117
+ F = 2 * precision * recall / (precision + recall)
118
+
119
+ return F
120
+
121
+
122
+ def _seg2bmap(seg, width=None, height=None):
123
+ """
124
+ From a segmentation, compute a binary boundary map with 1 pixel wide
125
+ boundaries. The boundary pixels are offset by 1/2 pixel towards the
126
+ origin from the actual segment boundary.
127
+ Arguments:
128
+ seg : Segments labeled from 1..k.
129
+ width : Width of desired bmap <= seg.shape[1]
130
+ height : Height of desired bmap <= seg.shape[0]
131
+ Returns:
132
+ bmap (ndarray): Binary boundary map.
133
+ David Martin <[email protected]>
134
+ January 2003
135
+ """
136
+
137
+ seg = seg.astype(np.bool)
138
+ seg[seg > 0] = 1
139
+
140
+ assert np.atleast_3d(seg).shape[2] == 1
141
+
142
+ width = seg.shape[1] if width is None else width
143
+ height = seg.shape[0] if height is None else height
144
+
145
+ h, w = seg.shape[:2]
146
+
147
+ ar1 = float(width) / float(height)
148
+ ar2 = float(w) / float(h)
149
+
150
+ assert not (
151
+ (width > w) or (height > h) or (abs(ar1 - ar2) > 0.01)
152
+ ), "Can't convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
153
+
154
+ e = np.zeros_like(seg)
155
+ s = np.zeros_like(seg)
156
+ se = np.zeros_like(seg)
157
+
158
+ e[:, :-1] = seg[:, 1:]
159
+ s[:-1, :] = seg[1:, :]
160
+ se[:-1, :-1] = seg[1:, 1:]
161
+
162
+ b = seg ^ e | seg ^ s | seg ^ se
163
+ b[-1, :] = seg[-1, :] ^ e[-1, :]
164
+ b[:, -1] = seg[:, -1] ^ s[:, -1]
165
+ b[-1, -1] = 0
166
+
167
+ if w == width and h == height:
168
+ bmap = b
169
+ else:
170
+ bmap = np.zeros((height, width))
171
+ for x in range(w):
172
+ for y in range(h):
173
+ if b[y, x]:
174
+ j = 1 + math.floor((y - 1) + height / h)
175
+ i = 1 + math.floor((x - 1) + width / h)
176
+ bmap[j, i] = 1
177
+
178
+ return bmap
179
+
180
+
181
+ if __name__ == '__main__':
182
+ from davis2017.davis import DAVIS
183
+ from davis2017.results import Results
184
+
185
+ dataset = DAVIS(root='input_dir/ref', subset='val', sequences='aerobatics')
186
+ results = Results(root_dir='examples/osvos')
187
+ # Test timing F measure
188
+ for seq in dataset.get_sequences():
189
+ all_gt_masks, _, all_masks_id = dataset.get_all_masks(seq, True)
190
+ all_gt_masks, all_masks_id = all_gt_masks[:, 1:-1, :, :], all_masks_id[1:-1]
191
+ all_res_masks = results.read_masks(seq, all_masks_id)
192
+ f_metrics_res = np.zeros(all_gt_masks.shape[:2])
193
+ for ii in range(all_gt_masks.shape[0]):
194
+ f_metrics_res[ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[ii, ...])
195
+
196
+ # Run using to profile code: python -m cProfile -o f_measure.prof metrics.py
197
+ # snakeviz f_measure.prof
docs/A2D-Sentences.md ADDED
@@ -0,0 +1,55 @@
1
+ ## A2D-Sentences
2
+
3
+ ### Model Zoo
4
+
5
+ The pretrained models are the same as those provided for Ref-Youtube-VOS.
6
+
7
+ | Backbone| Overall IoU | Mean IoU | mAP | Pretrain | Model |
8
+ | :----: | :----: | :----: | :----: | :----: | :----: |
9
+ | Video-Swin-T* | 72.3 | 64.1 | 48.6 | - | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) \| [log](https://drive.google.com/file/d/1JhsXgcWOYv97u6tpAUnBi9-D3mxcHXzO/view?usp=sharing) |
10
+ | Video-Swin-T | 77.6 | 69.6 | 52.8 | [weight](https://drive.google.com/file/d/1g9Dm1vLdwpwSKVtIZzWKPUk2-zK3IbQa/view?usp=sharing) | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) \| [log](https://drive.google.com/file/d/1xjevouL3a1gHZN5KHtA07Cpa07R4T1Qi/view?usp=sharing) |
11
+ | Video-Swin-S | 77.7 | 69.8 | 53.9 | [weight](https://drive.google.com/file/d/1GrhFhsUidsVs7-dhY8NkVgWfBZdeit9C/view?usp=sharing) | [model](https://drive.google.com/file/d/1ng2FAX9J4FyQ7Bq1eeQC9Vvv1W8JZmek/view?usp=sharing) \| [log](https://drive.google.com/file/d/1Uu72THexbtEje4aKXR7Q2Yd4zyPmQsi3/view?usp=sharing) |
12
+ | Video-Swin-B | 78.6 | 70.3 | 55.0 | [weight](https://drive.google.com/file/d/1MJ1362zjqu-uZdXsSQH6pI1QOFqwv5lY/view?usp=sharing) | [model](https://drive.google.com/file/d/1WlNjKS_Li-1KoUzuPM4MRM4b-oK2Ka7c/view?usp=sharing) \| [log](https://drive.google.com/file/d/1tH-f9_U0gY-iNfXm6GRyttJp3uvm5NQw/view?usp=sharing) |
13
+
14
+ \* the model is trained from scratch with `--num_frames 6`.
15
+
16
+
17
+ ### Inference & Evaluation
18
+
19
+ ```
20
+ python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --batch_size 2 --resume [/path/to/model_weight] --backbone [backbone] --eval
21
+ ```
22
+
23
+ For example, to evaluate the Video-Swin-Tiny model, run the following command:
24
+
25
+ ```
26
+ python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --batch_size 2 --resume a2d_video_swin_tiny.pth --backbone video_swin_t_p4w7 --eval
27
+ ```
28
+
29
+ ### Training
30
+
31
+ - Finetune
32
+
33
+ ```
34
+ ./scripts/dist_train_a2d.sh [/path/to/output_dir] [/path/to/pretrained_weight] --backbone [backbone]
35
+ ```
36
+
37
+ For example, to train the Video-Swin-Tiny model, run the following command:
38
+ ```
39
+ ./scripts/dist_train_a2d.sh a2d_dirs/video_swin_tiny pretrained_weights/video_swin_tiny_pretrained.pth --backbone video_swin_t_p4w7
40
+ ```
41
+
42
+ - Train from scratch
43
+
44
+ ```
45
+ python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --epochs 12 --lr_drop 8 10 --dropout 0 --weight_decay 1e-4 --output_dir=[/path/to/output_dir] --backbone [backbone] --backbone_pretrained [/path/to/pretrained backbone weight] [other args]
46
+ ```
47
+
48
+ For example, to train the Video-Swin-Tiny model from scratch with `--num_frames 6`, run the following command:
49
+
50
+ ```
51
+ python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --epochs 12 --lr_drop 8 10 --dropout 0 --weight_decay 1e-4 --output_dir a2d_dirs/video_swin_tiny_scratch_frame6 --backbone video_swin_t_p4w7 --backbone_pretrained video_swin_pretrained/swin_tiny_patch244_window877_kinetics400_1k.pth --num_frames 6
52
+ ```
53
+
54
+
55
+
docs/JHMDB-Sentences.md ADDED
@@ -0,0 +1,27 @@
1
+ ## JHMDB-Sentences
2
+
3
+ ### Model Zoo
4
+
5
+ As described in the paper, we report the results using the model trained on A2D-Sentences without finetuning.
6
+
7
+ | Backbone| Overall IoU | Mean IoU | mAP | Model |
8
+ | :----: | :----: | :----: | :----: | :----: |
9
+ | Video-Swin-T* | 70.0 | 69.3 | 39.1 | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) |
10
+ | Video-Swin-T | 71.9 | 71.0 | 42.2 | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) |
11
+ | Video-Swin-S | 72.8 | 71.5 | 42.4 | [model](https://drive.google.com/file/d/1ng2FAX9J4FyQ7Bq1eeQC9Vvv1W8JZmek/view?usp=sharing) |
12
+ | Video-Swin-B | 73.0 | 71.8 | 43.7 | [model](https://drive.google.com/file/d/1WlNjKS_Li-1KoUzuPM4MRM4b-oK2Ka7c/view?usp=sharing) |
13
+
14
+ \* the model is trained from scratch with `--num_frames 6`.
15
+
16
+
17
+ ### Inference & Evaluation
18
+
19
+ ```
20
+ python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file jhmdb --with_box_refine --freeze_text_encoder --batch_size 2 --resume [/path/to/model_weight] --backbone [backbone] --eval
21
+ ```
22
+
23
+ For example, to evaluate the Video-Swin-Tiny model, run the following command:
24
+
25
+ ```
26
+ python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file jhmdb --with_box_refine --freeze_text_encoder --batch_size 2 --resume a2d_video_swin_tiny.pth --backbone video_swin_t_p4w7 --eval
27
+ ```
docs/Ref-DAVIS17.md ADDED
@@ -0,0 +1,24 @@
1
+ ## Ref-DAVIS17
2
+
3
+ ### Model Zoo
4
+
5
+ As described in the paper, we report the results using the model trained on Ref-Youtube-VOS without finetuning.
6
+
7
+ | Backbone| J&F | J | F | Model |
8
+ | :----: | :----: | :----: | :----: | :----: |
9
+ | ResNet-50 | 58.5 | 55.8 | 61.3 | [model](https://drive.google.com/file/d/1VKYIbd3tiuLyWkh7ajnIiA3HZ3_IdvxV/view?usp=sharing) |
10
+ | Swin-L | 60.5 | 57.6 | 63.4 | [model](https://drive.google.com/file/d/1_uwwlWv8AXhHfE8GVId7YtGraznRebaZ/view?usp=sharing) |
11
+ | Video-Swin-B | 61.1 | 58.1 | 64.1 | [model](https://drive.google.com/file/d/1nw7D3C_RrKTMzwtzjo39snbYLbv73anH/view?usp=sharing) |
12
+
13
+
14
+ ### Inference & Evaluation
15
+
16
+ ```
17
+ ./scripts/dist_test_davis.sh [/path/to/output_dir] [/path/to/model_weight] --backbone [backbone]
18
+ ```
19
+
20
+ For example, to evaluate the Swin-Large model, run the following command:
21
+
22
+ ```
23
+ ./scripts/dist_test_davis.sh davis_dirs/swin_large ytvos_swin_large.pth --backbone swin_l_p4w7
24
+ ```
docs/Ref-Youtube-VOS.md ADDED
@@ -0,0 +1,83 @@
1
+ ## Ref-Youtube-VOS
2
+
3
+ ### Model Zoo
4
+
5
+ To evaluate the results, please upload the zip file to the [competition server](https://competitions.codalab.org/competitions/29139#participate-submit_results).
6
+
7
+ | Backbone| J&F | CFBI J&F | Pretrain | Model | Submission | CFBI Submission |
8
+ | :----: | :----: | :----: | :----: | :----: | :----: | :----: |
9
+ | ResNet-50 | 55.6 | 59.4 | [weight](https://drive.google.com/file/d/1mJd5zBUv4EYLOKQ0H87-NeAuInyrn577/view?usp=sharing) | [model](https://drive.google.com/file/d/1VKYIbd3tiuLyWkh7ajnIiA3HZ3_IdvxV/view?usp=sharing) | [link](https://drive.google.com/file/d/1IXKu8a06ppPAVBvy4Y0UfcKhCat4HRJt/view?usp=sharing) | [link](https://drive.google.com/file/d/1VJAKZ_j7kQFpocv_vDzER47CXWwAAE8h/view?usp=sharing) |
10
+ | ResNet-101 | 57.3 | 60.3 | [weight](https://drive.google.com/file/d/1EMOwwAygdSfTZiVxI4f0UaVd7P6JzmuM/view?usp=sharing) | [model](https://drive.google.com/file/d/1FCHAAMf-HXPhZGTZp748l3pn6FfMyV1L/view?usp=sharing) | [link](https://drive.google.com/file/d/1cFxjVW2RlwjoVYR1M6NlkRpv9L3tPlcZ/view?usp=sharing) | [link](https://drive.google.com/file/d/1RPnFPqf7iiVypc7QbN-ev6s6xfmD-m5c/view?usp=sharing) |
11
+ | Swin-T | 58.7 | 61.2 | [weight](https://drive.google.com/file/d/155sZm6yE7YQ8Y8Ln0ShaVZKLejYORqTQ/view?usp=sharing) | [model](https://drive.google.com/file/d/19jIbjRRUGDhfnI604Pw7hcGP5DqdvVtl/view?usp=sharing) | [link](https://drive.google.com/file/d/1eZZ-2zz0gdCwPrislGP3WKAHk-RnNY7v/view?usp=sharing) | [link](https://drive.google.com/file/d/1O9B35oieBfo7sRjxTpSyFz52J2AAHLce/view?usp=sharing) |
12
+ | Swin-L | 62.4 | 63.3 | [weight](https://drive.google.com/file/d/1eJKNHvk_KcFuT4k6Te7HDuuSXH2DVOY5/view?usp=sharing) | [model](https://drive.google.com/file/d/1_uwwlWv8AXhHfE8GVId7YtGraznRebaZ/view?usp=sharing) | [link](https://drive.google.com/file/d/1uxBwbKdlilaCNt-RbdcPj1LshA-WY9Q6/view?usp=sharing) | [link](https://drive.google.com/file/d/16kVmJzv5oXzk3zGcfMcb2sEiN6HTOCmW/view?usp=sharing) |
13
+ | Video-Swin-T* | 55.8 | - | - | [model](https://drive.google.com/file/d/1vNiQGpKuYfR7F7YKZK7H2HAzljDf9Wuf/view?usp=sharing) | [link](https://drive.google.com/file/d/18G0qIeZndacj3Y0EuyJsZFeFRWJ0_3O_/view?usp=sharing) | - |
14
+ | Video-Swin-T | 59.4 | - | [weight](https://drive.google.com/file/d/1g9Dm1vLdwpwSKVtIZzWKPUk2-zK3IbQa/view?usp=sharing) | [model](https://drive.google.com/file/d/17RL6o_A57giHT-bMuP7ysUGogueT7wYm/view?usp=sharing) | [link](https://drive.google.com/file/d/1nhjvDWgMWufMGAjOKesgyLRB_-Ct6kXP/view?usp=sharing) | - |
15
+ | Video-Swin-S | 60.1 | - | [weight](https://drive.google.com/file/d/1GrhFhsUidsVs7-dhY8NkVgWfBZdeit9C/view?usp=sharing) | [model](https://drive.google.com/file/d/1GrhFhsUidsVs7-dhY8NkVgWfBZdeit9C/view?usp=sharing) | [link](https://drive.google.com/file/d/1mhb0UAaJkTFYmGrwXHHJuaXVp-0BSkgm/view?usp=sharing) | - |
16
+ | Video-Swin-B | 62.9 | - |[weight](https://drive.google.com/file/d/1MJ1362zjqu-uZdXsSQH6pI1QOFqwv5lY/view?usp=sharing) | [model](https://drive.google.com/file/d/1nw7D3C_RrKTMzwtzjo39snbYLbv73anH/view?usp=sharing) | [link](https://drive.google.com/file/d/1dAQdr2RqCxYUmOVQ4jFE-vv5zavNhz7B/view?usp=sharing) | - |
17
+
18
+ \* indicates the model is trained from scratch.
19
+
20
+
21
+ Joint training with Ref-COCO/+/g datasets.
22
+ | Backbone| J&F | J | F | Model | Submission |
23
+ | :----: | :----: | :----: | :----: | :----: | :----: |
24
+ | ResNet-50 | 58.7 | 57.4 | 60.1 | [model](https://drive.google.com/file/d/1tXgC_GRmQCvHjhlNoT0uXc_0oQ21d0hk/view?usp=sharing) | [link](https://drive.google.com/file/d/1Vbrl11mBfjwpM-H4DOleyD1i2STCN-SM/view?usp=sharing) |
25
+ | ResNet-101 | 59.3 | 58.1 | 60.4 | [model](https://drive.google.com/file/d/1LUflgRgwZgTpYr5V9qeDKTIlBjLqHOVj/view?usp=sharing) | [link](https://drive.google.com/file/d/1BANQcqY34SebORZ9_PTF4C-QWuCJl2_W/view?usp=sharing) |
26
+ | Swin-L | 64.2 | 62.3 | 66.2 | [model](https://drive.google.com/file/d/1JeppEr8m0O9844xncSfSZrYE_NH8oXb7/view?usp=sharing) | [link](https://drive.google.com/file/d/14klluhPeQhhNKl3EBibtiziChSKfBHU0/view?usp=sharing) |
27
+ | Video-Swin-T | 62.6 | 59.9 | 63.3 | [model](https://drive.google.com/file/d/1rVO2ZC4U4symSh9Ifgg68YGdYBZH00MT/view?usp=sharing) | [link](https://drive.google.com/file/d/1-i67hTmo-qpyICbJ9vbTeQdPaL2VnbXQ/view?usp=sharing) |
28
+ | Video-Swin-S | 63.3 | 61.4 | 65.2 | [model](https://drive.google.com/file/d/15ifI2yd9oDqMB05DgjhNVMe2MGXVvZnj/view?usp=sharing) | [link](https://drive.google.com/file/d/1II1gZl99FGECkS7DR6B8MszxAKadu-9y/view?usp=sharing) |
29
+ | Video-Swin-B | 64.9 | 62.8 | 67.0 | [model](https://drive.google.com/file/d/19XO5VoR6qTE3VNLF-IjYzabL-2tb9E14/view?usp=sharing) | [link](https://drive.google.com/file/d/11FTV-B3MkWfl4azNI-aRmiRqQ9TBXG03/view?usp=sharing) |
30
+
31
+ ### Inference & Evaluation
32
+
33
+
34
+ First, run inference with the trained model.
35
+
36
+ ```
37
+ python3 inference_ytvos.py --with_box_refine --binary --freeze_text_encoder --output_dir=[/path/to/output_dir] --resume=[/path/to/model_weight] --backbone [backbone]
38
+ ```
39
+
+ For example, for the Swin-Tiny model:
+
40
+ ```
41
+ python3 inference_ytvos.py --with_box_refine --binary --freeze_text_encoder --output_dir=ytvos_dirs/swin_tiny --resume=ytvos_swin_tiny.pth --backbone swin_t_p4w7
42
+ ```
43
+
44
+ If you want to visualize the predicted masks, you may add `--visualize` to the above command.
45
+
46
+ Then, enter the `output_dir`, rename the folder `valid` as `Annotations`. Use the following command to zip the folder:
47
+
48
+ ```
49
+ zip -q -r submission.zip Annotations
50
+ ```
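+
+ If you prefer to script this packaging step, here is a minimal Python sketch (the `output_dir` value below is a placeholder for your own path):
+
+ ```
+ import os
+ import shutil
+
+ output_dir = "ytvos_dirs/video_swin_tiny"  # placeholder: your inference output_dir
+ # rename valid -> Annotations as required by the server
+ os.rename(os.path.join(output_dir, "valid"), os.path.join(output_dir, "Annotations"))
+ # create submission.zip with Annotations/ at the archive root
+ shutil.make_archive("submission", "zip", root_dir=output_dir, base_dir="Annotations")
+ ```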
51
+
52
+ To evaluate the results, please upload the zip file to the [competition server](https://competitions.codalab.org/competitions/29139#participate-submit_results).
53
+
54
+ ### Training
55
+
56
+
57
+ - Finetune
58
+
59
+ The following command includes the training and inference stages.
60
+
61
+ ```
62
+ ./scripts/dist_train_test_ytvos.sh [/path/to/output_dir] [/path/to/pretrained_weight] --backbone [backbone]
63
+ ```
64
+
65
+ For example, to train the Video-Swin-Tiny model, run the following command:
66
+
67
+ ```
68
+ ./scripts/dist_train_test_ytvos.sh ytvos_dirs/video_swin_tiny pretrained_weights/video_swin_tiny_pretrained.pth --backbone video_swin_t_p4w7
69
+ ```
70
+
71
+ - Train from scratch
72
+
73
+ The following command includes the training and inference stages.
74
+
75
+ ```
76
+ ./scripts/dist_train_test_ytvos_scratch.sh [/path/to/output_dir] --backbone [backbone] --backbone_pretrained [/path/to/backbone_pretrained_weight] [other args]
77
+ ```
78
+
79
+ For example, to train the Video-Swin-Tiny model from scratch, run the following command:
80
+
81
+ ```
82
+ ./scripts/dist_train_test_ytvos_scratch.sh ytvos_dirs/video_swin_tiny_scratch --backbone video_swin_t_p4w7 --backbone_pretrained video_swin_pretrained/swin_tiny_patch244_window877_kinetics400_1k.pth
83
+ ```
docs/data.md ADDED
@@ -0,0 +1,127 @@
1
+ # Data Preparation
2
+
3
+ Create a new directory `data` to store all the datasets.
4
+
5
+ ## Ref-COCO
6
+
7
+ Download the dataset from the official website [COCO](https://cocodataset.org/#download).
8
+ RefCOCO/+/g use the COCO2014 train split.
9
+ Download the annotation files from [github](https://github.com/lichengunc/refer).
10
+
11
+ Convert the annotation files:
12
+
13
+ ```
14
+ python3 tools/data/convert_refexp_to_coco.py
15
+ ```
16
+
17
+ Finally, we expect the directory structure to be the following:
18
+
19
+ ```
20
+ ReferFormer
21
+ ├── data
22
+ │ ├── coco
23
+ │ │ ├── train2014
24
+ │ │ ├── refcoco
25
+ │ │ │ ├── instances_refcoco_train.json
26
+ │ │ │ ├── instances_refcoco_val.json
27
+ │ │ ├── refcoco+
28
+ │ │ │ ├── instances_refcoco+_train.json
29
+ │ │ │ ├── instances_refcoco+_val.json
30
+ │ │ ├── refcocog
31
+ │ │ │ ├── instances_refcocog_train.json
32
+ │ │ │ ├── instances_refcocog_val.json
33
+ ```
34
+
35
+
36
+ ## Ref-Youtube-VOS
37
+
38
+ Download the dataset from the competition's website [here](https://competitions.codalab.org/competitions/29139#participate-get_data).
39
+ Then, extract and organize the file. We expect the directory structure to be the following:
40
+
41
+ ```
42
+ ReferFormer
43
+ ├── data
44
+ │ ├── ref-youtube-vos
45
+ │ │ ├── meta_expressions
46
+ │ │ ├── train
47
+ │ │ │ ├── JPEGImages
48
+ │ │ │ ├── Annotations
49
+ │ │ │ ├── meta.json
50
+ │ │ ├── valid
51
+ │ │ │ ├── JPEGImages
52
+ ```
53
+
54
+ ## Ref-DAVIS17
55
+
56
+ Download the DAVIS2017 dataset from the [website](https://davischallenge.org/davis2017/code.html). Note that you only need to download the two zip files `DAVIS-2017-Unsupervised-trainval-480p.zip` and `DAVIS-2017_semantics-480p.zip`.
57
+ Download the text annotations from the [website](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/video-segmentation/video-object-segmentation-with-language-referring-expressions).
58
+ Then, put the zip files in the directory as follows.
59
+
60
+
61
+ ```
62
+ ReferFormer
63
+ ├── data
64
+ │ ├── ref-davis
65
+ │ │ ├── DAVIS-2017_semantics-480p.zip
66
+ │ │ ├── DAVIS-2017-Unsupervised-trainval-480p.zip
67
+ │ │ ├── davis_text_annotations.zip
68
+ ```
69
+
70
+ Unzip these zip files.
71
+ ```
72
+ unzip -o davis_text_annotations.zip
73
+ unzip -o DAVIS-2017_semantics-480p.zip
74
+ unzip -o DAVIS-2017-Unsupervised-trainval-480p.zip
75
+ ```
76
+
77
+ Preprocess the dataset into the Ref-Youtube-VOS format (make sure you are in the main directory):
78
+
79
+ ```
80
+ python tools/data/convert_davis_to_ytvos.py
81
+ ```
82
+
83
+ Finally, unzip `DAVIS-2017-Unsupervised-trainval-480p.zip` again (the preprocessing script moves files with `mv` for efficiency).
84
+
85
+ ```
86
+ unzip -o DAVIS-2017-Unsupervised-trainval-480p.zip
87
+ ```
88
+
89
+
90
+
91
+
92
+ ## A2D-Sentences
93
+
94
+ Follow the instructions and download the dataset from the website [here](https://kgavrilyuk.github.io/publication/actor_action/).
95
+ Then, extract the files. Additionally, we use the same json annotation files generated by [MTTR](https://github.com/mttr2021/MTTR). Please download these files from [onedrive](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/wjn922_connect_hku_hk/EnvcpWsMsY5NrMF5If3F6DwBseMrqmzQwpTtL8HXoLAChw?e=Vlv1et).
96
+ We expect the directory structure to be the following:
97
+
98
+ ```
99
+ ReferFormer
100
+ ├── data
101
+ │ ├── a2d_sentences
102
+ │ │ ├── Release
103
+ │ │ ├── text_annotations
104
+ │ │ │ ├── a2d_annotation_with_instances
105
+ │ │ │ ├── a2d_annotation.txt
106
+ │ │ │ ├── a2d_missed_videos.txt
107
+ │ │ ├── a2d_sentences_single_frame_test_annotations.json
108
+ │ │ ├── a2d_sentences_single_frame_train_annotations.json
109
+ │ │ ├── a2d_sentences_test_annotations_in_coco_format.json
110
+ ```
111
+
112
+ ## JHMDB-Sentences
113
+
114
+ Follow the instructions and download the dataset from the website [here](https://kgavrilyuk.github.io/publication/actor_action/).
115
+ Then, extract the files. Additionally, we use the same json annotation files generated by [MTTR](https://github.com/mttr2021/MTTR). Please download these files from [onedrive](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/wjn922_connect_hku_hk/EjPyzXq93s5Jm4GU07JrWIMBb6nObY8fEmLyuiGg-0uBtg?e=GsZ6jP).
116
+ We expect the directory structure to be the following:
117
+
118
+ ```
119
+ ReferFormer
120
+ ├── data
121
+ │ ├── jhmdb_sentences
122
+ │ │ ├── Rename_Images
123
+ │ │ ├── puppet_mask
124
+ │ │ ├── jhmdb_annotation.txt
125
+ │ │ ├── jhmdb_sentences_samples_metadata.json
126
+ │ │ ├── jhmdb_sentences_gt_annotations_in_coco_format.json
127
+ ```
engine.py ADDED
@@ -0,0 +1,253 @@
1
+ """
2
+ Train and eval functions used in main.py
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ import math
6
+ from models import postprocessors
7
+ import os
8
+ import sys
9
+ from typing import Iterable
10
+
11
+ import torch
12
+ import torch.distributed as dist
13
+
14
+ import util.misc as utils
15
+ from datasets.coco_eval import CocoEvaluator
16
+ from datasets.refexp_eval import RefExpEvaluator
17
+
18
+ from pycocotools.coco import COCO
19
+ from pycocotools.cocoeval import COCOeval
20
+ from datasets.a2d_eval import calculate_precision_at_k_and_iou_metrics, calculate_bbox_precision_at_k_and_iou_metrics
21
+
22
+ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
23
+ data_loader: Iterable, optimizer: torch.optim.Optimizer,
24
+ device: torch.device, epoch: int, max_norm: float = 0):
25
+ model.train()
26
+ criterion.train()
27
+ metric_logger = utils.MetricLogger(delimiter=" ")
28
+ metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
29
+ header = 'Epoch: [{}]'.format(epoch)
30
+ print_freq = 10
31
+ for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
32
+ samples = samples.to(device)
33
+ captions = [t["caption"] for t in targets]
34
+ targets = utils.targets_to(targets, device)
35
+
36
+ outputs = model(samples, captions, targets)
37
+ loss_dict = criterion(outputs, targets)
38
+
39
+ weight_dict = criterion.weight_dict
40
+ losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
41
+
42
+ # reduce losses over all GPUs for logging purposes
43
+ loss_dict_reduced = utils.reduce_dict(loss_dict)
44
+ loss_dict_reduced_unscaled = {f'{k}_unscaled': v
45
+ for k, v in loss_dict_reduced.items()}
46
+ loss_dict_reduced_scaled = {k: v * weight_dict[k]
47
+ for k, v in loss_dict_reduced.items() if k in weight_dict}
48
+ losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
49
+
50
+ loss_value = losses_reduced_scaled.item()
51
+
52
+ if not math.isfinite(loss_value):
53
+ print("Loss is {}, stopping training".format(loss_value))
54
+ print(loss_dict_reduced)
55
+ sys.exit(1)
56
+ optimizer.zero_grad()
57
+ losses.backward()
58
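+ # clip gradients when max_norm > 0; otherwise only compute the total norm for logging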
+ if max_norm > 0:
59
+ grad_total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
60
+ else:
61
+ grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm)
62
+ optimizer.step()
63
+
64
+ metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)
65
+ metric_logger.update(lr=optimizer.param_groups[0]["lr"])
66
+ metric_logger.update(grad_norm=grad_total_norm)
67
+
68
+ # gather the stats from all processes
69
+ metric_logger.synchronize_between_processes()
70
+ print("Averaged stats:", metric_logger)
71
+ return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
72
+
73
+
74
+ @torch.no_grad()
75
+ def evaluate(model, criterion, postprocessors, data_loader, evaluator_list, device, args):
76
+ model.eval()
77
+ criterion.eval()
78
+
79
+ metric_logger = utils.MetricLogger(delimiter=" ")
80
+ header = 'Test:'
81
+
82
+ predictions = []
83
+ for samples, targets in metric_logger.log_every(data_loader, 10, header):
84
+ dataset_name = targets[0]["dataset_name"]
85
+ samples = samples.to(device)
86
+ captions = [t["caption"] for t in targets]
87
+ targets = utils.targets_to(targets, device)
88
+
89
+ outputs = model(samples, captions, targets)
90
+ loss_dict = criterion(outputs, targets)
91
+ weight_dict = criterion.weight_dict
92
+
93
+ # reduce losses over all GPUs for logging purposes
94
+ loss_dict_reduced = utils.reduce_dict(loss_dict)
95
+ loss_dict_reduced_scaled = {k: v * weight_dict[k]
96
+ for k, v in loss_dict_reduced.items() if k in weight_dict}
97
+ loss_dict_reduced_unscaled = {f'{k}_unscaled': v
98
+ for k, v in loss_dict_reduced.items()}
99
+ metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()),
100
+ **loss_dict_reduced_scaled,
101
+ **loss_dict_reduced_unscaled)
102
+
103
+ orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
104
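+ # post-process raw outputs into per-image results (boxes at the original image scale; masks are added below when a 'segm' postprocessor is present)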
+ results = postprocessors['bbox'](outputs, orig_target_sizes)
105
+ if 'segm' in postprocessors.keys():
106
+ target_sizes = torch.stack([t["size"] for t in targets], dim=0)
107
+ results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes)
108
+ res = {target['image_id'].item(): output for target, output in zip(targets, results)}
109
+
110
+ for evaluator in evaluator_list:
111
+ evaluator.update(res)
112
+
113
+ # REC & RES predictions
114
+ for p, target in zip(results, targets):
115
+ for s, b, m in zip(p['scores'], p['boxes'], p['rle_masks']):
116
+ predictions.append({'image_id': target['image_id'].item(),
117
+ 'category_id': 1, # dummy label, as categories are not predicted in ref-vos
118
+ 'bbox': b.tolist(),
119
+ 'segmentation': m,
120
+ 'score': s.item()})
121
+
122
+
123
+ # gather the stats from all processes
124
+ metric_logger.synchronize_between_processes()
125
+ print("Averaged stats:", metric_logger)
126
+ for evaluator in evaluator_list:
127
+ evaluator.synchronize_between_processes()
128
+
129
+ # accumulate predictions from all images
130
+ refexp_res = None
131
+ for evaluator in evaluator_list:
132
+ if isinstance(evaluator, CocoEvaluator):
133
+ evaluator.accumulate()
134
+ evaluator.summarize()
135
+ elif isinstance(evaluator, RefExpEvaluator):
136
+ refexp_res = evaluator.summarize()
137
+
138
+ stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
139
+
140
+ # update stats
141
+ for evaluator in evaluator_list:
142
+ if isinstance(evaluator, CocoEvaluator):
143
+ if "bbox" in postprocessors.keys():
144
+ stats["coco_eval_bbox"] = evaluator.coco_eval["bbox"].stats.tolist()
145
+ if "segm" in postprocessors.keys():
146
+ stats["coco_eval_masks"] = evaluator.coco_eval["segm"].stats.tolist()
147
+ if refexp_res is not None:
148
+ stats.update(refexp_res)
149
+
150
+ # evaluate RES
151
+ # gather and merge predictions from all gpus
152
+ gathered_pred_lists = utils.all_gather(predictions)
153
+ predictions = [p for p_list in gathered_pred_lists for p in p_list]
154
+
155
+ eval_metrics = {}
156
+ if utils.is_main_process():
157
+ if dataset_name == 'refcoco':
158
+ coco_gt = COCO(os.path.join(args.coco_path, 'refcoco/instances_refcoco_val.json'))
159
+ elif dataset_name == 'refcoco+':
160
+ coco_gt = COCO(os.path.join(args.coco_path, 'refcoco+/instances_refcoco+_val.json'))
161
+ elif dataset_name == 'refcocog':
162
+ coco_gt = COCO(os.path.join(args.coco_path, 'refcocog/instances_refcocog_val.json'))
163
+ else:
164
+ raise NotImplementedError
165
+ coco_pred = coco_gt.loadRes(predictions)
166
+ coco_eval = COCOeval(coco_gt, coco_pred, iouType='segm')
167
+ coco_eval.params.useCats = 0 # ignore categories as they are not predicted in ref-vos task
168
+ coco_eval.evaluate()
169
+ coco_eval.accumulate()
170
+ coco_eval.summarize()
171
+ # ap_labels = ['mAP 0.5:0.95', 'AP 0.5', 'AP 0.75', 'AP 0.5:0.95 S', 'AP 0.5:0.95 M', 'AP 0.5:0.95 L']
172
+ # ap_metrics = coco_eval.stats[:6]
173
+ # eval_metrics = {l: m for l, m in zip(ap_labels, ap_metrics)}
174
+ # Precision and IOU
175
+ # bbox
176
+ precision_at_k, overall_iou, mean_iou = calculate_bbox_precision_at_k_and_iou_metrics(coco_gt, coco_pred)
177
+ eval_metrics.update({f'bbox P@{k}': m for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)})
178
+ eval_metrics.update({'bbox overall_iou': overall_iou, 'bbox mean_iou': mean_iou})
179
+ # mask
180
+ precision_at_k, overall_iou, mean_iou = calculate_precision_at_k_and_iou_metrics(coco_gt, coco_pred)
181
+ eval_metrics.update({f'segm P@{k}': m for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)})
182
+ eval_metrics.update({'segm overall_iou': overall_iou, 'segm mean_iou': mean_iou})
183
+ print(eval_metrics)
184
+ stats.update(eval_metrics)
185
+
186
+ return stats
187
+
188
+
189
+ @torch.no_grad()
190
+ def evaluate_a2d(model, data_loader, postprocessor, device, args):
191
+ model.eval()
192
+ predictions = []
193
+ metric_logger = utils.MetricLogger(delimiter=" ")
194
+ header = 'Test:'
195
+
196
+ for samples, targets in metric_logger.log_every(data_loader, 10, header):
197
+ image_ids = [t['image_id'] for t in targets]
198
+
199
+ samples = samples.to(device)
200
+ captions = [t["caption"] for t in targets]
201
+ targets = utils.targets_to(targets, device)
202
+
203
+ outputs = model(samples, captions, targets)
204
+
205
+ orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
206
+ target_sizes = torch.stack([t["size"] for t in targets], dim=0)
207
+ processed_outputs = postprocessor(outputs, orig_target_sizes, target_sizes)
208
+
209
+ for p, image_id in zip(processed_outputs, image_ids):
210
+ for s, m in zip(p['scores'], p['rle_masks']):
211
+ predictions.append({'image_id': image_id,
212
+ 'category_id': 1, # dummy label, as categories are not predicted in ref-vos
213
+ 'segmentation': m,
214
+ 'score': s.item()})
215
+
216
+ # gather and merge predictions from all gpus
217
+ gathered_pred_lists = utils.all_gather(predictions)
218
+ predictions = [p for p_list in gathered_pred_lists for p in p_list]
219
+ # evaluation
220
+ eval_metrics = {}
221
+ if utils.is_main_process():
222
+ if args.dataset_file == 'a2d':
223
+ coco_gt = COCO(os.path.join(args.a2d_path, 'a2d_sentences_test_annotations_in_coco_format.json'))
224
+ elif args.dataset_file == 'jhmdb':
225
+ coco_gt = COCO(os.path.join(args.jhmdb_path, 'jhmdb_sentences_gt_annotations_in_coco_format.json'))
226
+ else:
227
+ raise NotImplementedError
228
+ coco_pred = coco_gt.loadRes(predictions)
229
+ coco_eval = COCOeval(coco_gt, coco_pred, iouType='segm')
230
+ coco_eval.params.useCats = 0 # ignore categories as they are not predicted in ref-vos task
231
+ coco_eval.evaluate()
232
+ coco_eval.accumulate()
233
+ coco_eval.summarize()
234
+ ap_labels = ['mAP 0.5:0.95', 'AP 0.5', 'AP 0.75', 'AP 0.5:0.95 S', 'AP 0.5:0.95 M', 'AP 0.5:0.95 L']
235
+ ap_metrics = coco_eval.stats[:6]
236
+ eval_metrics = {l: m for l, m in zip(ap_labels, ap_metrics)}
237
+ # Precision and IOU
238
+ precision_at_k, overall_iou, mean_iou = calculate_precision_at_k_and_iou_metrics(coco_gt, coco_pred)
239
+ eval_metrics.update({f'P@{k}': m for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)})
240
+ eval_metrics.update({'overall_iou': overall_iou, 'mean_iou': mean_iou})
241
+ print(eval_metrics)
242
+
243
+ # sync all processes before starting a new epoch or exiting
244
+ dist.barrier()
245
+ return eval_metrics
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
eval_davis.py ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python
2
+ import os
3
+ import sys
4
+ from time import time
5
+ import argparse
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from davis2017.evaluation import DAVISEvaluation
10
+
11
+ default_davis_path = 'data/ref-davis/DAVIS'
12
+
13
+ time_start = time()
14
+ parser = argparse.ArgumentParser()
15
+ parser.add_argument('--davis_path', type=str, help='Path to the DAVIS folder containing the JPEGImages, Annotations, '
16
+ 'ImageSets, Annotations_unsupervised folders',
17
+ required=False, default=default_davis_path)
18
+ parser.add_argument('--set', type=str, help='Subset to evaluate the results', default='val') # val subset
19
+ parser.add_argument('--task', type=str, help='Task to evaluate the results', default='unsupervised',
20
+ choices=['semi-supervised', 'unsupervised'])
21
+ parser.add_argument('--results_path', type=str, help='Path to the folder containing the sequences folders',
22
+ required=True)
23
+ args, _ = parser.parse_known_args()
24
+ csv_name_global = f'global_results-{args.set}.csv'
25
+ csv_name_per_sequence = f'per-sequence_results-{args.set}.csv'
26
+
27
+ # Check if the method has been evaluated before, if so read the results, otherwise compute the results
28
+ csv_name_global_path = os.path.join(args.results_path, csv_name_global)
29
+ csv_name_per_sequence_path = os.path.join(args.results_path, csv_name_per_sequence)
30
+ if os.path.exists(csv_name_global_path) and os.path.exists(csv_name_per_sequence_path):
31
+ print('Using precomputed results...')
32
+ table_g = pd.read_csv(csv_name_global_path)
33
+ table_seq = pd.read_csv(csv_name_per_sequence_path)
34
+ else:
35
+ print(f'Evaluating sequences for the {args.task} task...')
36
+ # Create dataset and evaluate
37
+ dataset_eval = DAVISEvaluation(davis_root=args.davis_path, task=args.task, gt_set=args.set)
38
+ metrics_res = dataset_eval.evaluate(args.results_path)
39
+ J, F = metrics_res['J'], metrics_res['F']
40
+
41
+ # Generate dataframe for the general results
42
+ g_measures = ['J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay']
43
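+ # J&F is the arithmetic mean of the region (J) and boundary (F) means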
+ final_mean = (np.mean(J["M"]) + np.mean(F["M"])) / 2.
44
+ g_res = np.array([final_mean, np.mean(J["M"]), np.mean(J["R"]), np.mean(J["D"]), np.mean(F["M"]), np.mean(F["R"]),
45
+ np.mean(F["D"])])
46
+ g_res = np.reshape(g_res, [1, len(g_res)])
47
+ table_g = pd.DataFrame(data=g_res, columns=g_measures)
48
+ with open(csv_name_global_path, 'w') as f:
49
+ table_g.to_csv(f, index=False, float_format="%.5f")
50
+ print(f'Global results saved in {csv_name_global_path}')
51
+
52
+ # Generate a dataframe for the per sequence results
53
+ seq_names = list(J['M_per_object'].keys())
54
+ seq_measures = ['Sequence', 'J-Mean', 'F-Mean']
55
+ J_per_object = [J['M_per_object'][x] for x in seq_names]
56
+ F_per_object = [F['M_per_object'][x] for x in seq_names]
57
+ table_seq = pd.DataFrame(data=list(zip(seq_names, J_per_object, F_per_object)), columns=seq_measures)
58
+ with open(csv_name_per_sequence_path, 'w') as f:
59
+ table_seq.to_csv(f, index=False, float_format="%.5f")
60
+ print(f'Per-sequence results saved in {csv_name_per_sequence_path}')
61
+
62
+ # Print the results
63
+ sys.stdout.write(f"--------------------------- Global results for {args.set} ---------------------------\n")
64
+ print(table_g.to_string(index=False))
65
+ sys.stdout.write(f"\n---------- Per sequence results for {args.set} ----------\n")
66
+ print(table_seq.to_string(index=False))
67
+ total_time = time() - time_start
68
+ sys.stdout.write('\nTotal time:' + str(total_time))
jptr_chaeyun.txt ADDED
@@ -0,0 +1,179 @@
1
+ [I 2025-02-06 14:30:15.041 ServerApp] Extension package jupyter_lsp took 0.2983s to import
2
+ [I 2025-02-06 14:30:16.739 ServerApp] jupyter_lsp | extension was successfully linked.
3
+ [I 2025-02-06 14:30:16.744 ServerApp] jupyter_server_terminals | extension was successfully linked.
4
+ [I 2025-02-06 14:30:16.749 ServerApp] jupyterlab | extension was successfully linked.
5
+ [W 2025-02-06 14:30:16.751 JupyterNotebookApp] 'password' has moved from NotebookApp to ServerApp. This config will be passed to ServerApp. Be sure to update your config before our next release.
6
+ [W 2025-02-06 14:30:16.754 ServerApp] ServerApp.password config is deprecated in 2.0. Use PasswordIdentityProvider.hashed_password.
7
+ [I 2025-02-06 14:30:16.754 ServerApp] notebook | extension was successfully linked.
8
+ [I 2025-02-06 14:30:17.430 ServerApp] notebook_shim | extension was successfully linked.
9
+ [I 2025-02-06 14:30:17.804 ServerApp] notebook_shim | extension was successfully loaded.
10
+ [I 2025-02-06 14:30:17.807 ServerApp] jupyter_lsp | extension was successfully loaded.
11
+ [I 2025-02-06 14:30:17.808 ServerApp] jupyter_server_terminals | extension was successfully loaded.
12
+ [I 2025-02-06 14:30:18.042 LabApp] JupyterLab extension loaded from /home/chaeyun/.conda/envs/risall/lib/python3.9/site-packages/jupyterlab
13
+ [I 2025-02-06 14:30:18.042 LabApp] JupyterLab application directory is /data/conda_envs/chaeyun/envs/risall/share/jupyter/lab
14
+ [I 2025-02-06 14:30:18.053 LabApp] Extension Manager is 'pypi'.
15
+ [I 2025-02-06 14:30:18.386 ServerApp] jupyterlab | extension was successfully loaded.
16
+ [I 2025-02-06 14:30:18.394 ServerApp] notebook | extension was successfully loaded.
17
+ [I 2025-02-06 14:30:18.395 ServerApp] Serving notebooks from local directory: /data/projects/yejin/VerbCentric_RIS/ReferFormer
18
+ [I 2025-02-06 14:30:18.395 ServerApp] Jupyter Server 2.15.0 is running at:
19
+ [I 2025-02-06 14:30:18.395 ServerApp] http://localhost:5727/tree
20
+ [I 2025-02-06 14:30:18.395 ServerApp] http://127.0.0.1:5727/tree
21
+ [I 2025-02-06 14:30:18.395 ServerApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
22
+ [I 2025-02-06 14:30:18.574 ServerApp] Skipped non-installed server(s): bash-language-server, dockerfile-language-server-nodejs, javascript-typescript-langserver, jedi-language-server, julia-language-server, pyright, python-language-server, python-lsp-server, r-languageserver, sql-language-server, texlab, typescript-language-server, unified-language-server, vscode-css-languageserver-bin, vscode-html-languageserver-bin, vscode-json-languageserver-bin, yaml-language-server
23
+ [W 2025-02-06 14:31:50.823 ServerApp] 404 GET /hub/api (@::1) 179.74ms referer=None
24
+ [I 2025-02-06 14:31:50.827 JupyterNotebookApp] 302 GET /tree? (@::1) 0.51ms
25
+ [I 2025-02-06 14:31:53.423 ServerApp] User 09e1c030b1ec4bb68957ab993d4377f9 logged in.
26
+ [I 2025-02-06 14:31:53.423 ServerApp] 302 POST /login? (09e1c030b1ec4bb68957ab993d4377f9@::1) 1.16ms
27
+ [I 2025-02-06 14:32:05.968 ServerApp] Creating new notebook in
28
+ [I 2025-02-06 14:32:06.446 ServerApp] Kernel started: 5d7b2dc4-2827-441f-b000-c315d487a88b
29
+ [W 2025-02-06 14:32:06.458 ServerApp] delete /gpt_ref-ytvos_numbered_cy-jvsc-01c75e0b-00a9-46d9-9ef5-cef387099deb810a8dd4-69a4-4831-a7c1-98121c4af797.ipynb
30
+ [I 2025-02-06 14:32:07.994 ServerApp] Connecting to kernel 5d7b2dc4-2827-441f-b000-c315d487a88b.
31
+ [W 2025-02-06 14:32:08.754 ServerApp] 404 GET /nbextensions/viewer/extension.js (@::1) 1.43ms referer=None
32
+ [W 2025-02-06 14:32:08.755 ServerApp] 404 GET /nbextensions/jupyter-js-widgets/extension.js (@::1) 1.86ms referer=None
33
+ [I 2025-02-06 14:32:45.217 ServerApp] Creating new notebook in
34
+ [I 2025-02-06 14:32:45.480 ServerApp] Kernel started: 72239f5d-4d18-4a03-8eed-3375f161975b
35
+ [W 2025-02-06 14:32:45.492 ServerApp] delete /check_image_numbered_cy-jvsc-9867d5b1-af20-4268-8b58-531863f46cb2f672a8a4-bc6f-48b5-923c-58bb212ea0d7.ipynb
36
+ [I 2025-02-06 14:32:46.166 ServerApp] Connecting to kernel 72239f5d-4d18-4a03-8eed-3375f161975b.
37
+ [W 2025-02-06 14:32:47.092 ServerApp] 404 GET /nbextensions/jupyter-js-widgets/extension.js (@::1) 1.11ms referer=None
38
+ [W 2025-02-06 14:32:47.093 ServerApp] 404 GET /nbextensions/viewer/extension.js (@::1) 0.84ms referer=None
39
+ [I 2025-02-06 14:38:36.722 ServerApp] Kernel interrupted: 5d7b2dc4-2827-441f-b000-c315d487a88b
40
+ [W 2025-02-06 15:27:22.906 ServerApp] 404 GET /hub/api (@::1) 332.79ms referer=None
41
+ [I 2025-02-06 15:27:23.033 JupyterNotebookApp] 302 GET /tree? (@::1) 0.64ms
42
+ [I 2025-02-06 15:27:25.982 ServerApp] User 9c560d44658d478aa5d6decbf8541260 logged in.
43
+ [I 2025-02-06 15:27:25.983 ServerApp] 302 POST /login? (9c560d44658d478aa5d6decbf8541260@::1) 1.08ms
44
+ [I 2025-02-06 15:32:06.555 ServerApp] Creating new notebook in
45
+ [I 2025-02-06 15:32:11.934 ServerApp] Kernel started: 97e74450-8dc0-4ea7-b396-deccdfc0a23f
46
+ [W 2025-02-06 15:32:11.955 ServerApp] delete /check_image_numbered_cy-jvsc-1a1215f3-c818-462e-a439-92c1dcbe474e70d6c3f2-c47b-4d0e-adef-589dd6523fcc.ipynb
47
+ [I 2025-02-06 15:32:15.473 ServerApp] Connecting to kernel 97e74450-8dc0-4ea7-b396-deccdfc0a23f.
48
+ [I 2025-02-06 15:32:15.678 ServerApp] Starting buffering for 5d7b2dc4-2827-441f-b000-c315d487a88b:2db8540d-faed-4333-80b7-7fde202eaafd
49
+ [I 2025-02-06 15:32:15.679 ServerApp] Starting buffering for 72239f5d-4d18-4a03-8eed-3375f161975b:848b127a-dce3-4988-9ab9-aa3fc5535255
50
+ [W 2025-02-06 15:43:43.450 ServerApp] 404 GET /hub/api (@::1) 172.61ms referer=None
51
+ [I 2025-02-06 15:43:43.455 JupyterNotebookApp] 302 GET /tree? (@::1) 0.67ms
52
+ [I 2025-02-06 15:43:46.430 ServerApp] User 49ee40ca9dfd47c7b4fd4bf6b592f8d0 logged in.
53
+ [I 2025-02-06 15:43:46.430 ServerApp] 302 POST /login? (49ee40ca9dfd47c7b4fd4bf6b592f8d0@::1) 1.25ms
54
+ [I 2025-02-06 15:45:30.966 ServerApp] Creating new notebook in
55
+ [I 2025-02-06 15:45:31.635 ServerApp] Kernel started: dde1cca7-f769-4156-a1a0-1303a7fb5ba5
56
+ [W 2025-02-06 15:45:31.654 ServerApp] delete /CRIS_hp_check_cygsds-jvsc-c01fd7dc-3a7d-48dc-b410-e6b5e4207a574d09512b-5033-42ea-91e2-034fca8b2587.ipynb
57
+ [I 2025-02-06 15:45:32.747 ServerApp] Connecting to kernel dde1cca7-f769-4156-a1a0-1303a7fb5ba5.
58
+ [W 2025-02-06 15:45:33.602 ServerApp] 404 GET /nbextensions/viewer/extension.js (@::1) 2.10ms referer=None
59
+ [W 2025-02-06 15:45:33.603 ServerApp] 404 GET /nbextensions/jupyter-js-widgets/extension.js (@::1) 2.54ms referer=None
60
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
61
+ To disable this warning, you can either:
62
+ - Avoid using `tokenizers` before the fork if possible
63
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
64
176
+ srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
177
+ slurmstepd-node05: error: *** STEP 7716.0 ON node05 CANCELLED AT 2025-02-06T16:54:16 ***
178
+ slurmstepd-node05: error: *** JOB 7716 ON node05 CANCELLED AT 2025-02-06T16:54:16 ***
179
+ [C 2025-02-06 16:54:16.389 ServerApp] received signal 15, stopping
make_ref-ytvos/annotate_ref_ytvos.py ADDED
@@ -0,0 +1,288 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from os import path as osp
8
+ import skimage.io as io  # showRef below uses io.imread; the stdlib io module has no imread
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import regex as re
13
+ import json
14
+
15
+ import cv2
16
+ from PIL import Image
17
+ import torch
18
+ from torchvision.transforms import functional as F
19
+
20
+ from skimage import measure # (pip install scikit-image)
21
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
22
+
23
+ import matplotlib.pyplot as plt
24
+ from matplotlib.collections import PatchCollection
25
+ from matplotlib.patches import Rectangle
26
+
27
+ import ipywidgets as widgets
28
+ from IPython.display import display, clear_output
29
+
30
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
31
+ args = parser.parse_args()
32
+
33
+ #================== Load data ===================
34
+ # Full dataset
35
+ train_dataset = build_dataset('ytvos', image_set = 'train', args = args)
36
+
37
+ # Metadata for the full dataset
38
+ metas = train_dataset.metas
39
+
40
+ # Filtered frames
41
+ selected_frames_df = pd.read_json("selected_frames4.jsonl", lines = True)
42
+
43
+ #================== Mask creation helpers ===================
44
+ def prepare_mask_for_pil(mask_tensor):
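+ # Convert a (1, H, W) mask tensor into an 8-bit PIL image (values 0 / 255).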
45
+ mask_array = mask_tensor.squeeze(0).cpu().numpy()
46
+ mask_array = (mask_array * 255).astype(np.uint8)
47
+ mask_image = Image.fromarray(mask_array)
48
+ return mask_image
49
+
50
+ def create_sub_masks(mask_image):
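+ # Split a mask image into one binary sub-mask per pixel value, padded by one pixel
+ # on every side so that contours touching the image border close properly.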
51
+ width, height = mask_image.size
52
+
53
+ sub_masks = {}
54
+ for x in range(width):
55
+ for y in range(height):
56
+ # Get the RGB values of the pixel
57
+ pixel = mask_image.getpixel((x, y))
58
+
59
+ # If the pixel is not black...
60
+ if pixel != 0 :
61
+ # Check to see if we've created a sub-mask...
62
+ pixel_str = str(pixel)
63
+ sub_mask = sub_masks.get(pixel_str)
64
+ if sub_mask is None:
65
+ # Create a sub-mask (one bit per pixel) and add to the dictionary
66
+ # Note: we add 1 pixel of padding in each direction
67
+ # because the contours module doesn't handle cases
68
+ # where pixels bleed to the edge of the image
69
+ sub_masks[pixel_str] = Image.new('1', (width+2, height+2))
70
+
71
+ # Set the pixel value to 1 (default is 0), accounting for padding
72
+ sub_masks[pixel_str].putpixel((x+1, y+1), 1)
73
+ return sub_masks
74
+
75
+ #================== Mask annotation helper ===================
76
+ def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
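+ # Turn a binary sub-mask into a COCO-style annotation: polygon segmentations, bounding box and area.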
77
+ # Find contours (boundary lines) around each sub-mask
78
+ # Note: there could be multiple contours if the object
79
+ # is partially occluded. (E.g. an elephant behind a tree)
80
+ contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
81
+
82
+ segmentations = []
83
+ polygons = []
84
+ for contour in contours:
85
+ # Flip from (row, col) representation to (x, y)
86
+ # and subtract the padding pixel
87
+ for i in range(len(contour)):
88
+ row, col = contour[i]
89
+ contour[i] = (col - 1, row - 1)
90
+
91
+ # Make a polygon and simplify it
92
+ poly = Polygon(contour)
93
+ poly = poly.simplify(1.0, preserve_topology=False)
94
+ polygons.append(poly)
95
+ segmentation = np.array(poly.exterior.coords).ravel().tolist()
96
+ segmentations.append(segmentation)
97
+
98
+ # Combine the polygons to calculate the bounding box and area
99
+ multi_poly = MultiPolygon(polygons)
100
+ x, y, max_x, max_y = multi_poly.bounds
101
+ width = max_x - x
102
+ height = max_y - y
103
+ bbox = (x, y, width, height)
104
+ area = multi_poly.area
105
+
106
+ annotation = {
107
+ 'segmentation': segmentations,
108
+ 'iscrowd': is_crowd,
109
+ 'image_id': image_id,
110
+ 'id': annotation_id,
111
+ 'bbox': bbox,
112
+ 'area': area
113
+ }
114
+ return annotation
115
+
116
+ #================== Visualization helper ===================
117
+ # annotation dictionary as input
118
+ def showRef(annotation, image_dir, seg_box='seg'):
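+ # Overlay an annotation on its source image: print the referring sentences and draw
+ # either the segmentation polygons or the bounding box.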
119
+ ax = plt.gca()
120
+ I = io.imread(osp.join(image_dir, annotation['file_name']))
121
+ ax.imshow(I)
122
+
123
+
124
+ for sid, sent in enumerate(annotation['sentences']):
125
+ print('%s. %s' % (sid + 1, sent))
126
+
127
+ if seg_box == 'seg':
128
+ polygons = []
129
+ color = []
130
+ c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
131
+
132
+ if type(annotation['segmentation'][0]) == list:
133
+ # polygon used for refcoco*
134
+ for seg in annotation['segmentation']:
135
+ poly = np.array(seg).reshape((int(len(seg) / 2), 2))
136
+ polygons.append(Polygon(poly))
137
+ color.append(c)
138
+
139
+ p = PatchCollection(polygons,
140
+ facecolors=(221/255, 160/255, 221/255), # light purple
141
+ linewidths=0,
142
+ alpha=0.4)
143
+ ax.add_collection(p)
144
+
145
+ p = PatchCollection(polygons,
146
+ facecolors='none',
147
+ edgecolors=color,
148
+ linewidths=2)
149
+ ax.add_collection(p)
150
+ # else:
151
+ # # mask used for refclef
152
+ # rle = annotation['segmentation']
153
+ # m = mask.decode(rle)
154
+ # img = np.ones((m.shape[0], m.shape[1], 3))
155
+ # color_mask = np.array([2.0, 166.0, 101.0]) / 255
156
+ # for i in range(3):
157
+ # img[:, :, i] = color_mask[i]
158
+ # ax.imshow(np.dstack((img, m * 0.5)))
159
+
160
+ # bounding box
161
+ elif seg_box == 'box':
162
+ bbox = annotation['bbox']
163
+ box_plot = Rectangle((bbox[0], bbox[1]),
164
+ bbox[2],
165
+ bbox[3],
166
+ fill=False,
167
+ edgecolor='green',
168
+ linewidth=3)
169
+ ax.add_patch(box_plot)
170
+
171
+ #================== Main routine combining everything ===================
172
+ def create_dict_from_selected_images(selected_frames_df):
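+ # Build a COCO-style annotation (polygon segmentation, bbox, area, caption, category)
+ # for every manually selected frame/object pair and append it line by line to a JSONL file.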
173
+
174
+ image_id = 0
175
+ anno_id = 0
176
+ train_idx = 0
177
+
178
+ with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
179
+
180
+ for selected_idx in range(len(selected_frames_df)):
181
+ selected = selected_frames_df.loc[selected_idx]
182
+ selected_vid_id = selected['video']
183
+ selected_frame_id = selected['frame_id']
184
+
185
+ for obj_id in selected['objects'].keys():
186
+
187
+ selected_exp = selected['objects'][obj_id][0] # caption
188
+ selected_verb = selected['objects'][obj_id][1] # verb
189
+
190
+ train_idx = next(
191
+ idx for idx, meta in enumerate(metas)
192
+ if meta['video'] == selected_vid_id
193
+ and meta['frame_id'] == selected_frame_id
194
+ and meta['obj_id'] == int(obj_id)
195
+ and meta['exp'] == selected_exp
196
+ )
197
+
198
+ train_frames, train_info = train_dataset[train_idx]
199
+
200
+ try:
201
+ valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id) # index of the selected (valid) frame
202
+ except ValueError:
203
+ print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
+ continue  # skip this object; valid_frame_loc is undefined when the selected frame was not sampled
204
+
205
+
206
+ frame = train_frames[valid_frame_loc] # the corresponding frame
207
+ frame = F.to_pil_image(frame)
208
+
209
+ image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
210
+
211
+ # Save the original frame
212
+ save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
213
+ #save_dir.mkdir(exist_ok=True)
214
+ save_path = save_dir / f"{image_file_name}.png"
215
+ #frame.save(save_path)
216
+
217
+ # Category
218
+ label = train_info['labels'][valid_frame_loc].item() #category id
219
+ category_name = metas[train_idx]['category'] #category name
220
+
221
+ # Box info
222
+ box = train_info['boxes'][valid_frame_loc]
223
+
224
+ # Annotation tools ########################################################################
225
+ mask = train_info['masks'][valid_frame_loc]
226
+ # print(mask.shape)
227
+
228
+ # Sanity check that the frame and mask match
229
+ # plt.imshow(frame.permute(1, 2, 0))
230
+ # mask_color = np.zeros((*mask.shape, 3), dtype = np.uint8)
231
+ # mask_color[mask == 1] = [255, 0, 0]
232
+ # plt.imshow(mask_color, alpha = 0.5)
233
+ # plt.show()
234
+
235
+
236
+ mask_image = prepare_mask_for_pil(mask)
237
+ sub_masks = create_sub_masks(mask_image)
238
+
239
+ for color, sub_mask in sub_masks.items():
240
+ # print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
241
+ sub_mask_array = np.array(sub_mask, dtype=np.uint8)
242
+ annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd = 0)
243
+ anno_id += 1
244
+ image_id += 1
245
+
246
+ # Add file path
247
+ annotation['file_name'] = f"{image_file_name}.png"
248
+
249
+ # Drop fields that are not needed
250
+ annotation.pop('iscrowd', None)
251
+ annotation.pop('image_id', None)
252
+ annotation.pop('id', None)
253
+
254
+ valid = train_info['valid'][valid_frame_loc]
255
+ orig_size = train_info['orig_size']
256
+ size = train_info['size']
257
+ caption = metas[train_idx]['exp']
258
+
259
+ # Add filename, height, width
260
+ #annotation['file_name'] = save_path
261
+ annotation['height'] = orig_size[0].item()
262
+ annotation['width'] = orig_size[1].item()
263
+
264
+ # Add category id, category name and sentence dictionary
265
+ annotation['label'] = label
266
+ annotation['category_name'] = category_name
267
+ sentence_dict = {
268
+ "tokens" : caption.split(' '),
269
+ "raw" : caption,
270
+ "sent" : re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower())
271
+ }
272
+ annotation['sentences'] = sentence_dict
273
+ ############################################################################################
274
+ # double check for segmentation annotation
275
+ # orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
276
+ # plt.imshow(orig_img_np)
277
+ # plt.axis('off')
278
+ # plt.show()
279
+
280
+ # showRef(annotation, save_dir)
281
+ ############################################################################################
282
+
283
+ # Final write
284
+ f.write(json.dumps(annotation) + "\n")
285
+ f.flush()
286
+
287
+ # if __name__ == '__main__':
288
+ # create_dict_from_selected_images(selected_frames_df)
make_ref-ytvos/folder2lmdb.py ADDED
@@ -0,0 +1,109 @@
1
+ import argparse
2
+ import os
3
+ import os.path as osp
4
+ import lmdb
5
+ from PIL import Image
6
+ import pyarrow as pa
7
+ import json
8
+ from tqdm import tqdm
9
+ import warnings
10
+ warnings.filterwarnings("ignore")
11
+
12
+
13
+ def loads_pyarrow(buf):
14
+ """
15
+ Args:
16
+ buf: the output of `dumps`.
17
+ """
18
+ return pa.deserialize(buf)
19
+
20
+
21
+ def raw_reader(path):
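+ # Read a file from disk and return its raw bytes.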
22
+ with open(path, 'rb') as f:
23
+ bin_data = f.read()
24
+ return bin_data
25
+
26
+
27
+ def dumps_pyarrow(obj):
28
+ """
29
+ Serialize an object.
30
+ Returns:
31
+ Implementation-dependent bytes-like object
32
+ """
33
+ return pa.serialize(obj).to_buffer()
34
+
35
+
36
+ def folder2lmdb(json_data, img_dir, mask_dir, output_dir, split, write_frequency=1000):
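+ # Write each (image, mask, caption) record into an LMDB: values are pyarrow-serialized
+ # dicts keyed by sample index, with __keys__ and __len__ entries stored at the end.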
37
+ lmdb_path = osp.join(output_dir, "%s.lmdb" % split)
38
+ isdir = os.path.isdir(lmdb_path)
39
+
40
+ print("Generate LMDB to %s" % lmdb_path)
41
+ db = lmdb.open(lmdb_path, subdir=isdir,
42
+ map_size=1099511627776 * 2, readonly=False,
43
+ meminit=False, map_async=True)
44
+
45
+ txn = db.begin(write=True)
46
+ tbar = tqdm(json_data)
47
+ for idx, item in enumerate(tbar):
48
+ img = raw_reader(osp.join(img_dir, item['file_name']))
49
+ mask = raw_reader(osp.join(mask_dir, f"{idx}.png"))
50
+
51
+ # Check the image size with Pillow
52
+ #with Image.open(osp.join(img_dir, item['file_name'])) as im:
53
+ #print(f"Image size (Pillow): {im.size}")
54
+ #with Image.open(osp.join(mask_dir, item['file_name'])) as mk:
55
+ #print(f"Mask size (Pillow): {mk.size}")
56
+
57
+ data = {'img': img, 'mask': mask, 'cat': item['category_name'],
58
+ 'seg_id': idx, 'file_name': item['file_name'],
59
+ 'num_sents': 1, 'sents': item['sentences']['sent']}
60
+ txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow(data))
61
+ if idx % write_frequency == 0:
62
+ # print("[%d/%d]" % (idx, len(data_loader)))
63
+ txn.commit()
64
+ txn = db.begin(write=True)
65
+
66
+ # finish iterating through dataset
67
+ txn.commit()
68
+ keys = [u'{}'.format(k).encode('ascii') for k in range(idx + 1)]
69
+ with db.begin(write=True) as txn:
70
+ txn.put(b'__keys__', dumps_pyarrow(keys))
71
+ txn.put(b'__len__', dumps_pyarrow(len(keys)))
72
+
73
+ print("Flushing database ...")
74
+ db.sync()
75
+ db.close()
76
+
77
+
78
+ def parse_args():
79
+ parser = argparse.ArgumentParser(description='COCO Folder to LMDB.')
80
+ parser.add_argument('-j', '--json-dir', type=str,
81
+ default='',
82
+ help='the name of json file.')
83
+ parser.add_argument('-i', '--img-dir', type=str,
84
+ default='refcoco+',
85
+ help='the folder of images.')
86
+ parser.add_argument('-m', '--mask-dir', type=str,
87
+ default='refcoco+',
88
+ help='the folder of masks.')
89
+ parser.add_argument('-o', '--output-dir', type=str,
90
+ default='refcoco+',
91
+ help='the folder of output lmdb file.')
92
+ parser.add_argument('-s', '--split', type=str,
93
+ default='train',
94
+ help='the split type.')
95
+ args = parser.parse_args()
96
+ return args
97
+
98
+
99
+ if __name__ == '__main__':
100
+ args = parse_args()
101
+ args.split = osp.basename(args.json_dir).split(".")[0]
102
+ os.makedirs(args.output_dir, exist_ok=True)
103
+
104
+ json_data = []
105
+ with open(args.json_dir, 'rb') as f:
106
+ for line in f:
107
+ json_data.append(json.loads(line))
108
+
109
+ folder2lmdb(json_data, args.img_dir, args.mask_dir, args.output_dir, args.split)
make_ref-ytvos/manual_selected_frames.jsonl ADDED
@@ -0,0 +1,101 @@
1
+ {"index": 0, "new_sent": ""}
2
+ {"index": 4, "new_sent": "a zebra walking away from camera"}
3
+ {"index": 12, "new_sent": "the panda has his hand on another pandas back"}
4
+ {"index": 13, "new_sent": "the panda is standing on the rocks"}
5
+ {"index": 17, "new_sent": "the panda fell down on his back"}
6
+ {"index": 28, "new_sent": "a sheep lying down"}
7
+ {"index": 31, "new_sent": "a sheep lying down and getting spider legs"}
8
+ {"index": 40, "new_sent": ""}
9
+ {"index": 41, "new_sent": ""}
10
+ {"index": 48, "new_sent": "man using his hands next to an inside tree"}
11
+ {"index": 52, "new_sent": ""}
12
+ {"index": 55, "new_sent": ""}
13
+ {"index": 57, "new_sent": "a monkey hugging another monkey"}
14
+ {"index": 76, "new_sent": "an ape seated and breastfeeding while another ape plays nearby"}
15
+ {"index": 77, "new_sent": "an ape playing near a nursing ape"}
16
+ {"index": 78, "new_sent": "an ape is laying on the chest of another ape sitting on the dirt"}
17
+ {"index": 172, "new_sent": "person standing on stage and using a microphone"}
18
+ {"index": 173, "new_sent": "person sitting on stage playing a piano"}
19
+ {"index": 196, "new_sent": "a monkey eating some fruit"}
20
+ {"index": 197, "new_sent": "a monkey sitting while watching another monkey eat"}
21
+ {"index": 244, "new_sent": ""}
22
+ {"index": 270, "new_sent": "a turtle in water while another follows"}
23
+ {"index": 271, "new_sent": ""}
24
+ {"index": 299, "new_sent": "a duck stretching out its neck"}
25
+ {"index": 326, "new_sent": ""}
26
+ {"index": 327, "new_sent": ""}
27
+ {"index": 388, "new_sent": ""}
28
+ {"index": 389, "new_sent": "a lizard putting its head under a branch"}
29
+ {"index": 409, "new_sent": ""}
30
+ {"index": 410, "new_sent": "a raccoon standing and attacking another raccoon"}
31
+ {"index": 415, "new_sent": ""}
32
+ {"index": 416, "new_sent": ""}
33
+ {"index": 417, "new_sent": "a person taking a picture"}
34
+ {"index": 428, "new_sent": "a panda laying under another panda"}
35
+ {"index": 429, "new_sent": "a panda standing and playing with another panda"}
36
+ {"index": 447, "new_sent": "a panda playing and rolling over on the ground"}
37
+ {"index": 448, "new_sent": "a panda sitting and looking at another panda"}
38
+ {"index": 451, "new_sent": ""}
39
+ {"index": 495, "new_sent": "a lion sitting in front of a lion thats playing with a man"}
40
+ {"index": 509, "new_sent": ""}
41
+ {"index": 510, "new_sent": ""}
42
+ {"index": 517, "new_sent": "a person squatting and looking at a skateboarder perform"}
43
+ {"index": 518, "new_sent": ""}
44
+ {"index": 528, "new_sent": "a person doing a hand stand"}
45
+ {"index": 559, "new_sent": "a dog holding up his head"}
46
+ {"index": 560, "new_sent": "a dog smelling the ground"}
47
+ {"index": 561, "new_sent": ""}
48
+ {"index": 562, "new_sent": ""}
49
+ {"index": 569, "new_sent": ""}
50
+ {"index": 570, "new_sent": ""}
51
+ {"index": 594, "new_sent": "a mouse sitting under a wheel while another runs around"}
52
+ {"index": 595, "new_sent": ""}
53
+ {"index": 617, "new_sent": "a monkey moving underneath another monkey"}
54
+ {"index": 618, "new_sent": "a monkey laying on the ground with its arm over another monkey"}
55
+ {"index": 634, "new_sent": "ape laying under another ape"}
56
+ {"index": 644, "new_sent": "ape reaching out his arms and legs"}
57
+ {"index": 645, "new_sent": ""}
58
+ {"index": 646, "new_sent": "a person standing with his arms crossed in a room with others"}
59
+ {"index": 654, "new_sent": ""}
60
+ {"index": 659, "new_sent": "a giraffe eating hay"}
61
+ {"index": 662, "new_sent": "a penguin laying on its belly playing with another penguin"}
62
+ {"index": 673, "new_sent": "a penguin moving on its belly"}
63
+ {"index": 720, "new_sent": "a person riding a surfboard on a wave in front of other surfer"}
64
+ {"index": 722, "new_sent": "a person laying on surf board"}
65
+ {"index": 725, "new_sent": "person swimming away"}
66
+ {"index": 735, "new_sent": "person mounting a cow"}
67
+ {"index": 738, "new_sent": "person walking towards a cow"}
68
+ {"index": 741, "new_sent": "person riding a cow"}
69
+ {"index": 747, "new_sent": "person holding out his right arm"}
70
+ {"index": 764, "new_sent": "a grey duck facing away"}
71
+ {"index": 765, "new_sent": ""}
72
+ {"index": 766, "new_sent": "a grey duck eating bread"}
73
+ {"index": 816, "new_sent": "a person raising his arms and flying with another person"}
74
+ {"index": 821, "new_sent": "person holding on to the belt and screaming"}
75
+ {"index": 824, "new_sent": "person smiling"}
76
+ {"index": 828, "new_sent": "person holding another persons arms"}
77
+ {"index": 831, "new_sent": ""}
78
+ {"index": 832, "new_sent": ""}
79
+ {"index": 904, "new_sent": ""}
80
+ {"index": 914, "new_sent": "elephant walking away from the camera"}
81
+ {"index": 918, "new_sent": "a person riding a horse"}
82
+ {"index": 919, "new_sent": "a person opening a gate"}
83
+ {"index": 931, "new_sent": "person resting hand on a tree limb"}
84
+ {"index": 932, "new_sent": "person trying to feed a small animal"}
85
+ {"index": 993, "new_sent": "horse leading the way in the water"}
86
+ {"index": 994, "new_sent": "horse following another horse"}
87
+ {"index": 1049, "new_sent": "person fell from a bull"}
88
+ {"index": 1051, "new_sent": "person getting up from the ground"}
89
+ {"index": 1052, "new_sent": "person standing and running away from a bull"}
90
+ {"index": 1054, "new_sent": "person squatting down"}
91
+ {"index": 1096, "new_sent": "an ape crossing his arms"}
92
+ {"index": 1097, "new_sent": ""}
93
+ {"index": 1115, "new_sent": "a man trying to catch a frisbee"}
94
+ {"index": 1137, "new_sent": ""}
95
+ {"index": 1138, "new_sent": "the monkey leaning over the fence facing the bike"}
96
+ {"index": 1139, "new_sent": ""}
97
+ {"index": 1147, "new_sent": "the monkey hanging backwards"}
98
+ {"index": 1173, "new_sent": ""}
99
+ {"index": 1174, "new_sent": "a horse leading the way"}
100
+ {"index": 1184, "new_sent": ""}
101
+ {"index": 1194, "new_sent": "person milking a cow"}
make_ref-ytvos/review_images.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
make_ref-ytvos/revised_frames.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
make_ref-ytvos/selected_frames.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
mbench/result.json ADDED
@@ -0,0 +1,465 @@
1
+ {
2
+ "8056117b89": null,
3
+ "30085a2cc6": {
4
+ "00020": {
5
+ "1": {
6
+ "ref_exp": "The elephant walks on four legs, with its trunk lowered and touching the ground.",
7
+ "caption": "The image shows two elephants performing distinct actions. \n\n1. The larger elephant walks forward, swinging its trunk side to side as it moves through the grass. \n2. The smaller elephant follows closely behind, walking on four legs, with its trunk lowered and touching the ground. \n\nThe larger elephant also flaps its ears open and closed, creating a visible motion as it progresses. Both elephants are primarily focused on their movement through the grassy area.",
8
+ "cat_name": "elephant",
9
+ "file_name": "00020",
10
+ "isValid": true
11
+ },
12
+ "2": {
13
+ "ref_exp": "The elephant follows closely behind while walking on four legs with its trunk lowered and touching the ground.",
14
+ "caption": "The image shows two elephants performing distinct actions. \n\n1. The larger elephant walks forward, swinging its trunk side to side as it moves through the grass. \n2. The smaller elephant follows closely behind, walking on four legs, with its trunk lowered and touching the ground. \n\nThe larger elephant also flaps its ears open and closed, creating a visible motion as it progresses. Both elephants are primarily focused on their movement through the grassy area.",
15
+ "cat_name": "elephant",
16
+ "file_name": "00020",
17
+ "isValid": false
18
+ }
19
+ },
20
+ "00035": {
21
+ "1": {
22
+ "ref_exp": "The elephant is using its trunk to grasp and pull foliage from the ground.",
23
+ "caption": "In the image, one elephant is standing upright, using its trunk to grasp and pull foliage from the ground. It appears to be consuming the vegetation. The second elephant, smaller in size, is positioned nearby, walking closely alongside the first, with its trunk lowered toward the ground, possibly interacting with the grasses. The body posture of both elephants indicates movement and engagement with their surroundings.",
24
+ "cat_name": "elephant",
25
+ "file_name": "00035",
26
+ "isValid": false
27
+ },
28
+ "2": {
29
+ "ref_exp": "The elephant is standing and using its trunk to grasp and pull foliage from the ground.",
30
+ "caption": "In the image, one elephant is standing upright, using its trunk to grasp and pull foliage from the ground. It appears to be consuming the vegetation. The second elephant, smaller in size, is positioned nearby, walking closely alongside the first, with its trunk lowered toward the ground, possibly interacting with the grasses. The body posture of both elephants indicates movement and engagement with their surroundings.",
31
+ "cat_name": "elephant",
32
+ "file_name": "00035",
33
+ "isValid": true
34
+ }
35
+ },
36
+ "00055": {
37
+ "1": {
38
+ "ref_exp": "The elephant is walking forward while using its trunk to reach down to the ground.",
39
+ "caption": "In the image, a larger elephant is walking forward with a slight bend in its trunk, while a smaller elephant follows closely behind. The larger elephant is moving through the tall grass, using its trunk to occasionally reach down to the ground. At the same time, the smaller elephant appears to be moving at a brisk pace and is also navigating through the grass, maintaining proximity to the larger one. The movement of both elephants indicates a coordinated action as they travel in the same direction toward the nearby water.",
40
+ "cat_name": "elephant",
41
+ "file_name": "00055",
42
+ "isValid": false
43
+ },
44
+ "2": {
45
+ "ref_exp": "The elephant is walking forward with its trunk slightly bent, using it to reach down to the ground.",
46
+ "caption": "In the image, a larger elephant is walking forward with a slight bend in its trunk, while a smaller elephant follows closely behind. The larger elephant is moving through the tall grass, using its trunk to occasionally reach down to the ground. At the same time, the smaller elephant appears to be moving at a brisk pace and is also navigating through the grass, maintaining proximity to the larger one. The movement of both elephants indicates a coordinated action as they travel in the same direction toward the nearby water.",
47
+ "cat_name": "elephant",
48
+ "file_name": "00055",
49
+ "isValid": false
50
+ }
51
+ },
52
+ "00075": {
53
+ "1": {
54
+ "ref_exp": "The elephant is moving forward with its trunk extended downward.",
55
+ "caption": "In the image, several elephants are depicted in motion. \n\n1. The largest elephant is moving forward, its trunk extended downward, and its ears are flapping. \n2. A smaller elephant is walking closely behind, appearing to be following the larger one, with its trunk raised slightly. \n3. A very small elephant is positioned near the middle, walking on a similar path, with its trunk held close to the ground. \n\nThe actions of the elephants are directed towards moving through the grassy area, maintaining a formation where the smaller elephants trail behind the larger one.",
56
+ "cat_name": "elephant",
57
+ "file_name": "00075",
58
+ "isValid": false
59
+ },
60
+ "2": {
61
+ "ref_exp": "The elephant is moving forward with its trunk extended downward.",
62
+ "caption": "In the image, several elephants are depicted in motion. \n\n1. The largest elephant is moving forward, its trunk extended downward, and its ears are flapping. \n2. A smaller elephant is walking closely behind, appearing to be following the larger one, with its trunk raised slightly. \n3. A very small elephant is positioned near the middle, walking on a similar path, with its trunk held close to the ground. \n\nThe actions of the elephants are directed towards moving through the grassy area, maintaining a formation where the smaller elephants trail behind the larger one.",
63
+ "cat_name": "elephant",
64
+ "file_name": "00075",
65
+ "isValid": false
66
+ }
67
+ }
68
+ },
69
+ "013099c098": {
70
+ "00020": {
71
+ "1": {
72
+ "ref_exp": "The giant_panda is climbing on the back of another giant_panda.",
73
+ "caption": "Two giant pandas are engaged in distinct actions. \n\n1. The panda on the left is climbing on the back of the panda on the right, using its front paws to grasp the other panda's shoulders. \n\n2. The panda on the right is seated in a slight stance, with its head lowered and looking towards the ground while water from a small stream splashes around.\n\n3. Water is trickling down from the rocks behind them, creating ripples around the panda on the right as it shifts its weight.\n\n4. The panda on the left is positioned slightly higher, with its body elevated above the other, indicating a playful interaction. \n\nEach action is prominent and clear, showcasing the pandas\u2019 movements distinctly.",
74
+ "cat_name": "giant_panda",
75
+ "file_name": "00020",
76
+ "isValid": false
77
+ },
78
+ "2": {
79
+ "ref_exp": "The giant_panda is climbing on the back of another panda.",
80
+ "caption": "Two giant pandas are engaged in distinct actions. \n\n1. The panda on the left is climbing on the back of the panda on the right, using its front paws to grasp the other panda's shoulders. \n\n2. The panda on the right is seated in a slight stance, with its head lowered and looking towards the ground while water from a small stream splashes around.\n\n3. Water is trickling down from the rocks behind them, creating ripples around the panda on the right as it shifts its weight.\n\n4. The panda on the left is positioned slightly higher, with its body elevated above the other, indicating a playful interaction. \n\nEach action is prominent and clear, showcasing the pandas\u2019 movements distinctly.",
81
+ "cat_name": "giant_panda",
82
+ "file_name": "00020",
83
+ "isValid": true
84
+ }
85
+ },
86
+ "00030": {
87
+ "1": {
88
+ "ref_exp": "The giant panda is splashing water with its paws.",
89
+ "caption": "In the image, one giant panda is positioned beside a small flow of water, using its paws to splash water playfully. The panda appears to be engaging with the water, causing droplets to scatter into the air. Nearby, another panda is focused on grasping something with its mouth, seemingly nibbling or chewing on it. This panda is low to the ground, with its front paws bracing itself as it maintains balance while interacting with the object in front of it. Both pandas are actively involved in their respective actions, creating a dynamic scene.",
90
+ "cat_name": "giant_panda",
91
+ "file_name": "00030",
92
+ "isValid": false
93
+ },
94
+ "2": {
95
+ "ref_exp": "The giant_panda is using its paws to splash water.",
96
+ "caption": "In the image, one giant panda is positioned beside a small flow of water, using its paws to splash water playfully. The panda appears to be engaging with the water, causing droplets to scatter into the air. Nearby, another panda is focused on grasping something with its mouth, seemingly nibbling or chewing on it. This panda is low to the ground, with its front paws bracing itself as it maintains balance while interacting with the object in front of it. Both pandas are actively involved in their respective actions, creating a dynamic scene.",
97
+ "cat_name": "giant_panda",
98
+ "file_name": "00030",
99
+ "isValid": true
100
+ }
101
+ },
102
+ "00050": {
103
+ "1": {
104
+ "ref_exp": "The giant_panda is leaning forward to make contact with the other panda.",
105
+ "caption": "The image features two giant pandas engaged in the following actions:\n\n1. One panda is positioned on its hind legs, interacting with the other panda, which is lying on its back. The upright panda leans forward, making direct contact with the other panda.\n\n2. The panda that is lying on its back uses its front paws to swat playfully at the upright panda, displaying an open posture with its limbs extended.\n\n3. Water droplets are visible on the lying panda as it rolls slightly, resulting in splashes, indicating movement in a wet area.\n\n4. The upright panda appears to gently push down on the other panda\u2019s chest with its front paw while maintaining its position on hind legs.\n\n5. The prostrate panda lifts its head, looking towards the upright panda, showcasing an engaged posture in response to the interaction.",
106
+ "cat_name": "giant_panda",
107
+ "file_name": "00050",
108
+ "isValid": true
109
+ },
110
+ "2": {
111
+ "ref_exp": "The giant_panda interacts playfully with another panda.",
112
+ "caption": "The image features two giant pandas engaged in the following actions:\n\n1. One panda is positioned on its hind legs, interacting with the other panda, which is lying on its back. The upright panda leans forward, making direct contact with the other panda.\n\n2. The panda that is lying on its back uses its front paws to swat playfully at the upright panda, displaying an open posture with its limbs extended.\n\n3. Water droplets are visible on the lying panda as it rolls slightly, resulting in splashes, indicating movement in a wet area.\n\n4. The upright panda appears to gently push down on the other panda\u2019s chest with its front paw while maintaining its position on hind legs.\n\n5. The prostrate panda lifts its head, looking towards the upright panda, showcasing an engaged posture in response to the interaction.",
113
+ "cat_name": "giant_panda",
114
+ "file_name": "00050",
115
+ "isValid": false
116
+ }
117
+ },
118
+ "00070": {
119
+ "1": {
120
+ "ref_exp": "The giant_panda is using its front paws to grip another panda while lying on its back.",
121
+ "caption": "In the image, two giant pandas are engaged in a playful interaction. \n\n1. The first panda is lying on its back and using its front paws to push against the second panda, which is positioned above it.\n2. The second panda is leaning forward, resting its front paws on the first panda's chest, its face directed toward the first panda\u2019s face.\n3. Water droplets are splashing from the fur of both pandas as they move, indicating their active engagement.\n4. The first panda adjusts its position, rolling slightly to one side, while the second panda remains steady on top, maintaining contact through their paws.\n5. Both pandas appear to be using their forelimbs to grip each other playfully. \n\nThese actions illustrate their interaction and physical engagement without any other contextual details.",
122
+ "cat_name": "giant_panda",
123
+ "file_name": "00070",
124
+ "isValid": false
125
+ },
126
+ "2": {
127
+ "ref_exp": "The giant_panda is using its front paws to push against the other giant_panda.",
128
+ "caption": "In the image, two giant pandas are engaged in a playful interaction. \n\n1. The first panda is lying on its back and using its front paws to push against the second panda, which is positioned above it.\n2. The second panda is leaning forward, resting its front paws on the first panda's chest, its face directed toward the first panda\u2019s face.\n3. Water droplets are splashing from the fur of both pandas as they move, indicating their active engagement.\n4. The first panda adjusts its position, rolling slightly to one side, while the second panda remains steady on top, maintaining contact through their paws.\n5. Both pandas appear to be using their forelimbs to grip each other playfully. \n\nThese actions illustrate their interaction and physical engagement without any other contextual details.",
129
+ "cat_name": "giant_panda",
130
+ "file_name": "00070",
131
+ "isValid": false
132
+ }
133
+ }
134
+ },
135
+ "863b4049d7": {
136
+ "00010": null,
137
+ "00065": {
138
+ "1": {
139
+ "ref_exp": "The sheep is grazing.",
140
+ "caption": "In the image, the sheep exhibit various actions:\n\n1. **Grazing**: Several sheep are positioned near the ground, using their mouths to pull at visible patches of grass or forage in the dirt.\n\n2. **Moving**: A group of sheep is shifting from one spot to another, their legs visibly stepping as they walk in different directions.\n\n3. **Standing**: Some sheep are standing still, facing different orientations, with their bodies upright and stable.\n\n4. **Nudging**: A few sheep are seen nudging each other gently with their heads, engaging in a behavior that involves physical contact.\n\n5. **Vocalizing**: Some sheep are shown with their mouths open, indicating they are making noise, contributing to the collective sound of the group.\n\n6. **Resting**: A handful of sheep are positioned with their bodies lowered, likely resting while remaining alert to their surroundings. \n\nEach action demonstrates a specific behavior that reflects the sheep's daily activities.",
141
+ "cat_name": "sheep",
142
+ "file_name": "00065",
143
+ "isValid": false
144
+ },
145
+ "2": {
146
+ "ref_exp": "The sheep is grazing.",
147
+ "caption": "In the image, the sheep exhibit various actions:\n\n1. **Grazing**: Several sheep are positioned near the ground, using their mouths to pull at visible patches of grass or forage in the dirt.\n\n2. **Moving**: A group of sheep is shifting from one spot to another, their legs visibly stepping as they walk in different directions.\n\n3. **Standing**: Some sheep are standing still, facing different orientations, with their bodies upright and stable.\n\n4. **Nudging**: A few sheep are seen nudging each other gently with their heads, engaging in a behavior that involves physical contact.\n\n5. **Vocalizing**: Some sheep are shown with their mouths open, indicating they are making noise, contributing to the collective sound of the group.\n\n6. **Resting**: A handful of sheep are positioned with their bodies lowered, likely resting while remaining alert to their surroundings. \n\nEach action demonstrates a specific behavior that reflects the sheep's daily activities.",
148
+ "cat_name": "sheep",
149
+ "file_name": "00065",
150
+ "isValid": false
151
+ }
152
+ },
153
+ "00115": null,
154
+ "00165": {
155
+ "1": {
156
+ "ref_exp": "The sheep is stepping forward with a foot raised off the ground.",
157
+ "caption": "In the image, several sheep are depicted engaging in distinct actions:\n\n1. One sheep is moving away from the group, stepping to the left with a single foot raised off the ground, suggesting a direction towards the open space.\n2. A cluster of sheep stands closely together, with their heads lowered, indicating they are grazing or examining the ground.\n3. Another sheep is positioned at the back, looking back towards the rest of the group, its head turned to the right.\n4. A few sheep are lined up near a fence, forming a semi-circle, with their bodies angled outward.\n5. Some sheep are standing with their legs apart on the ground, creating a stable posture.\n\nOverall, the actions of the sheep demonstrate various movements and interactions within the group.",
158
+ "cat_name": "sheep",
159
+ "file_name": "00165",
160
+ "isValid": false
161
+ },
162
+ "2": {
163
+ "ref_exp": "The sheep is stepping away from the group.",
164
+ "caption": "In the image, several sheep are depicted engaging in distinct actions:\n\n1. One sheep is moving away from the group, stepping to the left with a single foot raised off the ground, suggesting a direction towards the open space.\n2. A cluster of sheep stands closely together, with their heads lowered, indicating they are grazing or examining the ground.\n3. Another sheep is positioned at the back, looking back towards the rest of the group, its head turned to the right.\n4. A few sheep are lined up near a fence, forming a semi-circle, with their bodies angled outward.\n5. Some sheep are standing with their legs apart on the ground, creating a stable posture.\n\nOverall, the actions of the sheep demonstrate various movements and interactions within the group.",
165
+ "cat_name": "sheep",
166
+ "file_name": "00165",
167
+ "isValid": false
168
+ }
169
+ }
170
+ },
171
+ "c36240d96f": {
172
+ "00035": null,
173
+ "00045": null,
174
+ "00095": null,
175
+ "00120": {
176
+ "1": {
177
+ "ref_exp": "The parrot pecks at the floor near the mirror.",
178
+ "caption": "The image features two parrots interacting with each other and their reflection in a mirror. \n\n1. One parrot approaches the mirror and pauses in front of it, turning its head to look at its reflection.\n2. The second parrot moves closer to the first parrot, tilting its head as it observes the first parrot's actions.\n3. The first parrot begins to peck at the floor near the mirror, using its beak to make contact with the surface.\n4. The second parrot shifts its position slightly, fluttering its wings and then stepping back.\n5. The first parrot flaps its wings in response, appearing to mirror the actions of the second parrot.\n6. Both parrots take turns moving closer to the mirror, occasionally stopping to examine their reflections.",
179
+ "cat_name": "parrot",
180
+ "file_name": "00120",
181
+ "isValid": false
182
+ },
183
+ "2": {
184
+ "ref_exp": "The parrot pecks at the floor near the mirror.",
185
+ "caption": "The image features two parrots interacting with each other and their reflection in a mirror. \n\n1. One parrot approaches the mirror and pauses in front of it, turning its head to look at its reflection.\n2. The second parrot moves closer to the first parrot, tilting its head as it observes the first parrot's actions.\n3. The first parrot begins to peck at the floor near the mirror, using its beak to make contact with the surface.\n4. The second parrot shifts its position slightly, fluttering its wings and then stepping back.\n5. The first parrot flaps its wings in response, appearing to mirror the actions of the second parrot.\n6. Both parrots take turns moving closer to the mirror, occasionally stopping to examine their reflections.",
186
+ "cat_name": "parrot",
187
+ "file_name": "00120",
188
+ "isValid": true
189
+ }
190
+ }
191
+ },
192
+ "f66981af4e": null,
193
+ "ef45ce3035": {
194
+ "00035": {
195
+ "1": {
196
+ "ref_exp": "The earless_seal is resting on a rocky surface.",
197
+ "caption": "In the image, several earless seals are observed engaged in various actions:\n\n1. One seal is swimming close to the surface, its body largely submerged while its head is above the waterline.\n2. Another seal is diving deeper, with its flippers spread as it propels itself downwards.\n3. Several seals are resting on a rocky surface, positioned on their sides with their bodies mostly still.\n4. One seal is nudging another with its snout while floating, demonstrating interaction.\n5. A seal is splashing water with its flippers while maneuvering in the shallow area near the rocks.\n6. Another seal is emerging from the water, shaking its body to remove droplets as it climbs onto the rocky shoreline.",
198
+ "cat_name": "earless_seal",
199
+ "file_name": "00035",
200
+ "isValid": true
201
+ },
202
+ "2": {}
203
+ },
204
+ "00065": {
205
+ "1": {
206
+ "ref_exp": "The earless_seal is resting on the surface.",
207
+ "caption": "In the image, the earless seals exhibit several distinct actions:\n\n1. One seal is resting on a rock, positioned against the water, with its body sprawled comfortably on the surface.\n2. Another seal is swimming in the water, partially submerged, with its head and upper body visible as it moves forward.\n3. A group of seals is engaged in a playful interaction, splashing the water around them, causing ripples and waves.\n4. A seal is seen diving below the surface, its streamlined body disappearing as it moves downward.\n5. Several seals are clustered together, appearing to engage in social behavior, while others are scattered in various positions throughout the water.",
208
+ "cat_name": "earless_seal",
209
+ "file_name": "00065",
210
+ "isValid": true
211
+ },
212
+ "2": {
213
+ "ref_exp": "The earless_seal is positioned on the ground with its body partially submerged in the water.",
214
+ "caption": "In the image, the earless seals exhibit several distinct actions:\n\n1. One seal is resting on a rock, positioned against the water, with its body sprawled comfortably on the surface.\n2. Another seal is swimming in the water, partially submerged, with its head and upper body visible as it moves forward.\n3. A group of seals is engaged in a playful interaction, splashing the water around them, causing ripples and waves.\n4. A seal is seen diving below the surface, its streamlined body disappearing as it moves downward.\n5. Several seals are clustered together, appearing to engage in social behavior, while others are scattered in various positions throughout the water.",
215
+ "cat_name": "earless_seal",
216
+ "file_name": "00065",
217
+ "isValid": true
218
+ }
219
+ },
220
+ "00120": {
221
+ "1": {
222
+ "ref_exp": "The earless_seal is climbing onto a rock using its flippers.",
223
+ "caption": "In the image, several earless seals are engaged in distinct actions:\n\n1. One seal is resting on a smooth rock, positioned upright with its body supported by its flippers.\n2. Another seal is partially submerged in the water, with its head above the surface while its body is mainly underwater.\n3. A seal is swimming, moving through the water with its body streamlined, creating ripples around it.\n4. Two seals are interacting close to each other, appearing to engage in playful behavior, possibly splashing water.\n5. A seal is climbing onto a rock, using its flippers to push itself up and elevate its body from the water.\n6. One seal is yawning, displaying its mouth wide open while remaining on a rock. \n\nEach action is clear and distinct, showcasing the behaviors of the earless seals in their environment.",
224
+ "cat_name": "earless_seal",
225
+ "file_name": "00120",
226
+ "isValid": false
227
+ },
228
+ "2": {
229
+ "ref_exp": "The earless_seal is climbing onto a rock.",
230
+ "caption": "In the image, several earless seals are engaged in distinct actions:\n\n1. One seal is resting on a smooth rock, positioned upright with its body supported by its flippers.\n2. Another seal is partially submerged in the water, with its head above the surface while its body is mainly underwater.\n3. A seal is swimming, moving through the water with its body streamlined, creating ripples around it.\n4. Two seals are interacting close to each other, appearing to engage in playful behavior, possibly splashing water.\n5. A seal is climbing onto a rock, using its flippers to push itself up and elevate its body from the water.\n6. One seal is yawning, displaying its mouth wide open while remaining on a rock. \n\nEach action is clear and distinct, showcasing the behaviors of the earless seals in their environment.",
231
+ "cat_name": "earless_seal",
232
+ "file_name": "00120",
233
+ "isValid": false
234
+ }
235
+ },
236
+ "00165": {
237
+ "1": {
238
+ "ref_exp": "The earless seal interacts with the water, creating ripples as it splashes with its flippers.",
239
+ "caption": "In the image, several earless seals are engaged in various actions:\n\n1. An earless seal lies on a rock, positioned upright and utilizing its flippers for balance.\n2. Another seal is submerged in the water, with only its head visible above the surface, actively paddling with its fore flippers.\n3. A group of seals swims near the surface, moving in synchrony while occasionally basking in the water.\n4. One seal is climbing onto a rocky ledge, using its body to push against the stone for support.\n5. Another seal is interacting with the water, creating ripples as it splashes with its flippers.\n6. In the background, two seals are positioned adjacent to each other, grooming themselves by scratching at their fur.",
240
+ "cat_name": "earless_seal",
241
+ "file_name": "00165",
242
+ "isValid": true
243
+ },
244
+ "2": {
245
+ "ref_exp": "The earless_seal is lying on a rock and using its flippers for balance.",
246
+ "caption": "In the image, several earless seals are engaged in various actions:\n\n1. An earless seal lies on a rock, positioned upright and utilizing its flippers for balance.\n2. Another seal is submerged in the water, with only its head visible above the surface, actively paddling with its fore flippers.\n3. A group of seals swims near the surface, moving in synchrony while occasionally basking in the water.\n4. One seal is climbing onto a rocky ledge, using its body to push against the stone for support.\n5. Another seal is interacting with the water, creating ripples as it splashes with its flippers.\n6. In the background, two seals are positioned adjacent to each other, grooming themselves by scratching at their fur.",
247
+ "cat_name": "earless_seal",
248
+ "file_name": "00165",
249
+ "isValid": false
250
+ }
251
+ }
252
+ },
253
+ "750be4c4d8": {
254
+ "00065": {
255
+ "1": {
256
+ "ref_exp": "The person is walking away from the buses.",
257
+ "caption": "The image depicts several individuals interacting with buses in a bus yard. \n\n1. One person in a red shirt is pointing at the bus, likely directing attention toward it.\n2. Another person wearing a yellow vest is walking towards the bus with a clipboard in hand.\n3. A third individual is standing still, watching the buses, possibly assessing the situation.\n4. An individual with a backpack is walking away from the buses, moving toward the left side of the image.\n\nThe buses appear to be articulated, connected at their center, and are positioned in the yard.",
258
+ "cat_name": "person",
259
+ "file_name": "00065",
260
+ "isValid": false
261
+ },
262
+ "2": {
263
+ "ref_exp": "The person is pointing at the bus.",
264
+ "caption": "The image depicts several individuals interacting with buses in a bus yard. \n\n1. One person in a red shirt is pointing at the bus, likely directing attention toward it.\n2. Another person wearing a yellow vest is walking towards the bus with a clipboard in hand.\n3. A third individual is standing still, watching the buses, possibly assessing the situation.\n4. An individual with a backpack is walking away from the buses, moving toward the left side of the image.\n\nThe buses appear to be articulated, connected at their center, and are positioned in the yard.",
265
+ "cat_name": "person",
266
+ "file_name": "00065",
267
+ "isValid": false
268
+ },
269
+ "3": {
270
+ "ref_exp": "The person is pointing at the bus.",
271
+ "caption": "The image depicts several individuals interacting with buses in a bus yard. \n\n1. One person in a red shirt is pointing at the bus, likely directing attention toward it.\n2. Another person wearing a yellow vest is walking towards the bus with a clipboard in hand.\n3. A third individual is standing still, watching the buses, possibly assessing the situation.\n4. An individual with a backpack is walking away from the buses, moving toward the left side of the image.\n\nThe buses appear to be articulated, connected at their center, and are positioned in the yard.",
272
+ "cat_name": "person",
273
+ "file_name": "00065",
274
+ "isValid": false
275
+ }
276
+ },
277
+ "00090": {
278
+ "1": {
279
+ "ref_exp": "A person is standing with a backpack, observing a bus.",
280
+ "caption": "In the image, multiple actions are taking place:\n\n1. A person is standing with a backpack, facing a yellow and blue bus, observing it.\n2. Several individuals are grouped together, appearing to converse or wait near another bus in the background.\n3. A bus is parked with its doors closed, indicating that it is not currently in use.\n4. A small group of children is nearby, some appearing to play or engage in activity.\n5. Another bus is positioned slightly ahead, and a few individuals are walking around it, possibly preparing to board or disembark.\n6. A larger vehicle, likely a truck, is positioned in the background, not in direct interaction with the people. \n\nEach action clearly reflects movement in relation to the buses within the scene.",
281
+ "cat_name": "person",
282
+ "file_name": "00090",
283
+ "isValid": true
284
+ },
285
+ "2": {
286
+ "ref_exp": "Person standing and observing a bus.",
287
+ "caption": "In the image, multiple actions are taking place:\n\n1. A person is standing with a backpack, facing a yellow and blue bus, observing it.\n2. Several individuals are grouped together, appearing to converse or wait near another bus in the background.\n3. A bus is parked with its doors closed, indicating that it is not currently in use.\n4. A small group of children is nearby, some appearing to play or engage in activity.\n5. Another bus is positioned slightly ahead, and a few individuals are walking around it, possibly preparing to board or disembark.\n6. A larger vehicle, likely a truck, is positioned in the background, not in direct interaction with the people. \n\nEach action clearly reflects movement in relation to the buses within the scene.",
288
+ "cat_name": "person",
289
+ "file_name": "00090",
290
+ "isValid": true
291
+ },
292
+ "3": {
293
+ "ref_exp": "Person is standing with a backpack and observing a bus.",
294
+ "caption": "In the image, multiple actions are taking place:\n\n1. A person is standing with a backpack, facing a yellow and blue bus, observing it.\n2. Several individuals are grouped together, appearing to converse or wait near another bus in the background.\n3. A bus is parked with its doors closed, indicating that it is not currently in use.\n4. A small group of children is nearby, some appearing to play or engage in activity.\n5. Another bus is positioned slightly ahead, and a few individuals are walking around it, possibly preparing to board or disembark.\n6. A larger vehicle, likely a truck, is positioned in the background, not in direct interaction with the people. \n\nEach action clearly reflects movement in relation to the buses within the scene.",
295
+ "cat_name": "person",
296
+ "file_name": "00090",
297
+ "isValid": false
298
+ }
299
+ },
300
+ "00115": {
301
+ "1": {
302
+ "ref_exp": "Person walking towards the yellow bus.",
303
+ "caption": "In the image, several actions are taking place:\n\n1. A yellow bus is positioned at the center, with its back doors opened, indicating it's either loading or unloading passengers.\n2. A group of individuals, including a small child, is positioned to the left of the yellow bus, standing and facing towards it, some appearing to engage in conversation.\n3. Another individual is walking towards the yellow bus, carrying a backpack, potentially intending to board.\n4. A person in a red jacket is near a second bus, with their body turned slightly towards it, suggesting they are looking at the bus.\n5. In the background, a few figures are standing near a building, possibly waiting or observing the area.\n\nThese actions are distinct and highlight the interactions around the buses.",
304
+ "cat_name": "person",
305
+ "file_name": "00115",
306
+ "isValid": false
307
+ },
308
+ "2": {
309
+ "ref_exp": "The person is looking at the bus.",
310
+ "caption": "In the image, several actions are taking place:\n\n1. A yellow bus is positioned at the center, with its back doors opened, indicating it's either loading or unloading passengers.\n2. A group of individuals, including a small child, is positioned to the left of the yellow bus, standing and facing towards it, some appearing to engage in conversation.\n3. Another individual is walking towards the yellow bus, carrying a backpack, potentially intending to board.\n4. A person in a red jacket is near a second bus, with their body turned slightly towards it, suggesting they are looking at the bus.\n5. In the background, a few figures are standing near a building, possibly waiting or observing the area.\n\nThese actions are distinct and highlight the interactions around the buses.",
311
+ "cat_name": "person",
312
+ "file_name": "00115",
313
+ "isValid": false
314
+ },
315
+ "3": {
316
+ "ref_exp": "A person is standing near a second bus.",
317
+ "caption": "In the image, several actions are taking place:\n\n1. A yellow bus is positioned at the center, with its back doors opened, indicating it's either loading or unloading passengers.\n2. A group of individuals, including a small child, is positioned to the left of the yellow bus, standing and facing towards it, some appearing to engage in conversation.\n3. Another individual is walking towards the yellow bus, carrying a backpack, potentially intending to board.\n4. A person in a red jacket is near a second bus, with their body turned slightly towards it, suggesting they are looking at the bus.\n5. In the background, a few figures are standing near a building, possibly waiting or observing the area.\n\nThese actions are distinct and highlight the interactions around the buses.",
318
+ "cat_name": "person",
319
+ "file_name": "00115",
320
+ "isValid": false
321
+ }
322
+ },
323
+ "00125": {
324
+ "1": {
325
+ "ref_exp": "The person is walking away from the bus.",
326
+ "caption": "In the image, a yellow and blue bus is maneuvering, likely making a turn or preparing to depart. Nearby, a group of people is gathered, with some individuals walking in different directions. One person is carrying a bag and walking away from the bus, while another group appears to be interacting near the bus. A fire truck is parked in the background, with its lights visible. Children can be seen moving around the area, possibly playing or exploring. The setting suggests an active scene with various actions unfolding around the bus.",
327
+ "cat_name": "person",
328
+ "file_name": "00125",
329
+ "isValid": true
330
+ },
331
+ "2": {
332
+ "ref_exp": "The person is standing by the bus.",
333
+ "caption": "In the image, a yellow and blue bus is maneuvering, likely making a turn or preparing to depart. Nearby, a group of people is gathered, with some individuals walking in different directions. One person is carrying a bag and walking away from the bus, while another group appears to be interacting near the bus. A fire truck is parked in the background, with its lights visible. Children can be seen moving around the area, possibly playing or exploring. The setting suggests an active scene with various actions unfolding around the bus.",
334
+ "cat_name": "person",
335
+ "file_name": "00125",
336
+ "isValid": true
337
+ },
338
+ "3": {
339
+ "ref_exp": "Person is standing and interacting with others.",
340
+ "caption": "In the image, a yellow and blue bus is maneuvering, likely making a turn or preparing to depart. Nearby, a group of people is gathered, with some individuals walking in different directions. One person is carrying a bag and walking away from the bus, while another group appears to be interacting near the bus. A fire truck is parked in the background, with its lights visible. Children can be seen moving around the area, possibly playing or exploring. The setting suggests an active scene with various actions unfolding around the bus.",
341
+ "cat_name": "person",
342
+ "file_name": "00125",
343
+ "isValid": false
344
+ }
345
+ }
346
+ },
347
+ "c307f33da2": {
348
+ "00225": {
349
+ "1": {
350
+ "ref_exp": "The giraffe extends its neck towards a person offering food.",
351
+ "caption": "In the image, one giraffe extends its long neck towards a person holding a piece of green leafy food. The giraffe's mouth opens slightly, indicating an action of reaching or preparing to take the food. Another giraffe, positioned slightly behind the first, appears to be standing still, not actively engaged in the feeding process. The person is also leaning forward with an outstretched arm, clearly directing the food towards the giraffe. In the background, several other individuals are observing the scene, but their actions are not the focus of this description.",
352
+ "cat_name": "giraffe",
353
+ "file_name": "00225",
354
+ "isValid": true
355
+ },
356
+ "2": {
357
+ "ref_exp": "The giraffe reaches towards the person holding food.",
358
+ "caption": "In the image, one giraffe extends its long neck towards a person holding a piece of green leafy food. The giraffe's mouth opens slightly, indicating an action of reaching or preparing to take the food. Another giraffe, positioned slightly behind the first, appears to be standing still, not actively engaged in the feeding process. The person is also leaning forward with an outstretched arm, clearly directing the food towards the giraffe. In the background, several other individuals are observing the scene, but their actions are not the focus of this description.",
359
+ "cat_name": "giraffe",
360
+ "file_name": "00225",
361
+ "isValid": true
362
+ },
363
+ "3": {}
364
+ },
365
+ "00245": {
366
+ "1": {
367
+ "ref_exp": "The giraffe extends its neck to grasp leaves from a woman's hand.",
368
+ "caption": "In the image, one giraffe extends its long neck towards a woman holding green leaves, using its tongue to grasp the leaves from her hand. This giraffe seems focused on the leaves. A second giraffe is visible in the background, standing upright with its body facing away from the viewer. The woman's arm is extended while she remains stationary, and she appears to be interacting with the giraffe. The leaves are prominently held in front of the giraffe's mouth as it reaches for them.",
369
+ "cat_name": "giraffe",
370
+ "file_name": "00245",
371
+ "isValid": true
372
+ },
373
+ "2": {
374
+ "ref_exp": "The giraffe extends its neck to grasp leaves from a woman's hand.",
375
+ "caption": "In the image, one giraffe extends its long neck towards a woman holding green leaves, using its tongue to grasp the leaves from her hand. This giraffe seems focused on the leaves. A second giraffe is visible in the background, standing upright with its body facing away from the viewer. The woman's arm is extended while she remains stationary, and she appears to be interacting with the giraffe. The leaves are prominently held in front of the giraffe's mouth as it reaches for them.",
376
+ "cat_name": "giraffe",
377
+ "file_name": "00245",
378
+ "isValid": true
379
+ },
380
+ "3": {}
381
+ },
382
+ "00265": {
383
+ "1": {
384
+ "ref_exp": "The giraffe extends its neck toward the woman, reaching out its tongue to grasp the greens being offered.",
385
+ "caption": "The image shows two giraffes engaging in distinct actions. \n\n1. The closest giraffe extends its neck toward a woman holding a cluster of leafy greens. Its head is tilted forward, and its tongue is reaching out, likely to grasp the food being offered.\n\n2. A second giraffe is standing slightly behind, facing away from the camera. It is standing upright, with its legs straight and its posture relaxed, while observing the interaction between the first giraffe and the woman.\n\n3. The woman, holding a child, is extending her arm, presenting the greens in an upward motion towards the giraffe.\n\n4. A child in her arms is looking toward the giraffe, with a curious expression as the animal approaches. \n\nThe scene captures the direct interaction and feeding behavior of giraffes in response to human presence.",
386
+ "cat_name": "giraffe",
387
+ "file_name": "00265",
388
+ "isValid": true
389
+ },
390
+ "2": {
391
+ "ref_exp": "The giraffe extends its neck toward the woman offering food.",
392
+ "caption": "The image shows two giraffes engaging in distinct actions. \n\n1. The closest giraffe extends its neck toward a woman holding a cluster of leafy greens. Its head is tilted forward, and its tongue is reaching out, likely to grasp the food being offered.\n\n2. A second giraffe is standing slightly behind, facing away from the camera. It is standing upright, with its legs straight and its posture relaxed, while observing the interaction between the first giraffe and the woman.\n\n3. The woman, holding a child, is extending her arm, presenting the greens in an upward motion towards the giraffe.\n\n4. A child in her arms is looking toward the giraffe, with a curious expression as the animal approaches. \n\nThe scene captures the direct interaction and feeding behavior of giraffes in response to human presence.",
393
+ "cat_name": "giraffe",
394
+ "file_name": "00265",
395
+ "isValid": true
396
+ },
397
+ "3": {
398
+ "ref_exp": "The giraffe extends its neck toward a woman offering food.",
399
+ "caption": "The image shows two giraffes engaging in distinct actions. \n\n1. The closest giraffe extends its neck toward a woman holding a cluster of leafy greens. Its head is tilted forward, and its tongue is reaching out, likely to grasp the food being offered.\n\n2. A second giraffe is standing slightly behind, facing away from the camera. It is standing upright, with its legs straight and its posture relaxed, while observing the interaction between the first giraffe and the woman.\n\n3. The woman, holding a child, is extending her arm, presenting the greens in an upward motion towards the giraffe.\n\n4. A child in her arms is looking toward the giraffe, with a curious expression as the animal approaches. \n\nThe scene captures the direct interaction and feeding behavior of giraffes in response to human presence.",
400
+ "cat_name": "giraffe",
401
+ "file_name": "00265",
402
+ "isValid": true
403
+ }
404
+ },
405
+ "00275": {
406
+ "1": {
407
+ "ref_exp": "The giraffe reaches forward to grasp food offered by a person.",
408
+ "caption": "In the image, one giraffe extends its neck forward, reaching toward a person holding out a piece of food. Its long tongue emerges to grasp the food offered. Another giraffe stands nearby, with its head turned slightly, observing the interaction. The first giraffe's movements are distinct as it leans in to eat, while the second giraffe remains stationary, seemingly uninterested in the immediate food. The person extends their arm, actively presenting the food to the giraffe.",
409
+ "cat_name": "giraffe",
410
+ "file_name": "00275",
411
+ "isValid": true
412
+ },
413
+ "2": {
414
+ "ref_exp": "The giraffe extends its neck forward to grasp the food offered.",
415
+ "caption": "In the image, one giraffe extends its neck forward, reaching toward a person holding out a piece of food. Its long tongue emerges to grasp the food offered. Another giraffe stands nearby, with its head turned slightly, observing the interaction. The first giraffe's movements are distinct as it leans in to eat, while the second giraffe remains stationary, seemingly uninterested in the immediate food. The person extends their arm, actively presenting the food to the giraffe.",
416
+ "cat_name": "giraffe",
417
+ "file_name": "00275",
418
+ "isValid": true
419
+ },
420
+ "3": {
421
+ "ref_exp": "The giraffe extends its neck to grasp food from a person.",
422
+ "caption": "In the image, one giraffe extends its neck forward, reaching toward a person holding out a piece of food. Its long tongue emerges to grasp the food offered. Another giraffe stands nearby, with its head turned slightly, observing the interaction. The first giraffe's movements are distinct as it leans in to eat, while the second giraffe remains stationary, seemingly uninterested in the immediate food. The person extends their arm, actively presenting the food to the giraffe.",
423
+ "cat_name": "giraffe",
424
+ "file_name": "00275",
425
+ "isValid": true
426
+ }
427
+ }
428
+ },
429
+ "9877af5063": {
430
+ "00040": {
431
+ "1": {
432
+ "ref_exp": "The sheep extends its mouth toward the stump.",
433
+ "caption": "In the image, the black sheep is approached closely to a wooden stump. \n\n1. The black sheep stands next to the stump, leaning its head forward.\n2. The sheep extends its mouth toward the stump, making contact with the surface.\n3. The sheep tilts its head slightly, pulling back after an interaction with the stump.\n4. In the background, another sheep is seen grazing, bending its neck to eat grass.\n\nThese actions highlight the sheep's engagement with its immediate environment, particularly the wooden stump.",
434
+ "cat_name": "sheep",
435
+ "file_name": "00040",
436
+ "isValid": false
437
+ },
438
+ "2": {
439
+ "ref_exp": "The sheep extends its mouth toward the stump.",
440
+ "caption": "In the image, the black sheep is approached closely to a wooden stump. \n\n1. The black sheep stands next to the stump, leaning its head forward.\n2. The sheep extends its mouth toward the stump, making contact with the surface.\n3. The sheep tilts its head slightly, pulling back after an interaction with the stump.\n4. In the background, another sheep is seen grazing, bending its neck to eat grass.\n\nThese actions highlight the sheep's engagement with its immediate environment, particularly the wooden stump.",
441
+ "cat_name": "sheep",
442
+ "file_name": "00040",
443
+ "isValid": true
444
+ }
445
+ },
446
+ "00055": null,
447
+ "00090": {
448
+ "1": {
449
+ "ref_exp": "The sheep is inspecting a broken tree stump.",
450
+ "caption": "The image features several sheep engaged in various actions:\n\n1. One black lamb stands near a broken tree stump, inspecting it closely.\n2. The lamb appears to nibble at the exposed wood, using its mouth to pull fibers.\n3. Another sheep, in the background, grazes on the grass, using its head to brush against the ground.\n4. A third sheep is seen moving its head side to side, possibly looking for additional grazing spots. \n5. Occasionally, the black lamb shifts its weight, adjusting its stance while still interacting with the stump. \n\nThese actions are clear and distinct within the scene.",
451
+ "cat_name": "sheep",
452
+ "file_name": "00090",
453
+ "isValid": true
454
+ },
455
+ "2": {
456
+ "ref_exp": "The sheep is inspecting and nibbling at the broken tree stump.",
457
+ "caption": "The image features several sheep engaged in various actions:\n\n1. One black lamb stands near a broken tree stump, inspecting it closely.\n2. The lamb appears to nibble at the exposed wood, using its mouth to pull fibers.\n3. Another sheep, in the background, grazes on the grass, using its head to brush against the ground.\n4. A third sheep is seen moving its head side to side, possibly looking for additional grazing spots. \n5. Occasionally, the black lamb shifts its weight, adjusting its stance while still interacting with the stump. \n\nThese actions are clear and distinct within the scene.",
458
+ "cat_name": "sheep",
459
+ "file_name": "00090",
460
+ "isValid": true
461
+ }
462
+ },
463
+ "00155": null
464
+ }
465
+ }
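For orientation, the JSON above nests video id → frame id → object id → {ref_exp, caption, cat_name, file_name, isValid}, with some frames set to null and some object slots left as empty dicts. Below is a minimal, hedged sketch for pulling out only the validated referring expressions; the path mbench/result.json and the helper name collect_valid_refs are illustrative assumptions, not part of this commit.

import json

def collect_valid_refs(path="mbench/result.json"):
    # video id -> frame id -> object id -> annotation dict (may be {} or null)
    with open(path) as f:
        data = json.load(f)
    rows = []
    for vid, frames in data.items():
        for frame, objs in (frames or {}).items():
            for obj_id, ann in (objs or {}).items():
                if ann and ann.get("isValid"):
                    rows.append((vid, frame, obj_id, ann["cat_name"], ann["ref_exp"]))
    return rows

if __name__ == "__main__":
    for row in collect_valid_refs()[:5]:
        print(row)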
models/__init__.py ADDED
@@ -0,0 +1,5 @@

1
+ from .referformer import build
2
+
3
+
4
+ def build_model(args):
5
+ return build(args)
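models/__init__.py is a thin dispatch layer: build_model(args) simply forwards an argparse-style namespace to the ReferFormer builder. A minimal self-contained sketch of that pattern follows; the build stub and the argument names are placeholders, not the real models/referformer.py.

from argparse import Namespace

def build(args):
    # stand-in for models.referformer.build; the real builder assembles
    # the backbone, transformer, criterion and postprocessors from args
    return {"backbone": args.backbone, "hidden_dim": args.hidden_dim}

def build_model(args):
    # package-level entry point, mirroring models/__init__.py above
    return build(args)

if __name__ == "__main__":
    args = Namespace(backbone="resnet50", hidden_dim=256)
    print(build_model(args))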
models/backbone.py ADDED
@@ -0,0 +1,132 @@
1
+ """
2
+ Backbone modules.
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ from collections import OrderedDict
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torchvision
10
+ from torch import nn
11
+ from torchvision.models._utils import IntermediateLayerGetter
12
+ from typing import Dict, List
13
+ from einops import rearrange
14
+
15
+ from util.misc import NestedTensor, is_main_process
16
+
17
+ from .position_encoding import build_position_encoding
18
+
19
+
20
+ class FrozenBatchNorm2d(torch.nn.Module):
21
+ """
22
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
23
+
24
+ Copy-paste from torchvision.misc.ops with added eps before rsqrt,
25
+ without which any other models than torchvision.models.resnet[18,34,50,101]
26
+ produce nans.
27
+ """
28
+
29
+ def __init__(self, n):
30
+ super(FrozenBatchNorm2d, self).__init__()
31
+ self.register_buffer("weight", torch.ones(n))
32
+ self.register_buffer("bias", torch.zeros(n))
33
+ self.register_buffer("running_mean", torch.zeros(n))
34
+ self.register_buffer("running_var", torch.ones(n))
35
+
36
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
37
+ missing_keys, unexpected_keys, error_msgs):
38
+ num_batches_tracked_key = prefix + 'num_batches_tracked'
39
+ if num_batches_tracked_key in state_dict:
40
+ del state_dict[num_batches_tracked_key]
41
+
42
+ super(FrozenBatchNorm2d, self)._load_from_state_dict(
43
+ state_dict, prefix, local_metadata, strict,
44
+ missing_keys, unexpected_keys, error_msgs)
45
+
46
+ def forward(self, x):
47
+ # move reshapes to the beginning
48
+ # to make it fuser-friendly
49
+ w = self.weight.reshape(1, -1, 1, 1)
50
+ b = self.bias.reshape(1, -1, 1, 1)
51
+ rv = self.running_var.reshape(1, -1, 1, 1)
52
+ rm = self.running_mean.reshape(1, -1, 1, 1)
53
+ eps = 1e-5
54
+ scale = w * (rv + eps).rsqrt()
55
+ bias = b - rm * scale
56
+ return x * scale + bias
57
+
58
+
59
+ class BackboneBase(nn.Module):
60
+
61
+ def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool):
62
+ super().__init__()
63
+ for name, parameter in backbone.named_parameters():
64
+ if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
65
+ parameter.requires_grad_(False)
66
+ if return_interm_layers:
67
+ return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
68
+ # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} deformable detr
69
+ self.strides = [4, 8, 16, 32]
70
+ self.num_channels = [256, 512, 1024, 2048]
71
+ else:
72
+ return_layers = {'layer4': "0"}
73
+ self.strides = [32]
74
+ self.num_channels = [2048]
75
+ self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
76
+
77
+ def forward(self, tensor_list: NestedTensor):
78
+ xs = self.body(tensor_list.tensors)
79
+ out: Dict[str, NestedTensor] = {}
80
+ for name, x in xs.items():
81
+ m = tensor_list.mask
82
+ assert m is not None
83
+ mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
84
+ out[name] = NestedTensor(x, mask)
85
+ return out
86
+
87
+
88
+ class Backbone(BackboneBase):
89
+ """ResNet backbone with frozen BatchNorm."""
90
+ def __init__(self, name: str,
91
+ train_backbone: bool,
92
+ return_interm_layers: bool,
93
+ dilation: bool):
94
+ backbone = getattr(torchvision.models, name)(
95
+ replace_stride_with_dilation=[False, False, dilation],
96
+ pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
97
+ assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded"
98
+ super().__init__(backbone, train_backbone, return_interm_layers)
99
+ if dilation:
100
+ self.strides[-1] = self.strides[-1] // 2
101
+
102
+
103
+ class Joiner(nn.Sequential):
104
+ def __init__(self, backbone, position_embedding):
105
+ super().__init__(backbone, position_embedding)
106
+ self.strides = backbone.strides
107
+ self.num_channels = backbone.num_channels
108
+
109
+
110
+ def forward(self, tensor_list: NestedTensor):
111
+ tensor_list.tensors = rearrange(tensor_list.tensors, 'b t c h w -> (b t) c h w')
112
+ tensor_list.mask = rearrange(tensor_list.mask, 'b t h w -> (b t) h w')
113
+
114
+ xs = self[0](tensor_list)
115
+ out: List[NestedTensor] = []
116
+ pos = []
117
+ for name, x in xs.items():
118
+ out.append(x)
119
+ # position encoding
120
+ pos.append(self[1](x).to(x.tensors.dtype))
121
+ return out, pos
122
+
123
+
124
+ def build_backbone(args):
125
+ position_embedding = build_position_encoding(args)
126
+ train_backbone = args.lr_backbone > 0
127
+ return_interm_layers = args.masks or (args.num_feature_levels > 1)
128
+ backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
129
+ model = Joiner(backbone, position_embedding)
130
+ model.num_channels = backbone.num_channels
131
+ return model
132
+
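FrozenBatchNorm2d above folds the fixed running statistics and affine parameters into a single scale and bias, i.e. y = x * w / sqrt(var + eps) + (b - mean * w / sqrt(var + eps)). The short sketch below reproduces that arithmetic with plain tensors and checks it against torch.nn.BatchNorm2d in eval mode; it is a sanity check under the assumption eps = 1e-5 (as in the diff), not code from this commit.

import torch
import torch.nn as nn

torch.manual_seed(0)
n, eps = 8, 1e-5
bn = nn.BatchNorm2d(n, eps=eps).eval()
# give the running statistics and affine parameters non-trivial values
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-0.5, 0.5)

x = torch.randn(2, n, 4, 4)
with torch.no_grad():
    w = bn.weight.reshape(1, -1, 1, 1)
    b = bn.bias.reshape(1, -1, 1, 1)
    rv = bn.running_var.reshape(1, -1, 1, 1)
    rm = bn.running_mean.reshape(1, -1, 1, 1)
    scale = w * (rv + eps).rsqrt()
    frozen_out = x * scale + (b - rm * scale)  # FrozenBatchNorm2d.forward, inlined
    ref_out = bn(x)
print(torch.allclose(frozen_out, ref_out, atol=1e-5))  # expected: True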
models/criterion.py ADDED
@@ -0,0 +1,208 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+ from util import box_ops
6
+ from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
7
+ accuracy, get_world_size, interpolate,
8
+ is_dist_avail_and_initialized, inverse_sigmoid)
9
+
10
+ from .segmentation import (dice_loss, sigmoid_focal_loss)
11
+
12
+ from einops import rearrange
13
+
14
+ class SetCriterion(nn.Module):
15
+ """ This class computes the loss for ReferFormer.
16
+ The process happens in two steps:
17
+ 1) we compute hungarian assignment between ground truth boxes and the outputs of the model
18
+ 2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
19
+ """
20
+ def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, focal_alpha=0.25):
21
+ """ Create the criterion.
22
+ Parameters:
23
+ num_classes: number of object categories, omitting the special no-object category
24
+ matcher: module able to compute a matching between targets and proposals
25
+ weight_dict: dict containing as key the names of the losses and as values their relative weight.
26
+ eos_coef: relative classification weight applied to the no-object category
27
+ losses: list of all the losses to be applied. See get_loss for list of available losses.
28
+ """
29
+ super().__init__()
30
+ self.num_classes = num_classes
31
+ self.matcher = matcher
32
+ self.weight_dict = weight_dict
33
+ self.eos_coef = eos_coef
34
+ self.losses = losses
35
+ empty_weight = torch.ones(self.num_classes + 1)
36
+ empty_weight[-1] = self.eos_coef
37
+ self.register_buffer('empty_weight', empty_weight)
38
+ self.focal_alpha = focal_alpha
39
+ self.mask_out_stride = 4
40
+
41
+ def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
42
+ """Classification loss (NLL)
43
+ targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
44
+ """
45
+ assert 'pred_logits' in outputs
46
+ src_logits = outputs['pred_logits']
47
+ _, nf, nq = src_logits.shape[:3]
48
+ src_logits = rearrange(src_logits, 'b t q k -> b (t q) k')
49
+
50
+ # determine which frames are valid
51
+ valid_indices = []
52
+ valids = [target['valid'] for target in targets]
53
+ for valid, (indice_i, indice_j) in zip(valids, indices):
54
+ valid_ind = valid.nonzero().flatten()
55
+ valid_i = valid_ind * nq + indice_i
56
+ valid_j = valid_ind + indice_j * nf
57
+ valid_indices.append((valid_i, valid_j))
58
+
59
+ idx = self._get_src_permutation_idx(valid_indices) # NOTE: use valid indices
60
+ target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, valid_indices)])
61
+ target_classes = torch.full(src_logits.shape[:2], self.num_classes,
62
+ dtype=torch.int64, device=src_logits.device)
63
+ if self.num_classes == 1: # binary referred
64
+ target_classes[idx] = 0
65
+ else:
66
+ target_classes[idx] = target_classes_o
67
+
68
+ target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
69
+ dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)
70
+ target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
71
+
72
+ target_classes_onehot = target_classes_onehot[:,:,:-1]
73
+ loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1]
74
+ losses = {'loss_ce': loss_ce}
75
+
76
+ if log:
77
+ # TODO this should probably be a separate loss, not hacked in this one here
78
+ pass
79
+ return losses
80
+
81
+
82
+ def loss_boxes(self, outputs, targets, indices, num_boxes):
83
+ """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
84
+ targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
85
+ The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
86
+ """
87
+ assert 'pred_boxes' in outputs
88
+ src_boxes = outputs['pred_boxes']
89
+ bs, nf, nq = src_boxes.shape[:3]
90
+ src_boxes = src_boxes.transpose(1, 2)
91
+
92
+ idx = self._get_src_permutation_idx(indices)
93
+ src_boxes = src_boxes[idx]
94
+ src_boxes = src_boxes.flatten(0, 1) # [b*t, 4]
95
+
96
+ target_boxes = torch.cat([t['boxes'] for t in targets], dim=0) # [b*t, 4]
97
+
98
+ loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
99
+
100
+ losses = {}
101
+ losses['loss_bbox'] = loss_bbox.sum() / num_boxes
102
+
103
+ loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
104
+ box_ops.box_cxcywh_to_xyxy(src_boxes),
105
+ box_ops.box_cxcywh_to_xyxy(target_boxes)))
106
+ losses['loss_giou'] = loss_giou.sum() / num_boxes
107
+ return losses
108
+
109
+
110
+ def loss_masks(self, outputs, targets, indices, num_boxes):
111
+ """Compute the losses related to the masks: the focal loss and the dice loss.
112
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
113
+ """
114
+ assert "pred_masks" in outputs
115
+
116
+ src_idx = self._get_src_permutation_idx(indices)
117
+ # tgt_idx = self._get_tgt_permutation_idx(indices)
118
+
119
+ src_masks = outputs["pred_masks"]
120
+ src_masks = src_masks.transpose(1, 2)
121
+
122
+ # TODO use valid to mask invalid areas due to padding in loss
123
+ target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets],
124
+ size_divisibility=32, split=False).decompose()
125
+ target_masks = target_masks.to(src_masks)
126
+
127
+ # downsample ground truth masks with ratio mask_out_stride
128
+ start = int(self.mask_out_stride // 2)
129
+ im_h, im_w = target_masks.shape[-2:]
130
+
131
+ target_masks = target_masks[:, :, start::self.mask_out_stride, start::self.mask_out_stride]
132
+ assert target_masks.size(2) * self.mask_out_stride == im_h
133
+ assert target_masks.size(3) * self.mask_out_stride == im_w
134
+
135
+ src_masks = src_masks[src_idx]
136
+ # upsample predictions to the target size
137
+ # src_masks = interpolate(src_masks, size=target_masks.shape[-2:], mode="bilinear", align_corners=False)
138
+ src_masks = src_masks.flatten(1) # [b, thw]
139
+
140
+ target_masks = target_masks.flatten(1) # [b, thw]
141
+
142
+ losses = {
143
+ "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
144
+ "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
145
+ }
146
+ return losses
147
+
148
+ def _get_src_permutation_idx(self, indices):
149
+ # permute predictions following indices
150
+ batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
151
+ src_idx = torch.cat([src for (src, _) in indices])
152
+ return batch_idx, src_idx
153
+
154
+ def _get_tgt_permutation_idx(self, indices):
155
+ # permute targets following indices
156
+ batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
157
+ tgt_idx = torch.cat([tgt for (_, tgt) in indices])
158
+ return batch_idx, tgt_idx
159
+
160
+ def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
161
+ loss_map = {
162
+ 'labels': self.loss_labels,
163
+ 'boxes': self.loss_boxes,
164
+ 'masks': self.loss_masks
165
+ }
166
+ assert loss in loss_map, f'do you really want to compute {loss} loss?'
167
+ return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
168
+
169
+ def forward(self, outputs, targets):
170
+ """ This performs the loss computation.
171
+ Parameters:
172
+ outputs: dict of tensors, see the output specification of the model for the format
173
+ targets: list of dicts, such that len(targets) == batch_size.
174
+ The expected keys in each dict depend on the losses applied; see each loss' doc
175
+ """
176
+ outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
177
+ # Retrieve the matching between the outputs of the last layer and the targets
178
+ indices = self.matcher(outputs_without_aux, targets)
179
+
180
+ # Compute the average number of target boxes across all nodes, for normalization purposes
181
+ target_valid = torch.stack([t["valid"] for t in targets], dim=0).reshape(-1) # [B, T] -> [B*T]
182
+ num_boxes = target_valid.sum().item()
183
+ num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
184
+ if is_dist_avail_and_initialized():
185
+ torch.distributed.all_reduce(num_boxes)
186
+ num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()
187
+
188
+ # Compute all the requested losses
189
+ losses = {}
190
+ for loss in self.losses:
191
+ losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
192
+
193
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
194
+ if 'aux_outputs' in outputs:
195
+ for i, aux_outputs in enumerate(outputs['aux_outputs']):
196
+ indices = self.matcher(aux_outputs, targets)
197
+ for loss in self.losses:
198
+ kwargs = {}
199
+ if loss == 'labels':
200
+ # Logging is enabled only for the last layer
201
+ kwargs = {'log': False}
202
+ l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
203
+ l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
204
+ losses.update(l_dict)
205
+
206
+ return losses
207
+
208
+
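criterion.py imports dice_loss and sigmoid_focal_loss from models/segmentation.py, which is not shown in this diff. As a hedged reference, the DETR-style definitions those names usually correspond to are sketched below (normalized by num_boxes, matching how SetCriterion divides the summed losses above); treat this as an assumption about the missing file, not its verbatim contents.

import torch
import torch.nn.functional as F

def dice_loss(inputs, targets, num_boxes):
    # inputs: raw logits [N, HW]; targets: binary masks [N, HW]
    inputs = inputs.sigmoid()
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(1) + targets.sum(1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes

def sigmoid_focal_loss(inputs, targets, num_boxes, alpha=0.25, gamma=2):
    prob = inputs.sigmoid()
    ce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce * ((1 - p_t) ** gamma)
    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss
    return loss.mean(1).sum() / num_boxes

if __name__ == "__main__":
    logits = torch.randn(3, 16)
    masks = (torch.rand(3, 16) > 0.5).float()
    print(dice_loss(logits, masks, num_boxes=3).item(),
          sigmoid_focal_loss(logits, masks, num_boxes=3).item())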
models/deformable_transformer.py ADDED
@@ -0,0 +1,444 @@
1
+ # ------------------------------------------------------------------------
2
+ # Deformable DETR
3
+ # Copyright (c) 2020 SenseTime. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------
6
+ # Modified from DETR (https://github.com/facebookresearch/detr)
7
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
8
+ # ------------------------------------------------------------------------
9
+
10
+ import copy
11
+ from typing import Optional, List
12
+ import math
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from torch import nn, Tensor
17
+ from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
18
+
19
+ from util.misc import inverse_sigmoid
20
+ from models.ops.modules import MSDeformAttn
21
+
22
+ from einops import rearrange
23
+
24
+
25
+ class DeformableTransformer(nn.Module):
26
+ def __init__(self, d_model=256, nhead=8,
27
+ num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
28
+ activation="relu", return_intermediate_dec=False,
29
+ num_feature_levels=4, dec_n_points=4, enc_n_points=4,
30
+ two_stage=False, two_stage_num_proposals=300):
31
+ super().__init__()
32
+
33
+ self.d_model = d_model
34
+ self.nhead = nhead
35
+ self.dropout = dropout
36
+ self.two_stage = two_stage
37
+ self.two_stage_num_proposals = two_stage_num_proposals
38
+ self.num_feature_level = num_feature_levels
39
+
40
+ encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
41
+ dropout, activation,
42
+ num_feature_levels,
43
+ nhead, enc_n_points)
44
+ self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)
45
+
46
+ decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
47
+ dropout, activation,
48
+ num_feature_levels,
49
+ nhead, dec_n_points)
50
+ self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec)
51
+
52
+ self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))
53
+
54
+ if two_stage:
55
+ self.enc_output = nn.Linear(d_model, d_model)
56
+ self.enc_output_norm = nn.LayerNorm(d_model)
57
+ self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
58
+ self.pos_trans_norm = nn.LayerNorm(d_model * 2)
59
+ else:
60
+ self.reference_points = nn.Linear(d_model, 2) # reference point here (x, y)
61
+
62
+ self._reset_parameters()
63
+
64
+ def _reset_parameters(self):
65
+ for p in self.parameters():
66
+ if p.dim() > 1:
67
+ nn.init.xavier_uniform_(p)
68
+ for m in self.modules():
69
+ if isinstance(m, MSDeformAttn):
70
+ m._reset_parameters()
71
+ if not self.two_stage:
72
+ xavier_uniform_(self.reference_points.weight.data, gain=1.0)
73
+ constant_(self.reference_points.bias.data, 0.)
74
+ normal_(self.level_embed)
75
+
76
+ def get_proposal_pos_embed(self, proposals):
77
+ num_pos_feats = 128
78
+ temperature = 10000
79
+ scale = 2 * math.pi
80
+
81
+ dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
82
+ dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
83
+ # N, L, 4
84
+ proposals = proposals.sigmoid() * scale
85
+ # N, L, 4, 128
86
+ pos = proposals[:, :, :, None] / dim_t
87
+ # N, L, 4, 64, 2
88
+ pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
89
+ return pos
90
+
91
+ def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
92
+ N_, S_, C_ = memory.shape
93
+ base_scale = 4.0
94
+ proposals = []
95
+ _cur = 0
96
+ for lvl, (H_, W_) in enumerate(spatial_shapes):
97
+ mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
98
+ valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
99
+ valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
100
+
101
+ grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
102
+ torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
103
+ grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
104
+
105
+ scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
106
+ grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
107
+ wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
108
+ proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
109
+ proposals.append(proposal)
110
+ _cur += (H_ * W_)
111
+ output_proposals = torch.cat(proposals, 1)
112
+ output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
113
+ output_proposals = torch.log(output_proposals / (1 - output_proposals))
114
+ output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
115
+ output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))
116
+
117
+ output_memory = memory
118
+ output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
119
+ output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
120
+ output_memory = self.enc_output_norm(self.enc_output(output_memory))
121
+ return output_memory, output_proposals
122
+
123
+ def get_valid_ratio(self, mask):
124
+ _, H, W = mask.shape
125
+ valid_H = torch.sum(~mask[:, :, 0], 1)
126
+ valid_W = torch.sum(~mask[:, 0, :], 1)
127
+ valid_ratio_h = valid_H.float() / H
128
+ valid_ratio_w = valid_W.float() / W
129
+ valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
130
+ return valid_ratio
131
+
132
+ def forward(self, srcs, tgt, masks, pos_embeds, query_embed=None):
133
+ assert self.two_stage or query_embed is not None
134
+ """
135
+ srcs (list[Tensor]): list of tensors num_layers x [batch_size*time, c, hi, wi], input of encoder
136
+ tgt (Tensor): [batch_size, time, c, num_queries_per_frame]
137
+ masks (list[Tensor]): list of tensors num_layers x [batch_size*time, hi, wi], the mask of srcs
138
+ pos_embeds (list[Tensor]): list of tensors num_layers x [batch_size*time, c, hi, wi], position encoding of srcs
139
+ query_embed (Tensor): [num_queries, c]
140
+ """
141
+ # prepare input for encoder
142
+ src_flatten = []
143
+ mask_flatten = []
144
+ lvl_pos_embed_flatten = []
145
+ spatial_shapes = []
146
+ for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
147
+ bs, c, h, w = src.shape
148
+ spatial_shape = (h, w)
149
+ spatial_shapes.append(spatial_shape)
150
+
151
+ src = src.flatten(2).transpose(1, 2) # [batch_size, hi*wi, c]
152
+ mask = mask.flatten(1) # [batch_size, hi*wi]
153
+ pos_embed = pos_embed.flatten(2).transpose(1, 2) # [batch_size, hi*wi, c]
154
+ lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)
155
+
156
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
157
+ src_flatten.append(src)
158
+ mask_flatten.append(mask)
159
+
160
+ # For a clip, concat all the features, first fpn layer size, then frame size
161
+ src_flatten = torch.cat(src_flatten, 1) # [bs*t, \sigma(hi*wi), c]
162
+ mask_flatten = torch.cat(mask_flatten, 1) # [bs*t, \sigma(hi*wi)]
163
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
164
+ spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
165
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
166
+ valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
167
+
168
+ # encoder
169
+ # memory: [bs*t, \sigma(hi*wi), c]
170
+ memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)
171
+
172
+ # prepare input for decoder
173
+ bs, _, c = memory.shape
174
+ if self.two_stage:
175
+ output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)
176
+
177
+ # hack implementation for two-stage Deformable DETR
178
+ enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
179
+ enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals
180
+
181
+ topk = self.two_stage_num_proposals
182
+ topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
183
+ topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
184
+ topk_coords_unact = topk_coords_unact.detach()
185
+ reference_points = topk_coords_unact.sigmoid()
186
+ init_reference_out = reference_points
187
+ pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
188
+ query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
189
+ else:
190
+ b, t, q, c = tgt.shape
191
+ tgt = rearrange(tgt, 'b t q c -> (b t) q c')
192
+ query_embed = query_embed.unsqueeze(0).expand(b*t, -1, -1) # [batch_size*time, num_queries_per_frame, c]
193
+ reference_points = self.reference_points(query_embed).sigmoid() # [batch_size*time, num_queries_per_frame, 2]
194
+ init_reference_out = reference_points
195
+
196
+ # decoder
197
+ hs, inter_references, inter_samples = self.decoder(tgt, reference_points, memory,
198
+ spatial_shapes, level_start_index, valid_ratios, query_embed, mask_flatten)
199
+
200
+ inter_references_out = inter_references
201
+
202
+ # convert memory to fpn format
203
+ memory_features = [] # 8x -> 32x
204
+ spatial_index = 0
205
+ for lvl in range(self.num_feature_level - 1):
206
+ h, w = spatial_shapes[lvl]
207
+ # [bs*t, c, h, w]
208
+ memory_lvl = memory[:, spatial_index : spatial_index + h * w, :].reshape(bs, h, w, c).permute(0, 3, 1, 2).contiguous()
209
+ memory_features.append(memory_lvl)
210
+ spatial_index += h * w
211
+
212
+ if self.two_stage:
213
+ return hs, memory_features, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact, inter_samples
214
+ # hs: [l, batch_size*time, num_queries_per_frame, c], where l is number of decoder layers
215
+ # init_reference_out: [batch_size*time, num_queries_per_frame, 2]
216
+ # inter_references_out: [l, batch_size*time, num_queries_per_frame, 4]
217
+ # memory: [batch_size*time, \sigma(hi*wi), c]
218
+ # memory_features: list[Tensor]
219
+
220
+ return hs, memory_features, init_reference_out, inter_references_out, None, None, inter_samples
221
+
222
+
223
+ class DeformableTransformerEncoderLayer(nn.Module):
224
+ def __init__(self,
225
+ d_model=256, d_ffn=1024,
226
+ dropout=0.1, activation="relu",
227
+ n_levels=4, n_heads=8, n_points=4):
228
+ super().__init__()
229
+
230
+ # self attention
231
+ self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
232
+ self.dropout1 = nn.Dropout(dropout)
233
+ self.norm1 = nn.LayerNorm(d_model)
234
+
235
+ # ffn
236
+ self.linear1 = nn.Linear(d_model, d_ffn)
237
+ self.activation = _get_activation_fn(activation)
238
+ self.dropout2 = nn.Dropout(dropout)
239
+ self.linear2 = nn.Linear(d_ffn, d_model)
240
+ self.dropout3 = nn.Dropout(dropout)
241
+ self.norm2 = nn.LayerNorm(d_model)
242
+
243
+ @staticmethod
244
+ def with_pos_embed(tensor, pos):
245
+ return tensor if pos is None else tensor + pos
246
+
247
+ def forward_ffn(self, src):
248
+ src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
249
+ src = src + self.dropout3(src2)
250
+ src = self.norm2(src)
251
+ return src
252
+
253
+ def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
254
+ # self attention
255
+ src2, sampling_locations, attention_weights = self.self_attn(self.with_pos_embed(src, pos), reference_points,
256
+ src, spatial_shapes, level_start_index, padding_mask)
257
+ src = src + self.dropout1(src2)
258
+ src = self.norm1(src)
259
+
260
+ # ffn
261
+ src = self.forward_ffn(src)
262
+
263
+ return src
264
+
265
+
266
+ class DeformableTransformerEncoder(nn.Module):
267
+ def __init__(self, encoder_layer, num_layers):
268
+ super().__init__()
269
+ self.layers = _get_clones(encoder_layer, num_layers)
270
+ self.num_layers = num_layers
271
+
272
+ @staticmethod
273
+ def get_reference_points(spatial_shapes, valid_ratios, device):
274
+ reference_points_list = []
275
+ for lvl, (H_, W_) in enumerate(spatial_shapes):
276
+
277
+ ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
278
+ torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
279
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
280
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
281
+ ref = torch.stack((ref_x, ref_y), -1)
282
+ reference_points_list.append(ref)
283
+ reference_points = torch.cat(reference_points_list, 1)
284
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
285
+ return reference_points
286
+
287
+ def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
288
+ output = src
289
+ reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
290
+ for _, layer in enumerate(self.layers):
291
+ output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)
292
+
293
+ return output
294
+
295
+
296
+ class DeformableTransformerDecoderLayer(nn.Module):
297
+ def __init__(self, d_model=256, d_ffn=1024,
298
+ dropout=0.1, activation="relu",
299
+ n_levels=4, n_heads=8, n_points=4):
300
+ super().__init__()
301
+
302
+ # cross attention
303
+ self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
304
+ self.dropout1 = nn.Dropout(dropout)
305
+ self.norm1 = nn.LayerNorm(d_model)
306
+
307
+ # self attention
308
+ self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
309
+ self.dropout2 = nn.Dropout(dropout)
310
+ self.norm2 = nn.LayerNorm(d_model)
311
+
312
+ # ffn
313
+ self.linear1 = nn.Linear(d_model, d_ffn)
314
+ self.activation = _get_activation_fn(activation)
315
+ self.dropout3 = nn.Dropout(dropout)
316
+ self.linear2 = nn.Linear(d_ffn, d_model)
317
+ self.dropout4 = nn.Dropout(dropout)
318
+ self.norm3 = nn.LayerNorm(d_model)
319
+
320
+ @staticmethod
321
+ def with_pos_embed(tensor, pos):
322
+ return tensor if pos is None else tensor + pos
323
+
324
+ def forward_ffn(self, tgt):
325
+ tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
326
+ tgt = tgt + self.dropout4(tgt2)
327
+ tgt = self.norm3(tgt)
328
+ return tgt
329
+
330
+ def forward(self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None):
331
+ # self attention
332
+ q = k = self.with_pos_embed(tgt, query_pos)
333
+ tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)
334
+ tgt = tgt + self.dropout2(tgt2)
335
+ tgt = self.norm2(tgt)
336
+
337
+ # cross attention
338
+ tgt2, sampling_locations, attention_weights = self.cross_attn(self.with_pos_embed(tgt, query_pos),
339
+ reference_points,
340
+ src, src_spatial_shapes, level_start_index, src_padding_mask)
341
+ tgt = tgt + self.dropout1(tgt2)
342
+ tgt = self.norm1(tgt)
343
+
344
+ # ffn
345
+ tgt = self.forward_ffn(tgt)
346
+
347
+ return tgt, sampling_locations, attention_weights
348
+
349
+
350
+
351
+ class DeformableTransformerDecoder(nn.Module):
352
+ def __init__(self, decoder_layer, num_layers, return_intermediate=False):
353
+ super().__init__()
354
+ self.layers = _get_clones(decoder_layer, num_layers)
355
+ self.num_layers = num_layers
356
+ self.return_intermediate = return_intermediate
357
+ # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
358
+ self.bbox_embed = None
359
+ self.class_embed = None
360
+
361
+ def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios,
362
+ query_pos=None, src_padding_mask=None):
363
+ # modified here so that the decoder also returns the sampled point locations
364
+ output = tgt
365
+
366
+ intermediate = []
367
+ intermediate_reference_points = []
368
+ intermediate_samples = [] # sample points
369
+ for lid, layer in enumerate(self.layers):
370
+ if reference_points.shape[-1] == 4:
371
+ reference_points_input = reference_points[:, :, None] \
372
+ * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None]
373
+ else:
374
+ assert reference_points.shape[-1] == 2
375
+ reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
376
+ output, sampling_locations, attention_weights = layer(output, query_pos, reference_points_input,
377
+ src, src_spatial_shapes, src_level_start_index, src_padding_mask)
378
+
379
+ # sampling_locations: [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2],
380
+ # [B, Q, n_head, n_level(num_feature_level*num_frames), n_points, 2]
381
+ # attention_weights: [B, Q, n_head, n_level(num_feature_level*num_frames), n_points]
382
+ # src_valid_ratios: [N, self.n_levels, 2]
383
+ N, Len_q = sampling_locations.shape[:2]
384
+ sampling_locations = sampling_locations / src_valid_ratios[:, None, None, :, None, :]
385
+ weights_flat = attention_weights.view(N, Len_q, -1) # [B, Q, n_head * n_level * n_points]
386
+ samples_flat = sampling_locations.view(N, Len_q, -1, 2) # [B, Q, n_head * n_level * n_points, 2]
387
+ top_weights, top_idx = weights_flat.topk(30, dim=2) # [B, Q, 30], [B, Q, 30]
388
+ weights_keep = torch.gather(weights_flat, 2, top_idx) # [B, Q, 30]
389
+ samples_keep = torch.gather(samples_flat, 2, top_idx.unsqueeze(-1).repeat(1, 1, 1, 2)) # [B, Q, 30, 2]
390
+
391
+ # hack implementation for iterative bounding box refinement
392
+ if self.bbox_embed is not None:
393
+ tmp = self.bbox_embed[lid](output)
394
+ if reference_points.shape[-1] == 4:
395
+ new_reference_points = tmp + inverse_sigmoid(reference_points)
396
+ new_reference_points = new_reference_points.sigmoid()
397
+ else:
398
+ assert reference_points.shape[-1] == 2
399
+ new_reference_points = tmp
400
+ new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
401
+ new_reference_points = new_reference_points.sigmoid()
402
+ reference_points = new_reference_points.detach()
403
+
404
+ if self.return_intermediate:
405
+ intermediate.append(output)
406
+ intermediate_reference_points.append(reference_points)
407
+ intermediate_samples.append(samples_keep)
408
+
409
+ if self.return_intermediate:
410
+ return torch.stack(intermediate), torch.stack(intermediate_reference_points), torch.stack(intermediate_samples)
411
+
412
+ return output, reference_points, samples_keep
413
+
414
+
415
+ def _get_clones(module, N):
416
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
417
+
418
+
419
+ def _get_activation_fn(activation):
420
+ """Return an activation function given a string"""
421
+ if activation == "relu":
422
+ return F.relu
423
+ if activation == "gelu":
424
+ return F.gelu
425
+ if activation == "glu":
426
+ return F.glu
427
+ raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
428
+
429
+
430
+ def build_deforamble_transformer(args):
431
+ return DeformableTransformer(
432
+ d_model=args.hidden_dim,
433
+ nhead=args.nheads,
434
+ num_encoder_layers=args.enc_layers,
435
+ num_decoder_layers=args.dec_layers,
436
+ dim_feedforward=args.dim_feedforward,
437
+ dropout=args.dropout,
438
+ activation="relu",
439
+ return_intermediate_dec=True,
440
+ num_feature_levels=args.num_feature_levels,
441
+ dec_n_points=args.dec_n_points,
442
+ enc_n_points=args.enc_n_points,
443
+ two_stage=args.two_stage,
444
+ two_stage_num_proposals=args.num_queries)
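One detail worth calling out from the transformer above: the padding masks mark padded pixels with True, so get_valid_ratio returns the fraction of each feature map that is real content, and the reference points are later scaled by it. A small self-contained toy check of that computation follows; the 8x8 mask and the 6x4 valid region are made up for illustration.

import torch

def get_valid_ratio(mask):
    # same computation as DeformableTransformer.get_valid_ratio above
    _, H, W = mask.shape
    valid_H = torch.sum(~mask[:, :, 0], 1)
    valid_W = torch.sum(~mask[:, 0, :], 1)
    return torch.stack([valid_W.float() / W, valid_H.float() / H], -1)

mask = torch.ones(1, 8, 8, dtype=torch.bool)  # start fully padded
mask[:, :6, :4] = False                       # 6 valid rows x 4 valid columns
print(get_valid_ratio(mask))                  # tensor([[0.5000, 0.7500]])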
models/matcher.py ADDED
@@ -0,0 +1,206 @@
1
+ """
2
+ Instance Sequence Matching
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ import torch
6
+ from scipy.optimize import linear_sum_assignment
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+
10
+ from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, multi_iou
11
+ from util.misc import nested_tensor_from_tensor_list
12
+
13
+ INF = 100000000
14
+
15
+ def dice_coef(inputs, targets):
16
+ inputs = inputs.sigmoid()
17
+ inputs = inputs.flatten(1).unsqueeze(1) # [N, 1, THW]
18
+ targets = targets.flatten(1).unsqueeze(0) # [1, M, THW]
19
+ numerator = 2 * (inputs * targets).sum(2)
20
+ denominator = inputs.sum(-1) + targets.sum(-1)
21
+
22
+ # NOTE: the coefficient is not subtracted from 1, since that is unnecessary when it is only used as a matching cost
23
+ coef = (numerator + 1) / (denominator + 1)
24
+ return coef
25
+
26
+ def sigmoid_focal_coef(inputs, targets, alpha: float = 0.25, gamma: float = 2):
27
+ N, M = len(inputs), len(targets)
28
+ inputs = inputs.flatten(1).unsqueeze(1).expand(-1, M, -1) # [N, M, THW]
29
+ targets = targets.flatten(1).unsqueeze(0).expand(N, -1, -1) # [N, M, THW]
30
+
31
+ prob = inputs.sigmoid()
32
+ ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
33
+ p_t = prob * targets + (1 - prob) * (1 - targets)
34
+ coef = ce_loss * ((1 - p_t) ** gamma)
35
+
36
+ if alpha >= 0:
37
+ alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
38
+ coef = alpha_t * coef
39
+
40
+ return coef.mean(2) # [N, M]
41
+
42
+
class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1,
                 cost_mask: float = 1, cost_dice: float = 1, num_classes: int = 1):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
            cost_mask: This is the relative weight of the sigmoid focal loss of the mask in the matching cost
            cost_dice: This is the relative weight of the dice loss of the mask in the matching cost
            num_classes: number of object classes; 1 when matching only the binary referred/not-referred label
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        self.num_classes = num_classes
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 \
            or cost_mask != 0 or cost_dice != 0, "all costs can't be 0"
        self.mask_out_stride = 4

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                "pred_logits": Tensor of dim [batch_size, num_frames, num_queries_per_frame, num_classes] with the classification logits
                "pred_boxes": Tensor of dim [batch_size, num_frames, num_queries_per_frame, 4] with the predicted box coordinates
                "pred_masks": Tensor of dim [batch_size, num_frames, num_queries_per_frame, h, w], with h, w at 1/4 of the input resolution
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                NOTE: every frame contains at most one referred object
                "labels": Tensor of dim [num_frames] containing the class label for each frame
                "boxes": Tensor of dim [num_frames, 4] containing the target box coordinates
                "masks": Tensor of dim [num_frames, h, w], with h, w at the original image size
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        src_logits = outputs["pred_logits"]
        src_boxes = outputs["pred_boxes"]
        src_masks = outputs["pred_masks"]

        bs, nf, nq, h, w = src_masks.shape

        # handle mask padding issue
        target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets],
                                                             size_divisibility=32,
                                                             split=False).decompose()
        target_masks = target_masks.to(src_masks)  # [B, T, H, W]

        # downsample the ground-truth masks with ratio mask_out_stride
        start = int(self.mask_out_stride // 2)
        im_h, im_w = target_masks.shape[-2:]

        target_masks = target_masks[:, :, start::self.mask_out_stride, start::self.mask_out_stride]
        assert target_masks.size(2) * self.mask_out_stride == im_h
        assert target_masks.size(3) * self.mask_out_stride == im_w

        indices = []
        for i in range(bs):
            out_prob = src_logits[i].sigmoid()
            out_bbox = src_boxes[i]
            out_mask = src_masks[i]

            tgt_ids = targets[i]["labels"]
            tgt_bbox = targets[i]["boxes"]
            tgt_mask = target_masks[i]
            tgt_valid = targets[i]["valid"]

            # class cost
            # we average the cost over valid frames
            cost_class = []
            for t in range(nf):
                if tgt_valid[t] == 0:
                    continue

                out_prob_split = out_prob[t]
                tgt_ids_split = tgt_ids[t].unsqueeze(0)

                # Compute the classification cost.
                alpha = 0.25
                gamma = 2.0
                neg_cost_class = (1 - alpha) * (out_prob_split ** gamma) * (-(1 - out_prob_split + 1e-8).log())
                pos_cost_class = alpha * ((1 - out_prob_split) ** gamma) * (-(out_prob_split + 1e-8).log())
                if self.num_classes == 1:  # binary referred
                    cost_class_split = pos_cost_class[:, [0]] - neg_cost_class[:, [0]]
                else:
                    cost_class_split = pos_cost_class[:, tgt_ids_split] - neg_cost_class[:, tgt_ids_split]

                cost_class.append(cost_class_split)
            cost_class = torch.stack(cost_class, dim=0).mean(0)  # [q, 1]

            # box cost
            # we average the cost over every frame
            cost_bbox, cost_giou = [], []
            for t in range(nf):
                out_bbox_split = out_bbox[t]
                tgt_bbox_split = tgt_bbox[t].unsqueeze(0)

                # Compute the L1 cost between boxes
                cost_bbox_split = torch.cdist(out_bbox_split, tgt_bbox_split, p=1)

                # Compute the giou cost between boxes
                cost_giou_split = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox_split),
                                                       box_cxcywh_to_xyxy(tgt_bbox_split))

                cost_bbox.append(cost_bbox_split)
                cost_giou.append(cost_giou_split)
            cost_bbox = torch.stack(cost_bbox, dim=0).mean(0)
            cost_giou = torch.stack(cost_giou, dim=0).mean(0)

            # mask cost
            # Compute the focal loss between masks
            cost_mask = sigmoid_focal_coef(out_mask.transpose(0, 1), tgt_mask.unsqueeze(0))

            # Compute the dice loss between masks
            cost_dice = -dice_coef(out_mask.transpose(0, 1), tgt_mask.unsqueeze(0))

            # Final cost matrix
            C = self.cost_class * cost_class + self.cost_bbox * cost_bbox + self.cost_giou * cost_giou + \
                self.cost_mask * cost_mask + self.cost_dice * cost_dice  # [q, 1]

            # Only one target per sample, so a min-cost (argmin) matcher suffices
            _, src_ind = torch.min(C, dim=0)
            tgt_ind = torch.arange(1).to(src_ind)
            indices.append((src_ind.long(), tgt_ind.long()))

        # list[tuple], length is batch_size
        return indices

def build_matcher(args):
    if args.binary:
        num_classes = 1
    else:
        if args.dataset_file == 'ytvos':
            num_classes = 65
        elif args.dataset_file == 'davis':
            num_classes = 78
        elif args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb':
            num_classes = 1
        else:
            num_classes = 91  # for coco
    return HungarianMatcher(cost_class=args.set_cost_class,
                            cost_bbox=args.set_cost_bbox,
                            cost_giou=args.set_cost_giou,
                            cost_mask=args.set_cost_mask,
                            cost_dice=args.set_cost_dice,
                            num_classes=num_classes)
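Because each referring expression has exactly one target object, the cost matrix built in `forward` has shape [num_queries, 1], and the usual Hungarian assignment degenerates into picking the single lowest-cost query with `torch.min` (which is why `linear_sum_assignment` is imported but not needed here). The following is a minimal sketch of that final selection step with random stand-in costs; the cost weights are assumptions standing in for `args.set_cost_*`, not the repo's configured values.

import torch

num_queries = 5
# hypothetical per-query costs for the single referred object (each of shape [num_queries, 1])
cost_class = torch.rand(num_queries, 1)
cost_bbox = torch.rand(num_queries, 1)
cost_giou = -torch.rand(num_queries, 1)   # giou cost is a negated IoU-style score, so lower is better
cost_mask = torch.rand(num_queries, 1)
cost_dice = -torch.rand(num_queries, 1)

# assumed weights; in the repo they come from args.set_cost_class, args.set_cost_bbox, ...
C = 2 * cost_class + 5 * cost_bbox + 2 * cost_giou + 2 * cost_mask + 5 * cost_dice

_, src_ind = torch.min(C, dim=0)          # index of the best-matching query
tgt_ind = torch.arange(1).to(src_ind)     # the lone target gets index 0
print(src_ind.long(), tgt_ind.long())     # e.g. tensor([3]) tensor([0])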
models/ops/make.sh ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

python setup.py build install
models/ops/modules/__init__.py ADDED
@@ -0,0 +1,9 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .ms_deform_attn import MSDeformAttn
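The `MSDeformAttn` module exported here attends over multi-scale feature maps that are flattened into one token sequence, accompanied by bookkeeping tensors describing each level. As a rough, self-contained sketch of that input layout in plain PyTorch (toy sizes and variable names are assumptions, not the repo's exact code):

import torch

# three hypothetical feature levels from a backbone/neck, each of shape [B, C, H, W]
feats = [torch.randn(1, 256, h, w) for (h, w) in [(32, 32), (16, 16), (8, 8)]]

# flatten each level to [B, H*W, C] and concatenate along the token dimension
src_flatten = torch.cat([f.flatten(2).transpose(1, 2) for f in feats], dim=1)          # [B, sum(H*W), C]
spatial_shapes = torch.as_tensor([f.shape[-2:] for f in feats], dtype=torch.long)       # [num_levels, 2]
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))                  # offset of each level

print(src_flatten.shape)             # torch.Size([1, 1344, 256])  (1024 + 256 + 64 tokens)
print(level_start_index.tolist())    # [0, 1024, 1280]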