dianecy committed
Commit 91e3dad · verified · 1 Parent(s): 9b855a7

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .history/datasets/a2d_20241227174300.py +241 -0
  2. .history/datasets/a2d_20250203155857.py +243 -0
  3. .history/datasets/a2d_20250203160149.py +247 -0
  4. .history/datasets/a2d_20250203174309.py +247 -0
  5. .history/datasets/ytvos_ref_20250113163537.py +250 -0
  6. .history/datasets/ytvos_ref_20250116071955.py +240 -0
  7. .history/datasets/ytvos_ref_20250116072439.py +240 -0
  8. .history/datasets/ytvos_ref_20250116073540.py +239 -0
  9. .history/datasets/ytvos_ref_20250116073706.py +240 -0
  10. .history/datasets/ytvos_ref_20250116073858.py +239 -0
  11. .history/mbench/gpt_ref-ytvos-cy_20250121143328.py +0 -0
  12. .history/mbench/gpt_ref-ytvos-cy_20250121155631.py +428 -0
  13. .history/mbench/gpt_ref-ytvos_20250119071933.py +292 -0
  14. .history/mbench/gpt_ref-ytvos_20250119072546.py +292 -0
  15. .history/mbench/make_ref-ytvos_json_20250113181932.py +0 -0
  16. .history/mbench/make_ref-ytvos_json_20250113182455.py +100 -0
  17. .history/mbench/make_ref-ytvos_json_20250113182916.py +102 -0
  18. .history/mbench/make_ref-ytvos_json_20250113182917.py +102 -0
  19. .history/mbench/make_ref-ytvos_json_20250113183527.py +103 -0
  20. .history/mbench/make_ref-ytvos_json_20250113195258.py +103 -0
  21. .history/mbench/make_ref-ytvos_json_20250113195443.py +103 -0
  22. .history/mbench/make_ref-ytvos_json_20250116140957.py +103 -0
  23. .history/mbench/make_ref-ytvos_json_20250117032934.py +105 -0
  24. .history/mbench/make_ref-ytvos_json_20250117074200.py +107 -0
  25. .history/mbench/make_ref-ytvos_json_20250117074329.py +107 -0
  26. .history/slurm_script/jupyter_20250106230703.sh +16 -0
  27. .history/slurm_script/jupyter_20250113135212.sh +16 -0
  28. .history/slurm_script/jupyter_20250117012746.sh +16 -0
  29. .history/slurm_script/jupyter_20250117012750.sh +16 -0
  30. .history/slurm_script/jupyter_20250117143527.sh +16 -0
  31. .history/slurm_script/mbench_gpt_a2d_20250205122407.sh +0 -0
  32. .history/slurm_script/mbench_gpt_a2d_20250205151525.sh +19 -0
  33. .history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155759.sh +0 -0
  34. .history/slurm_script/mbench_gpt_ref-ytvos_20250119070901.sh +0 -0
  35. .history/slurm_script/mbench_gpt_ref-ytvos_20250119070932.sh +18 -0
  36. .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185113.sh +0 -0
  37. .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220432.sh +20 -0
  38. .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220435.sh +20 -0
  39. .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171522.sh +20 -0
  40. .history/slurm_script/mbench_ref-ytvos_json_20250113182619.sh +18 -0
  41. .history/slurm_script/mbench_ref-ytvos_json_20250113182952.sh +18 -0
  42. .history/slurm_script/mbench_ref-ytvos_json_20250116141255.sh +18 -0
  43. .history/slurm_script/mbench_ref-ytvos_json_20250117072826.sh +18 -0
  44. davis2017/results.py +31 -0
  45. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/7528dbb1b6ce860d242aff71294a5fef12a41572.lock +0 -0
  46. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cc6c13cb9acd48b061e2d2664a50963c338b4998.lock +0 -0
  47. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e7dbc990f8ede75b1ad2fd17028fbd89a950286a.lock +0 -0
  48. hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors +0 -0
  49. hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors.index.json +0 -0
  50. hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/7528dbb1b6ce860d242aff71294a5fef12a41572 +7 -0
.history/datasets/a2d_20241227174300.py ADDED
@@ -0,0 +1,241 @@
+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+import h5py
+from pycocotools.mask import encode, area
+
+
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+    return image_id
+
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        super(A2DSentencesDataset, self).__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+
+    def get_text_annotations(self):
+        with open(str(self.ann_file), 'r') as f:
+            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
+        return text_annotations_by_frame
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.text_annotations)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+
+            text_query = " ".join(text_query.lower().split()) # clean up the text query
+
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec') # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+
+            if self.subset == 'train':
+                # get a window of window_size frames with frame frame_id in the middle.
+                num_frames = self.num_frames
+                # random sparse sample
+                sample_indx = [frame_id]
+                # local sample
+                sample_id_before = random.randint(1, 3)
+                sample_id_after = random.randint(1, 3)
+                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
+                sample_indx.extend(local_indx)
+
+                # global sampling
+                if num_frames > 3:
+                    all_inds = list(range(vid_len))
+                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+                    global_n = num_frames - len(sample_indx)
+                    if len(global_inds) > global_n:
+                        select_id = random.sample(range(len(global_inds)), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(global_inds[s_id])
+                    elif vid_len >=global_n: # sample long range global frames
+                        select_id = random.sample(range(vid_len), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                    else:
+                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+            elif self.subset == 'val':
+                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
+                sample_indx = []
+                for i in range(start_idx, end_idx):
+                    i = min(max(i, 0), len(video_frames)-1) # pad out of range indices with edge frames
+                    sample_indx.append(i)
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+
+            # read frames
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for j in range(self.num_frames):
+                frame_indx = sample_indx[j]
+                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
+                imgs.append(img)
+
+            # read the instance mask
+            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+            f = h5py.File(frame_annot_path)
+            instances = list(f['instance'])
+            instance_idx = instances.index(instance_id) # existence was already validated during init
+
+            instance_masks = np.array(f['reMask'])
+            if len(instances) == 1:
+                instance_masks = instance_masks[np.newaxis, ...]
+            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
+            mask_areas = area(mask_rles).astype(np.float)
+            f.close()
+
+            # select the referred mask
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid.append(1)
+            else: # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid.append(0)
+            mask = torch.from_numpy(mask)
+            labels.append(label)
+            boxes.append(box)
+            masks.append(mask)
+
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels, # [1,]
+                'boxes': boxes, # [1, 4], xyxy
+                'masks': masks, # [1, H, W]
+                'valid': torch.tensor(valid), # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id,frame_idx, instance_id)
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            imgs, target = self._transforms(imgs, target)
+            imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1): # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    PATHS = {
+        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
+        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
+                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    return dataset
.history/datasets/a2d_20250203155857.py ADDED
@@ -0,0 +1,243 @@
+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+import h5py
+from pycocotools.mask import encode, area
+
+
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+    return image_id
+
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        super(A2DSentencesDataset, self).__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+
+    def get_text_annotations(self):
+        with open(str(self.ann_file), 'r') as f:
+            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
+        return text_annotations_by_frame
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.text_annotations)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+
+            text_query = " ".join(text_query.lower().split()) # clean up the text query
+
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec') # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+
+            if self.subset == 'train':
+                # get a window of window_size frames with frame frame_id in the middle.
+                num_frames = self.num_frames
+                # random sparse sample
+                sample_indx = [frame_id]
+                # local sample
+                sample_id_before = random.randint(1, 3)
+                sample_id_after = random.randint(1, 3)
+                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
+                sample_indx.extend(local_indx)
+
+                # global sampling
+                if num_frames > 3:
+                    all_inds = list(range(vid_len))
+                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+                    global_n = num_frames - len(sample_indx)
+                    if len(global_inds) > global_n:
+                        select_id = random.sample(range(len(global_inds)), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(global_inds[s_id])
+                    elif vid_len >=global_n: # sample long range global frames
+                        select_id = random.sample(range(vid_len), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                    else:
+                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+            elif self.subset == 'val':
+                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
+                sample_indx = []
+                for i in range(start_idx, end_idx):
+                    i = min(max(i, 0), len(video_frames)-1) # pad out of range indices with edge frames
+                    sample_indx.append(i)
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+
+            # read frames
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for j in range(self.num_frames):
+                frame_indx = sample_indx[j]
+                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
+                imgs.append(img)
+
+            # read the instance mask
+            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+            f = h5py.File(frame_annot_path)
+            instances = list(f['instance'])
+            instance_idx = instances.index(instance_id) # existence was already validated during init
+
+            instance_masks = np.array(f['reMask'])
+            if len(instances) == 1:
+                instance_masks = instance_masks[np.newaxis, ...]
+            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
+            mask_areas = area(mask_rles).astype(np.float)
+            f.close()
+
+            # select the referred mask
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid.append(1)
+            else: # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid.append(0)
+            mask = torch.from_numpy(mask)
+            labels.append(label)
+            boxes.append(box)
+            masks.append(mask)
+
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels, # [1,]
+                'boxes': boxes, # [1, 4], xyxy
+                'masks': masks, # [1, H, W]
+                'valid': torch.tensor(valid), # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id,frame_idx, instance_id)
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            imgs, target = self._transforms(imgs, target)
+            imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1): # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    PATHS = {
+        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
+        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    #dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
+    #                              return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
+                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    return dataset
.history/datasets/a2d_20250203160149.py ADDED
@@ -0,0 +1,247 @@
+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+import h5py
+from pycocotools.mask import encode, area
+
+
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+    return image_id
+
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        super(A2DSentencesDataset, self).__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+
+    def get_text_annotations(self):
+        with open(str(self.ann_file), 'r') as f:
+            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
+        return text_annotations_by_frame
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.text_annotations)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+
+            text_query = " ".join(text_query.lower().split()) # clean up the text query
+
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec') # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+
+            if self.subset == 'train':
+                # get a window of window_size frames with frame frame_id in the middle.
+                num_frames = self.num_frames
+                # random sparse sample
+                sample_indx = [frame_id]
+                # local sample
+                sample_id_before = random.randint(1, 3)
+                sample_id_after = random.randint(1, 3)
+                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
+                sample_indx.extend(local_indx)
+
+                # global sampling
+                if num_frames > 3:
+                    all_inds = list(range(vid_len))
+                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+                    global_n = num_frames - len(sample_indx)
+                    if len(global_inds) > global_n:
+                        select_id = random.sample(range(len(global_inds)), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(global_inds[s_id])
+                    elif vid_len >=global_n: # sample long range global frames
+                        select_id = random.sample(range(vid_len), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                    else:
+                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+            elif self.subset == 'val':
+                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
+                sample_indx = []
+                for i in range(start_idx, end_idx):
+                    i = min(max(i, 0), len(video_frames)-1) # pad out of range indices with edge frames
+                    sample_indx.append(i)
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+
+            # read frames
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for j in range(self.num_frames):
+                frame_indx = sample_indx[j]
+                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
+                imgs.append(img)
+
+            # read the instance mask
+            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+            f = h5py.File(frame_annot_path)
+            instances = list(f['instance'])
+            instance_idx = instances.index(instance_id) # existence was already validated during init
+
+            instance_masks = np.array(f['reMask'])
+            if len(instances) == 1:
+                instance_masks = instance_masks[np.newaxis, ...]
+            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
+            mask_areas = area(mask_rles).astype(np.float)
+            f.close()
+
+            # select the referred mask
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid.append(1)
+            else: # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid.append(0)
+            mask = torch.from_numpy(mask)
+            labels.append(label)
+            boxes.append(box)
+            masks.append(mask)
+
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels, # [1,]
+                'boxes': boxes, # [1, 4], xyxy
+                'masks': masks, # [1, H, W]
+                'valid': torch.tensor(valid), # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id,frame_idx, instance_id)
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1): # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    PATHS = {
+        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
+        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    #dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
+    #                              return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
+                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    return dataset
.history/datasets/a2d_20250203174309.py ADDED
@@ -0,0 +1,247 @@
+"""
+A2D-Sentences data loader
+modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
+"""
+from pathlib import Path
+
+import torch
+from torchvision.io import read_video
+import torchvision.transforms.functional as F
+
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+import h5py
+from pycocotools.mask import encode, area
+
+
+def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
+    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
+    return image_id
+
+class A2DSentencesDataset(Dataset):
+    """
+    A Torch dataset for A2D-Sentences.
+    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
+    https://arxiv.org/abs/1803.07485
+    """
+    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int, subset):
+        super(A2DSentencesDataset, self).__init__()
+        dataset_path = str(image_folder)
+        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
+        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
+        self.ann_file = ann_file
+        self.text_annotations = self.get_text_annotations()
+
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        self.subset = subset
+
+        print(f'\n {subset} sample num: ', len(self.text_annotations))
+        print('\n')
+
+    def get_text_annotations(self):
+        with open(str(self.ann_file), 'r') as f:
+            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
+        return text_annotations_by_frame
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.text_annotations)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]
+
+            text_query = " ".join(text_query.lower().split()) # clean up the text query
+
+            # read the source window frames:
+            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec') # (T, H, W, C)
+            vid_len = len(video_frames)
+            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
+            frame_id = frame_idx - 1
+
+            if self.subset == 'train':
+                # get a window of window_size frames with frame frame_id in the middle.
+                num_frames = self.num_frames
+                # random sparse sample
+                sample_indx = [frame_id]
+                # local sample
+                sample_id_before = random.randint(1, 3)
+                sample_id_after = random.randint(1, 3)
+                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
+                sample_indx.extend(local_indx)
+
+                # global sampling
+                if num_frames > 3:
+                    all_inds = list(range(vid_len))
+                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
+                    global_n = num_frames - len(sample_indx)
+                    if len(global_inds) > global_n:
+                        select_id = random.sample(range(len(global_inds)), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(global_inds[s_id])
+                    elif vid_len >=global_n: # sample long range global frames
+                        select_id = random.sample(range(vid_len), global_n)
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                    else:
+                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
+                        for s_id in select_id:
+                            sample_indx.append(all_inds[s_id])
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+            elif self.subset == 'val':
+                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
+                sample_indx = []
+                for i in range(start_idx, end_idx):
+                    i = min(max(i, 0), len(video_frames)-1) # pad out of range indices with edge frames
+                    sample_indx.append(i)
+                sample_indx.sort()
+                # find the valid frame index in sampled frame list, there is only one valid frame
+                valid_indices = sample_indx.index(frame_id)
+
+
+            # read frames
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for j in range(self.num_frames):
+                frame_indx = sample_indx[j]
+                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
+                imgs.append(img)
+
+            # read the instance mask
+            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
+            f = h5py.File(frame_annot_path)
+            instances = list(f['instance'])
+            instance_idx = instances.index(instance_id) # existence was already validated during init
+
+            instance_masks = np.array(f['reMask'])
+            if len(instances) == 1:
+                instance_masks = instance_masks[np.newaxis, ...]
+            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
+            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
+            mask_areas = area(mask_rles).astype(float)
+            f.close()
+
+            # select the referred mask
+            label = torch.tensor(0, dtype=torch.long)
+            mask = instance_masks[instance_idx].numpy()
+            if (mask > 0).any():
+                y1, y2, x1, x2 = self.bounding_box(mask)
+                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                valid.append(1)
+            else: # some frame didn't contain the instance
+                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                valid.append(0)
+            mask = torch.from_numpy(mask)
+            labels.append(label)
+            boxes.append(box)
+            masks.append(mask)
+
+            # transform
+            h, w = instance_masks.shape[-2:]
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            # there is only one valid frame
+            target = {
+                'frames_idx': torch.tensor(sample_indx), # [T,]
+                'valid_indices': torch.tensor([valid_indices]),
+                'labels': labels, # [1,]
+                'boxes': boxes, # [1, 4], xyxy
+                'masks': masks, # [1, H, W]
+                'valid': torch.tensor(valid), # [1,]
+                'caption': text_query,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)]),
+                'image_id': get_image_id(video_id,frame_idx, instance_id)
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1): # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.a2d_path)
+    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
+    PATHS = {
+        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
+        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
+    }
+    img_folder, ann_file = PATHS[image_set]
+    #dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
+    #                              return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
+                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
+    return dataset
.history/datasets/ytvos_ref_20250113163537.py ADDED
@@ -0,0 +1,250 @@
+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+from datasets.categories import ytvos_category_dict as category_dict
+
+
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
+    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+
+        self.metas = []
+        skip_vid_count = 0
+
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+
+            if vid_len < 11:
+                #print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+                start_idx , end_idx = 2, vid_len-2
+                bin_size = (end_idx - start_idx) // 4
+
+                bins = []
+                for i in range(4):
+                    bin_start = start_idx + i * bin_size
+                    bin_end = bin_start + bin_size if i < 3 else end_idx
+
+                    bins.append((bin_start, bin_end))
+
+                # Random sample one frame from each bin
+                sample_indx = []
+                for start_idx, end_idx in bins:
+                    sample_indx.append(random.randint(start_idx, end_idx - 1))
+                sample_indx.sort() # Ensure indices are in order
+
+
+                for frame_id in sample_indx:
+                    meta = {
+                        'video': vid,
+                        'exp': exp_dict['exp'],
+                        'obj_id': int(exp_dict['obj_id']),
+                        'frames': vid_frames,
+                        'frame_id' : frame_id,
+                        'sample_frames_id' : sample_indx,
+                        'bins': bins,
+                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
+                    }
+                    self.metas.append(meta)
+
+        print(f"skipped {skip_vid_count} short videos")
+
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.metas)
+
+    def __getitem__(self, idx):
+        instance_check = False
+        while not instance_check:
+            meta = self.metas[idx]  # dict
+
+
+            video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
+                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], metas['sample_frames_id'], meta['bins']
+
+
+            # clean up the caption
+            exp = " ".join(exp.lower().split())
+            category_id = category_dict[category]
+            vid_len = len(frames)
+
+            # num_frames = self.num_frames
+
+            # read frames and masks
+            imgs, labels, boxes, masks, valid = [], [], [], [], []
+            for frame_indx in sample_frames_id:
+                frame_name = frames[frame_indx]
+                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+                img = Image.open(img_path).convert('RGB')
+                mask = Image.open(mask_path).convert('P')
+
+                # create the target
+                label = torch.tensor(category_id)
+                mask = np.array(mask)
+                mask = (mask==obj_id).astype(np.float32) # 0,1 binary
+                if (mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else: # some frame didn't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                mask = torch.from_numpy(mask)
+
+                # append
+                imgs.append(img)
+                labels.append(label)
+                masks.append(mask)
+                boxes.append(box)
+
+            # transform
+            w, h = img.size
+            labels = torch.stack(labels, dim=0)
+            boxes = torch.stack(boxes, dim=0)
+            boxes[:, 0::2].clamp_(min=0, max=w)
+            boxes[:, 1::2].clamp_(min=0, max=h)
+            masks = torch.stack(masks, dim=0)
+            target = {
+                'frames_idx': torch.tensor(sample_frames_id), # [T,]
+                'labels': labels, # [T,]
+                'boxes': boxes, # [T, 4], xyxy
+                'masks': masks, # [T, H, W]
+                'valid': torch.tensor(valid), # [T,]
+                'caption': exp,
+                'orig_size': torch.as_tensor([int(h), int(w)]),
+                'size': torch.as_tensor([int(h), int(w)])
+            }
+
+            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
+            if self._transforms:
+                imgs, target = self._transforms(imgs, target)
+                imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
+            else:
+                imgs = np.array(imgs)
+                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+
+            # FIXME: handle "valid", since some box may be removed due to random crop
+            if torch.any(target['valid'] == 1): # at leatst one instance
+                instance_check = True
+            else:
+                idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset
+
.history/datasets/ytvos_ref_20250116071955.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ imgs.append(img)
136
+ mask = Image.open(mask_path).convert('P')
137
+ mask = np.array(mask)
138
+
139
+ # create the target
140
+ for obj_id in list(obj_id_cat.keys()):
141
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
142
+ if (obj_mask > 0).any():
143
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
144
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
145
+ valid.append(1)
146
+ else: # some frame didn't contain the instance
147
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
148
+ valid.append(0)
149
+ obj_mask = torch.from_numpy(obj_mask)
150
+
151
+ # append
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # FIXME: handle "valid", since some box may be removed due to random crop
182
+ if torch.any(target['valid'] == 1): # at least one instance
183
+ instance_check = True
184
+ else:
185
+ idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
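+ # Usage sketch (assumes an argparse namespace like the one produced by opts.py, with ytvos_path, masks, num_frames and max_skip set):
+ # dataset = build('train', args)
+ # imgs, target = dataset[0] # imgs: [T, 3, H, W] tensor; target: dict with 'boxes', 'masks', 'valid', 'obj_ids'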
.history/datasets/ytvos_ref_20250116072439.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the 'first-frame'
26
+ subset is no longer publicly available; only the harder 'full-video' subset can now be downloaded
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ instance_check = False
122
+ while not instance_check:
123
+ meta = self.metas[idx] # dict
124
+
125
+ video, sample_indx, bins, frames, obj_id_cat = \
126
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
127
+
128
+ # read frames and masks
129
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
130
+ for frame_indx in sample_indx:
131
+ frame_name = frames[frame_indx]
132
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
133
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
134
+ img = Image.open(img_path).convert('RGB')
135
+ imgs.append(img)
136
+ mask = Image.open(mask_path).convert('P')
137
+ mask = np.array(mask)
138
+
139
+ # create the target
140
+ for obj_id in list(obj_id_cat.keys()):
141
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
142
+ if (obj_mask > 0).any():
143
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
144
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
145
+ valid.append(1)
146
+ else: # some frame didn't contain the instance
147
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
148
+ valid.append(0)
149
+ obj_mask = torch.from_numpy(obj_mask)
150
+
151
+ # append
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # FIXME: handle "valid", since some box may be removed due to random crop
182
+ if torch.any(target['valid'] == 1): # at least one instance
183
+ instance_check = True
184
+ else:
185
+ idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
.history/datasets/ytvos_ref_20250116073540.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the 'first-frame'
26
+ subset is no longer publicly available; only the harder 'full-video' subset can now be downloaded
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ meta = self.metas[idx] # dict
122
+
123
+ video, sample_indx, bins, frames, obj_id_cat = \
124
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
125
+
126
+ # read frames and masks
127
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
128
+ for frame_indx in sample_indx:
129
+ frame_name = frames[frame_indx]
130
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
131
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
132
+ img = Image.open(img_path).convert('RGB')
133
+ imgs.append(img)
134
+
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ masks.append(obj_mask)
152
+ boxes.append(box)
153
+
154
+
155
+ # transform
156
+ w, h = img.size
157
+ boxes = torch.stack(boxes, dim=0)
158
+ boxes[:, 0::2].clamp_(min=0, max=w)
159
+ boxes[:, 1::2].clamp_(min=0, max=h)
160
+ masks = torch.stack(masks, dim=0)
161
+ target = {
162
+ 'frames_idx': sample_indx, # [T,]
163
+ 'boxes': boxes, # [T, 4], xyxy
164
+ 'masks': masks, # [T, H, W]
165
+ 'valid': torch.tensor(valid), # [T,]
166
+ 'obj_ids' : list(obj_id_cat.keys()),
167
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
168
+ 'size': torch.as_tensor([int(h), int(w)])
169
+ }
170
+
171
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
172
+ if self._transforms:
173
+ imgs, target = self._transforms(imgs, target)
174
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
175
+ else:
176
+ imgs = np.array(imgs)
177
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
178
+
179
+
180
+ # # FIXME: handle "valid", since some box may be removed due to random crop
181
+ # if torch.any(target['valid'] == 1): # at least one instance
182
+ # instance_check = True
183
+ # else:
184
+ # idx = random.randint(0, self.__len__() - 1)
185
+
186
+ return imgs, target
187
+
188
+
189
+ def make_coco_transforms(image_set, max_size=640):
190
+ normalize = T.Compose([
191
+ T.ToTensor(),
192
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
193
+ ])
194
+
195
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
196
+
197
+ if image_set == 'train':
198
+ return T.Compose([
199
+ T.RandomHorizontalFlip(),
200
+ T.PhotometricDistort(),
201
+ T.RandomSelect(
202
+ T.Compose([
203
+ T.RandomResize(scales, max_size=max_size),
204
+ T.Check(),
205
+ ]),
206
+ T.Compose([
207
+ T.RandomResize([400, 500, 600]),
208
+ T.RandomSizeCrop(384, 600),
209
+ T.RandomResize(scales, max_size=max_size),
210
+ T.Check(),
211
+ ])
212
+ ),
213
+ normalize,
214
+ ])
215
+
216
+ # we do not use the 'val' set since the annotations are inaccessible
217
+ if image_set == 'val':
218
+ return T.Compose([
219
+ T.RandomResize([360], max_size=640),
220
+ normalize,
221
+ ])
222
+
223
+ raise ValueError(f'unknown {image_set}')
224
+
225
+
226
+ def build(image_set, args):
227
+ root = Path(args.ytvos_path)
228
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
229
+ PATHS = {
230
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
231
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
232
+ }
233
+ img_folder, ann_file = PATHS[image_set]
234
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
235
+ # num_frames=args.num_frames, max_skip=args.max_skip)
236
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
237
+ num_frames=args.num_frames, max_skip=args.max_skip)
238
+ return dataset
239
+
.history/datasets/ytvos_ref_20250116073706.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the 'first-frame'
26
+ subset is no longer publicly available; only the harder 'full-video' subset can now be downloaded
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ meta = self.metas[idx] # dict
122
+
123
+ video, sample_indx, bins, frames, obj_id_cat = \
124
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
125
+
126
+ # read frames and masks
127
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
128
+ for frame_indx in sample_indx:
129
+ frame_name = frames[frame_indx]
130
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
131
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
132
+ img = Image.open(img_path).convert('RGB')
133
+ imgs.append(img)
134
+
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+ print(np.unique(mask))
138
+
139
+ # create the target
140
+ for obj_id in list(obj_id_cat.keys()):
141
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
142
+ if (obj_mask > 0).any():
143
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
144
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
145
+ valid.append(1)
146
+ else: # some frame didn't contain the instance
147
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
148
+ valid.append(0)
149
+ obj_mask = torch.from_numpy(obj_mask)
150
+
151
+ # append
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # # FIXME: handle "valid", since some box may be removed due to random crop
182
+ # if torch.any(target['valid'] == 1): # at least one instance
183
+ # instance_check = True
184
+ # else:
185
+ # idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
.history/datasets/ytvos_ref_20250116073858.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the 'first-frame'
26
+ subset is no longer publicly available; only the harder 'full-video' subset can now be downloaded
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ meta = self.metas[idx] # dict
122
+
123
+ video, sample_indx, bins, frames, obj_id_cat = \
124
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
125
+
126
+ # read frames and masks
127
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
128
+ for frame_indx in sample_indx:
129
+ frame_name = frames[frame_indx]
130
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
131
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
132
+ img = Image.open(img_path).convert('RGB')
133
+ imgs.append(img)
134
+
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+
138
+ # create the target
139
+ for obj_id in list(obj_id_cat.keys()):
140
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
141
+ if (obj_mask > 0).any():
142
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
143
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
144
+ valid.append(1)
145
+ else: # some frame didn't contain the instance
146
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
147
+ valid.append(0)
148
+ obj_mask = torch.from_numpy(obj_mask)
149
+
150
+ # append
151
+ masks.append(obj_mask)
152
+ boxes.append(box)
153
+
154
+
155
+ # transform
156
+ w, h = img.size
157
+ boxes = torch.stack(boxes, dim=0)
158
+ boxes[:, 0::2].clamp_(min=0, max=w)
159
+ boxes[:, 1::2].clamp_(min=0, max=h)
160
+ masks = torch.stack(masks, dim=0)
161
+ target = {
162
+ 'frames_idx': sample_indx, # [T,]
163
+ 'boxes': boxes, # [T, 4], xyxy
164
+ 'masks': masks, # [T, H, W]
165
+ 'valid': torch.tensor(valid), # [T,]
166
+ 'obj_ids' : list(obj_id_cat.keys()),
167
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
168
+ 'size': torch.as_tensor([int(h), int(w)])
169
+ }
170
+
171
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
172
+ if self._transforms:
173
+ imgs, target = self._transforms(imgs, target)
174
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
175
+ else:
176
+ imgs = np.array(imgs)
177
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
178
+
179
+
180
+ # # FIXME: handle "valid", since some box may be removed due to random crop
181
+ # if torch.any(target['valid'] == 1): # at least one instance
182
+ # instance_check = True
183
+ # else:
184
+ # idx = random.randint(0, self.__len__() - 1)
185
+
186
+ return imgs, target
187
+
188
+
189
+ def make_coco_transforms(image_set, max_size=640):
190
+ normalize = T.Compose([
191
+ T.ToTensor(),
192
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
193
+ ])
194
+
195
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
196
+
197
+ if image_set == 'train':
198
+ return T.Compose([
199
+ T.RandomHorizontalFlip(),
200
+ T.PhotometricDistort(),
201
+ T.RandomSelect(
202
+ T.Compose([
203
+ T.RandomResize(scales, max_size=max_size),
204
+ T.Check(),
205
+ ]),
206
+ T.Compose([
207
+ T.RandomResize([400, 500, 600]),
208
+ T.RandomSizeCrop(384, 600),
209
+ T.RandomResize(scales, max_size=max_size),
210
+ T.Check(),
211
+ ])
212
+ ),
213
+ normalize,
214
+ ])
215
+
216
+ # we do not use the 'val' set since the annotations are inaccessible
217
+ if image_set == 'val':
218
+ return T.Compose([
219
+ T.RandomResize([360], max_size=640),
220
+ normalize,
221
+ ])
222
+
223
+ raise ValueError(f'unknown {image_set}')
224
+
225
+
226
+ def build(image_set, args):
227
+ root = Path(args.ytvos_path)
228
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
229
+ PATHS = {
230
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
231
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
232
+ }
233
+ img_folder, ann_file = PATHS[image_set]
234
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
235
+ # num_frames=args.num_frames, max_skip=args.max_skip)
236
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
237
+ num_frames=args.num_frames, max_skip=args.max_skip)
238
+ return dataset
239
+
.history/mbench/gpt_ref-ytvos-cy_20250121143328.py ADDED
File without changes
.history/mbench/gpt_ref-ytvos-cy_20250121155631.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ # Captioner
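+ # Category whitelist: only these Ref-YouTube-VOS classes are treated as action-capable subjects; categories outside this list are skipped below.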
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
55
+ def getCaption(video_id, json_data):
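+ # Returns (all_captions, valid_obj_ids): per-category, per-frame action captions, plus the object ids whose category received captions.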
56
+ # load this video's metadata: frame names, video path, and per-frame annotations
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. ask GPT directly whether this category can be the subject of an action
68
+ # 2. from the category list provided by Ref-YouTube-VOS, keep only the category names we want to handle
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ # Step 0: can this category be the subject of an action?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ # Step 1: filtering - does this frame show multiple instances with distinct actions?
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ # Step 2: generate a dense, action-centric caption
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
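+ # Builds a referring expression for one object in one frame from that frame's dense caption, then QA-filters it; returns a dict with ref_exp, caption, cat_name, file_name and isValid (or an empty dict if the object is absent from the frame).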
203
+
204
+ # draw the object's bounding box on the image
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
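+ # the red box is drawn in place on the frame, so the full image sent to the model highlights the target object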
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # check whether the cropped object is clearly identifiable
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # generate the referring expression
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
318
+
319
+ #QA filtering
320
+ #QA1: does the expression describe the intended (highlighted) object?
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ #QA2: does the expression avoid describing objects that are not highlighted?
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
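+ # keep the expression only if it describes the boxed object (QA1) and does not describe any non-highlighted object (QA2)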
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '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'
386
+
387
+ # iterate over the vid_ids of the whole dataset
388
+ for i in range(1):
389
+ vid_id = vid_ids[i]
390
+
391
+ #==== generate captions ====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ #==== generate referring expressions and run QA filtering ====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # for each category
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # for each video frame
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # for each obj id
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised.json', 'w') as file:
423
+ json.dump(all_ref_exps, file)
424
+
425
+
426
+
427
+
428
+
.history/mbench/gpt_ref-ytvos_20250119071933.py ADDED
@@ -0,0 +1,292 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ def getCaption(video_id, json_data):
47
+ # load data
48
+ video_data = json_data[video_id]
49
+ frame_names = video_data['frame_names']
50
+ video_path = video_data['video_path']
51
+
52
+ cat_names = set()
53
+ for obj_id in list(video_data['annotations'][0].keys()):
54
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
55
+
56
+ if len(cat_names) == 1:
57
+ cat_name = next(iter(cat_names))
58
+ else:
59
+ print("more than one category")
60
+ return -1
61
+
62
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
63
+ image_captions = {}
64
+
65
+ captioner = OpenAI()
66
+ for i in range(len(image_paths)):
67
+ image_path = image_paths[i]
68
+ frame_name = frame_names[i]
69
+ base64_image = encode_image(image_path)
70
+
71
+ # step 1: filtering
72
+ response1 = captioner.chat.completions.create(
73
+ model="gpt-4o-mini",
74
+ messages=[
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "type": "text",
80
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
81
+ },
82
+ {
83
+ "type": "image_url",
84
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
85
+ },
86
+ ],
87
+ }
88
+ ],
89
+ )
90
+ response_content = response1.choices[0].message.content
91
+ should_caption = True if "yes" in response_content.lower() else False
92
+
93
+ # step 2: generate dense caption
94
+ if should_caption:
95
+ response2 = captioner.chat.completions.create(
96
+ model="gpt-4o-mini",
97
+ messages=[
98
+ {
99
+ "role": "user",
100
+ "content": [
101
+ {
102
+ "type": "text",
103
+ "text": f"""
104
+ Describe the image in detail focusing on the {cat_name}s' actions.
105
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
106
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
107
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
108
+ 4. Do not include actions that needs to be guessed or suggested.""",
109
+ },
110
+ {
111
+ "type": "image_url",
112
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
113
+ },
114
+ ],
115
+ }
116
+ ],
117
+ )
118
+
119
+ caption = response2.choices[0].message.content
120
+ else:
121
+ caption = None
122
+
123
+ image_captions[frame_name] = caption
124
+ return image_captions
125
+
126
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
127
+ # draw the object's bounding box on the image
128
+ video_data = json_data[video_id]
129
+ frame_names = video_data['frame_names']
130
+ video_path = video_data['video_path']
131
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
132
+ frame_indx = frame_names.index(frame_name)
133
+ obj_data = video_data['annotations'][frame_indx][obj_id]
134
+
135
+ bbox = obj_data['bbox']
136
+ cat_name = obj_data['category_name']
137
+ valid = obj_data['valid']
138
+
139
+ if valid == 0:
140
+ print("Object not in this frame!")
141
+ return {}
142
+
143
+
144
+ x_min, y_min, x_max, y_max = bbox
145
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
146
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
147
+ plt.figure()
148
+ plt.imshow(I)
149
+ plt.axis('off')
150
+ plt.show()
151
+ pil_I = Image.fromarray(I)
152
+ buff = BytesIO()
153
+ pil_I.save(buff, format='JPEG')
154
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
155
+
156
+ # generate referring expression
157
+ generator = OpenAI()
158
+ response = generator.chat.completions.create(
159
+ model="gpt-4o-mini",
160
+ messages=[
161
+ {
162
+ "role": "user",
163
+ "content": [
164
+ {
165
+ "type": "text",
166
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
167
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
168
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
169
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
170
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
171
+ 5. Use '{cat_name}' as the noun for the referring expressions.
172
+ Output only the referring expression.
173
+ {caption}""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ ref_exp = response.choices[0].message.content
185
+
186
+ #QA filtering
187
+ #QA1: does the expression describe the highlighted object?
188
+ filter = OpenAI()
189
+ response1 = filter.chat.completions.create(
190
+ model="gpt-4o-mini",
191
+ messages=[
192
+ {
193
+ "role": "user",
194
+ "content": [
195
+ {
196
+ "type": "text",
197
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
198
+ {ref_exp}""",
199
+ },
200
+ {
201
+ "type": "image_url",
202
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
203
+ },
204
+ ],
205
+ }
206
+ ],
207
+ )
208
+
209
+ response1_content = response1.choices[0].message.content
210
+ describesHighlighted = True if "yes" in response1_content.lower() else False
211
+
212
+ #QA2: does the expression avoid describing non-highlighted objects?
213
+ response2 = filter.chat.completions.create(
214
+ model="gpt-4o-mini",
215
+ messages=[
216
+ {
217
+ "role": "user",
218
+ "content": [
219
+ {
220
+ "type": "text",
221
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
222
+ {ref_exp}""",
223
+ },
224
+ {
225
+ "type": "image_url",
226
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
227
+ },
228
+ ],
229
+ }
230
+ ],
231
+ )
232
+
233
+ response2_content = response2.choices[0].message.content
234
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
235
+
236
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
237
+
238
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
239
+
240
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
241
+
242
+ def createRefExp(video_id, json_data):
243
+ video_data = json_data[video_id]
244
+ obj_ids = list(video_data['annotations'][0].keys())
245
+ frame_names = video_data['frame_names']
246
+
247
+ captions_per_frame = getCaption(video_id, json_data)
248
+
249
+ if captions_per_frame == -1:
250
+ print("There is more than one category")
251
+ return
252
+
253
+
254
+ video_ref_exps = {}
255
+
256
+ for frame_name in frame_names:
257
+ frame_caption = captions_per_frame[frame_name]
258
+
259
+ if frame_caption == None:
260
+ video_ref_exps[frame_name] = None
261
+
262
+ else:
263
+ frame_ref_exps = {}
264
+ for obj_id in obj_ids:
265
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
266
+ frame_ref_exps[obj_id] = exp_per_obj
267
+ video_ref_exps[frame_name] = frame_ref_exps
268
+
269
+ return video_ref_exps
270
+
271
+ if __name__ == '__main__':
272
+ with open('mbench/sampled_frame3.json', 'r') as file:
273
+ data = json.load(file)
274
+
275
+ videos = set()
276
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
277
+ manual_select = list(file)
278
+ for frame in manual_select:
279
+ result = json.loads(frame)
280
+ videos.add(result['video'])
281
+ videos = list(videos)
282
+
283
+
284
+ all_video_refs = {}
285
+ for i in range(1):
286
+ video_id = videos[i]
287
+ video_ref = createRefExp(video_id, data)
288
+ all_video_refs[video_id] = video_ref
289
+
290
+ json_obj = json.dumps(all_video_refs, indent=4)
291
+ with open('mbench/result.json', 'w') as file:
292
+ file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119072546.py ADDED
@@ -0,0 +1,292 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ def getCaption(video_id, json_data):
47
+ # load data
48
+ video_data = json_data[video_id]
49
+ frame_names = video_data['frame_names']
50
+ video_path = video_data['video_path']
51
+
52
+ cat_names = set()
53
+ for obj_id in list(video_data['annotations'][0].keys()):
54
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
55
+
56
+ if len(cat_names) == 1:
57
+ cat_name = next(iter(cat_names))
58
+ else:
59
+ print("more than one category")
60
+ return -1
61
+
62
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
63
+ image_captions = {}
64
+
65
+ captioner = OpenAI()
66
+ for i in range(len(image_paths)):
67
+ image_path = image_paths[i]
68
+ frame_name = frame_names[i]
69
+ base64_image = encode_image(image_path)
70
+
71
+ # step 1: filtering
72
+ response1 = captioner.chat.completions.create(
73
+ model="gpt-4o-mini",
74
+ messages=[
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "type": "text",
80
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
81
+ },
82
+ {
83
+ "type": "image_url",
84
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
85
+ },
86
+ ],
87
+ }
88
+ ],
89
+ )
90
+ response_content = response1.choices[0].message.content
91
+ should_caption = True if "yes" in response_content.lower() else False
92
+
93
+ # step 2: generate dense caption
94
+ if should_caption:
95
+ response2 = captioner.chat.completions.create(
96
+ model="gpt-4o-mini",
97
+ messages=[
98
+ {
99
+ "role": "user",
100
+ "content": [
101
+ {
102
+ "type": "text",
103
+ "text": f"""
104
+ Describe the image in detail focusing on the {cat_name}s' actions.
105
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
106
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
107
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
108
+ 4. Do not include actions that needs to be guessed or suggested.""",
109
+ },
110
+ {
111
+ "type": "image_url",
112
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
113
+ },
114
+ ],
115
+ }
116
+ ],
117
+ )
118
+
119
+ caption = response2.choices[0].message.content
120
+ else:
121
+ caption = None
122
+
123
+ image_captions[frame_name] = caption
124
+ return image_captions
125
+
126
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
127
+ # draw the object's bounding box on the image
128
+ video_data = json_data[video_id]
129
+ frame_names = video_data['frame_names']
130
+ video_path = video_data['video_path']
131
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
132
+ frame_indx = frame_names.index(frame_name)
133
+ obj_data = video_data['annotations'][frame_indx][obj_id]
134
+
135
+ bbox = obj_data['bbox']
136
+ cat_name = obj_data['category_name']
137
+ valid = obj_data['valid']
138
+
139
+ if valid == 0:
140
+ print("Object not in this frame!")
141
+ return {}
142
+
143
+
144
+ x_min, y_min, x_max, y_max = bbox
145
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
146
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
147
+ plt.figure()
148
+ plt.imshow(I)
149
+ plt.axis('off')
150
+ plt.show()
151
+ pil_I = Image.fromarray(I)
152
+ buff = BytesIO()
153
+ pil_I.save(buff, format='JPEG')
154
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
155
+
156
+ # generate referring expression
157
+ generator = OpenAI()
158
+ response = generator.chat.completions.create(
159
+ model="gpt-4o-mini",
160
+ messages=[
161
+ {
162
+ "role": "user",
163
+ "content": [
164
+ {
165
+ "type": "text",
166
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
167
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
168
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
169
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
170
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
171
+ 5. Use '{cat_name}' as the noun for the referring expressions.
172
+ Output only the referring expression.
173
+ {caption}""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ ref_exp = response.choices[0].message.content
185
+
186
+ #QA filtering
187
+ #QA1: does the expression describe the highlighted object?
188
+ filter = OpenAI()
189
+ response1 = filter.chat.completions.create(
190
+ model="gpt-4o-mini",
191
+ messages=[
192
+ {
193
+ "role": "user",
194
+ "content": [
195
+ {
196
+ "type": "text",
197
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
198
+ {ref_exp}""",
199
+ },
200
+ {
201
+ "type": "image_url",
202
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
203
+ },
204
+ ],
205
+ }
206
+ ],
207
+ )
208
+
209
+ response1_content = response1.choices[0].message.content
210
+ describesHighlighted = True if "yes" in response1_content.lower() else False
211
+
212
+ #QA2: does the expression avoid describing non-highlighted objects?
213
+ response2 = filter.chat.completions.create(
214
+ model="gpt-4o-mini",
215
+ messages=[
216
+ {
217
+ "role": "user",
218
+ "content": [
219
+ {
220
+ "type": "text",
221
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
222
+ {ref_exp}""",
223
+ },
224
+ {
225
+ "type": "image_url",
226
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
227
+ },
228
+ ],
229
+ }
230
+ ],
231
+ )
232
+
233
+ response2_content = response2.choices[0].message.content
234
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
235
+
236
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
237
+
238
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
239
+
240
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
241
+
242
+ def createRefExp(video_id, json_data):
243
+ video_data = json_data[video_id]
244
+ obj_ids = list(video_data['annotations'][0].keys())
245
+ frame_names = video_data['frame_names']
246
+
247
+ captions_per_frame = getCaption(video_id, json_data)
248
+
249
+ if captions_per_frame == -1:
250
+ print("There is more than one category")
251
+ return None
252
+
253
+
254
+ video_ref_exps = {}
255
+
256
+ for frame_name in frame_names:
257
+ frame_caption = captions_per_frame[frame_name]
258
+
259
+ if frame_caption == None:
260
+ video_ref_exps[frame_name] = None
261
+
262
+ else:
263
+ frame_ref_exps = {}
264
+ for obj_id in obj_ids:
265
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
266
+ frame_ref_exps[obj_id] = exp_per_obj
267
+ video_ref_exps[frame_name] = frame_ref_exps
268
+
269
+ return video_ref_exps
270
+
271
+ if __name__ == '__main__':
272
+ with open('mbench/sampled_frame3.json', 'r') as file:
273
+ data = json.load(file)
274
+
275
+ videos = set()
276
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
277
+ manual_select = list(file)
278
+ for frame in manual_select:
279
+ result = json.loads(frame)
280
+ videos.add(result['video'])
281
+ videos = list(videos)
282
+
283
+
284
+ all_video_refs = {}
285
+ for i in range(1):
286
+ video_id = videos[i]
287
+ video_ref = createRefExp(video_id, data)
288
+ all_video_refs[video_id] = video_ref
289
+
290
+ json_obj = json.dumps(all_video_refs, indent=4)
291
+ with open('mbench/result.json', 'w') as file:
292
+ file.write(json_obj)
.history/mbench/make_ref-ytvos_json_20250113181932.py ADDED
File without changes
.history/mbench/make_ref-ytvos_json_20250113182455.py ADDED
@@ -0,0 +1,100 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ import os
8
+ from os import path as osp
9
+ import io
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+ import regex as re
14
+ import json
15
+
16
+ import cv2
17
+ from PIL import Image, ImageDraw
18
+ import torch
19
+ from torchvision.transforms import functional as F
20
+
21
+ from skimage import measure # (pip install scikit-image)
22
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
23
+
24
+ import matplotlib.pyplot as plt
25
+ import matplotlib.patches as patches
26
+ from matplotlib.collections import PatchCollection
27
+ from matplotlib.patches import Rectangle
28
+
29
+
30
+ import ipywidgets as widgets
31
+ from IPython.display import display, clear_output
32
+
33
+ #================== build json ===================
34
+ def createJson(train_dataset, metas):
35
+ entire_json = {}
36
+
37
+ # initialize
38
+ data_idx = 0
39
+
40
+ while data_idx < 10:
41
+
42
+ # for one video
43
+ video_data = {}
44
+ video_id = metas[data_idx]['video']
45
+ video_data['bins'] = metas[data_idx]['bins']
46
+ annotation_data = []
47
+ frame_names = []
48
+
49
+ while metas[data_idx]['video'] == video_id:
50
+
51
+ obj_id = metas[data_idx]['obj_id']
52
+ sample_id = metas[data_idx]['sample_id']
53
+ sample_frames_id = metas[data_idx]['sample_frames_id']
54
+ sample_frame_idx = sample_frames_id.index(sample_id)
55
+
56
+ frames = metas[data_idx]['frames']
57
+
58
+ frame_name = frames[sample_id]
59
+ cat_name = metas[data_idx]['category']
60
+
61
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
62
+
63
+ obj_data = {obj_id: {
64
+ "category_name" : cat_name,
65
+ "bbox": bbox
66
+ }}
67
+
68
+
69
+ annotation_data.append(obj_data)
70
+
71
+ frame_names.append(frame_name)
72
+
73
+ data_idx += 1
74
+
75
+ video_data['annotations'] = annotation_data
76
+ video_data['frame_names'] = frame_names
77
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
78
+
79
+ entire_json[video_id] = video_data
80
+
81
+ return entire_json
82
+
83
+
84
+ if __name__ == '__main__':
85
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
86
+ args = parser.parse_args()
87
+
88
+ #================== load data ===================
89
+ # full dataset
90
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
91
+
92
+ # full dataset metadata
93
+ metas = train_dataset.metas
94
+
95
+ #================== build json ===================
96
+ entire_json_dict = createJson(train_dataset, metas)
97
+ entire_json = json.dumps(entire_json_dict, indent=4)
98
+
99
+ with open('mbench/sampled_frame.json', mode='w') as file:
100
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113182916.py ADDED
@@ -0,0 +1,102 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #================== build json ===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialize
40
+ data_idx = 0
41
+
42
+ while data_idx < 10:
43
+
44
+ # for one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #================== load data ===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #================== build json ===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ entire_json = json.dumps(entire_json_dict, indent=4)
100
+
101
+ with open('mbench/sampled_frame.json', mode='w') as file:
102
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113182917.py ADDED
@@ -0,0 +1,102 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #================== build json ===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialize
40
+ data_idx = 0
41
+
42
+ while data_idx < 10:
43
+
44
+ # for one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #================== load data ===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #================== build json ===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ entire_json = json.dumps(entire_json_dict, indent=4)
100
+
101
+ with open('mbench/sampled_frame.json', mode='w') as file:
102
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113183527.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #================== build json ===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialize
40
+ data_idx = 0
41
+
42
+ while data_idx < len(train_dataset):
43
+
44
+ # for one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #================== load data ===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #================== build json ===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113195258.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #================== build json ===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialize
40
+ data_idx = 0
41
+ print(len(train_dataset), len(metas), flush = True)
42
+ while data_idx < len(train_dataset):
43
+
44
+ # for one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #================== load data ===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #================== build json ===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113195443.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #================== build json ===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialize
40
+ data_idx = 0
41
+
42
+ while data_idx < len(train_dataset):
43
+
44
+ # for one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while data_idx < len(train_dataset) and metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #================== load data ===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #================== build json ===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250116140957.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #================== build json ===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialize
40
+ vid_idx = 0
41
+
42
+ while vid_idx < 5:
43
+
44
+ # for one video
45
+ video_data = {}
46
+ video_train_frames, video_train_info = train_dataset[vid_idx]
47
+ video_meta = metas[vid_idx]
48
+
49
+ video_id = video_meta['video']
50
+ video_data['bins'] = video_meta['bins']
51
+ bin_nums = len(video_meta['bins'])
52
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
53
+
54
+ annotation_data = []
55
+ frame_names = []
56
+
57
+ for i in range(bin_nums):
58
+ bin_data = {}
59
+ for j in range(obj_nums):
60
+ obj_id = str(j+1)
61
+ obj_data = {
62
+ "category_name":video_meta['obj_id_cat'][obj_id],
63
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
64
+ }
65
+ bin_data[obj_id] = obj_data
66
+ annotation_data.append(bin_data)
67
+
68
+ video_data['annotations'] = annotation_data
69
+
70
+
71
+ sample_indx = metas[vid_idx]['sample_indx']
72
+ frames = metas[vid_idx]['frames']
73
+ for i in sample_indx:
74
+ frame_name = frames[i]
75
+ frame_names.append(frame_name)
76
+
77
+ video_data['frame_names'] = frame_names
78
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
79
+ entire_json[video_id] = video_data
80
+
81
+ vid_idx += 1
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #================== load data ===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #================== build json ===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117032934.py ADDED
@@ -0,0 +1,105 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #================== build json ===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialize
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # for one video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ print(video_meta['obj_id_cat'].keys())
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
66
+ }
67
+ bin_data[obj_id] = obj_data
68
+ annotation_data.append(bin_data)
69
+
70
+ video_data['annotations'] = annotation_data
71
+
72
+
73
+ sample_indx = metas[vid_idx]['sample_indx']
74
+ frames = metas[vid_idx]['frames']
75
+ for i in sample_indx:
76
+ frame_name = frames[i]
77
+ frame_names.append(frame_name)
78
+
79
+ video_data['frame_names'] = frame_names
80
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
81
+ entire_json[video_id] = video_data
82
+
83
+ vid_idx += 1
84
+
85
+ return entire_json
86
+
87
+
88
+ if __name__ == '__main__':
89
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
90
+ args = parser.parse_args()
91
+
92
+ #================== load data ===================
93
+ # full dataset
94
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
95
+
96
+ # full dataset metadata
97
+ metas = train_dataset.metas
98
+
99
+ #================== build json ===================
100
+ entire_json_dict = createJson(train_dataset, metas)
101
+ print(type(entire_json_dict))
102
+ entire_json = json.dumps(entire_json_dict, indent=4)
103
+
104
+ with open('mbench/sampled_frame2.json', mode='w') as file:
105
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117074200.py ADDED
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #================== build json ===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialize
41
+ vid_idx = 0
42
+
43
+ while vid_idx < 10:
44
+
45
+ # for one video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist()
66
+ }
67
+ except:
68
+ obj_data = {}
69
+ bin_data[obj_id] = obj_data
70
+ annotation_data.append(bin_data)
71
+
72
+ video_data['annotations'] = annotation_data
73
+
74
+
75
+ sample_indx = metas[vid_idx]['sample_indx']
76
+ frames = metas[vid_idx]['frames']
77
+ for i in sample_indx:
78
+ frame_name = frames[i]
79
+ frame_names.append(frame_name)
80
+
81
+ video_data['frame_names'] = frame_names
82
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
83
+ entire_json[video_id] = video_data
84
+
85
+ vid_idx += 1
86
+
87
+ return entire_json
88
+
89
+
90
+ if __name__ == '__main__':
91
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
92
+ args = parser.parse_args()
93
+
94
+ #================== load data ===================
95
+ # full dataset
96
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
97
+
98
+ # full dataset metadata
99
+ metas = train_dataset.metas
100
+
101
+ #================== build json ===================
102
+ entire_json_dict = createJson(train_dataset, metas)
103
+ print(type(entire_json_dict))
104
+ entire_json = json.dumps(entire_json_dict, indent=4)
105
+
106
+ with open('mbench/sampled_frame2.json', mode='w') as file:
107
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117074329.py ADDED
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #================== build json ===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialize
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # for one video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist()
66
+ }
67
+ except:
68
+ obj_data = {}
69
+ bin_data[obj_id] = obj_data
70
+ annotation_data.append(bin_data)
71
+
72
+ video_data['annotations'] = annotation_data
73
+
74
+
75
+ sample_indx = metas[vid_idx]['sample_indx']
76
+ frames = metas[vid_idx]['frames']
77
+ for i in sample_indx:
78
+ frame_name = frames[i]
79
+ frame_names.append(frame_name)
80
+
81
+ video_data['frame_names'] = frame_names
82
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
83
+ entire_json[video_id] = video_data
84
+
85
+ vid_idx += 1
86
+
87
+ return entire_json
88
+
89
+
90
+ if __name__ == '__main__':
91
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
92
+ args = parser.parse_args()
93
+
94
+ #================== load data ===================
95
+ # full dataset
96
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
97
+
98
+ # full dataset metadata
99
+ metas = train_dataset.metas
100
+
101
+ #================== build json ===================
102
+ entire_json_dict = createJson(train_dataset, metas)
103
+ print(type(entire_json_dict))
104
+ entire_json = json.dumps(entire_json_dict, indent=4)
105
+
106
+ with open('mbench/sampled_frame2.json', mode='w') as file:
107
+ file.write(entire_json)
.history/slurm_script/jupyter_20250106230703.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=jupyter
4
+ #SBATCH --partition=a4000
5
+ #SBATCH --nodelist=node05
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=0-06:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
11
+
12
+ ml purge
13
+ ml load cuda/12.1
14
+ eval "$(conda shell.bash hook)"
15
+ conda activate referformer
16
+ srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250113135212.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=jupyter
4
+ #SBATCH --partition=a5000
5
+ #SBATCH --nodelist=node04
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=0-06:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
11
+
12
+ ml purge
13
+ ml load cuda/12.1
14
+ eval "$(conda shell.bash hook)"
15
+ conda activate referformer
16
+ srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250117012746.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=jupyter
4
+ #SBATCH --partition=a4000
5
+ #SBATCH --nodelist=node05
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=0-06:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
11
+
12
+ ml purge
13
+ ml load cuda/12.1
14
+ eval "$(conda shell.bash hook)"
15
+ conda activate referformer
16
+ srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250117012750.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=jupyter
4
+ #SBATCH --partition=a4000
5
+ #SBATCH --nodelist=node05
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=14-00:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
11
+
12
+ ml purge
13
+ ml load cuda/12.1
14
+ eval "$(conda shell.bash hook)"
15
+ conda activate referformer
16
+ srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250117143527.sh ADDED
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=jupyter
4
+ #SBATCH --partition=a5000
5
+ #SBATCH --nodelist=node04
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=14-00:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
11
+
12
+ ml purge
13
+ ml load cuda/12.1
14
+ eval "$(conda shell.bash hook)"
15
+ conda activate referformer
16
+ srun jupyter notebook --no-browser --port=7890
.history/slurm_script/mbench_gpt_a2d_20250205122407.sh ADDED
File without changes
.history/slurm_script/mbench_gpt_a2d_20250205151525.sh ADDED
@@ -0,0 +1,19 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=mbench_gpt_a2d
4
+ #SBATCH --partition=a4000
5
+ #SBATCH --nodelist=node05
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=14-00:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_a2d.out
11
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
12
+
13
+ ml purge
14
+ ml load cuda/12.1
15
+ eval "$(conda shell.bash hook)"
16
+ conda activate referformer
17
+
18
+ python3 mbench_a2d/gpt_a2d_numbered.py \
19
+ --save_caption_path mbench_a2d/numbered_captions.json
.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155759.sh ADDED
File without changes
.history/slurm_script/mbench_gpt_ref-ytvos_20250119070901.sh ADDED
File without changes
.history/slurm_script/mbench_gpt_ref-ytvos_20250119070932.sh ADDED
@@ -0,0 +1,18 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_gpt_ref-ytvos
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185113.sh ADDED
File without changes
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220432.sh ADDED
@@ -0,0 +1,20 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+ --save_caption_path mbench/numbered_captions_gpt-4o-mini.json \
+ --save_valid_obj_ids_path mbench/numbered_valid_obj_idsgpt-4o-mini.json
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220435.sh ADDED
@@ -0,0 +1,20 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+ --save_caption_path mbench/numbered_captions_gpt-4o-mini.json \
+ --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o-mini.json
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171522.sh ADDED
@@ -0,0 +1,20 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \
+ --save_caption_path mbench/numbered_captions_gpt-4o_final.json \
+ --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json
.history/slurm_script/mbench_ref-ytvos_json_20250113182619.sh ADDED
@@ -0,0 +1,18 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_ref-ytvos_json
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=0-06:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_ref-ytvos_json_20250113182952.sh ADDED
@@ -0,0 +1,18 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_ref-ytvos_json
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_ref-ytvos_json_20250116141255.sh ADDED
@@ -0,0 +1,18 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_ref-ytvos_json
+ #SBATCH --partition=a5000
+ #SBATCH --nodelist=node04
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_ref-ytvos_json_20250117072826.sh ADDED
@@ -0,0 +1,18 @@
+ #!/bin/bash
+
+ #SBATCH --job-name=mbench_ref-ytvos_json
+ #SBATCH --partition=a4000
+ #SBATCH --nodelist=node05
+ #SBATCH --gres=gpu:1
+ #SBATCH --time=14-00:00:00
+ #SBATCH --mem=5G
+ #SBATCH --cpus-per-task=4
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ ml purge
+ ml load cuda/12.1
+ eval "$(conda shell.bash hook)"
+ conda activate referformer
+
+ python3 mbench/make_ref-ytvos_json.py
davis2017/results.py ADDED
@@ -0,0 +1,31 @@
+ import os
+ import numpy as np
+ from PIL import Image
+ import sys
+
+
+ class Results(object):
+     def __init__(self, root_dir):
+         self.root_dir = root_dir
+
+     def _read_mask(self, sequence, frame_id):
+         try:
+             mask_path = os.path.join(self.root_dir, sequence, f'{frame_id}.png')
+             return np.array(Image.open(mask_path))
+         except IOError as err:
+             sys.stdout.write(sequence + " frame %s not found!\n" % frame_id)
+             sys.stdout.write("The frames have to be indexed PNG files placed inside the corresponding sequence "
+                              "folder.\nThe indexes have to match with the initial frame.\n")
+             sys.stderr.write("IOError: " + err.strerror + "\n")
+             sys.exit()
+
+     def read_masks(self, sequence, masks_id):
+         mask_0 = self._read_mask(sequence, masks_id[0])
+         masks = np.zeros((len(masks_id), *mask_0.shape))
+         for ii, m in enumerate(masks_id):
+             masks[ii, ...] = self._read_mask(sequence, m)
+         num_objects = int(np.max(masks))
+         tmp = np.ones((num_objects, *masks.shape))
+         tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None]
+         masks = (tmp == masks[None, ...]) > 0
+         return masks
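The read_masks method above converts a stack of indexed PNG masks (pixel value k marks object k, 0 is background) into per-object boolean masks by broadcasting the object IDs against the stack. A minimal, self-contained sketch of that same expansion on synthetic data, with no disk I/O; the intermediate np.ones array in the original is equivalent to broadcasting the arange directly:

import numpy as np

# Two 2x3 indexed masks: pixel value k means object k, 0 is background.
masks = np.array([[[0, 1, 1],
                   [2, 2, 0]],
                  [[1, 1, 0],
                   [0, 2, 2]]], dtype=np.float64)      # shape (T, H, W)

num_objects = int(np.max(masks))                       # 2
obj_ids = np.arange(1, num_objects + 1)[:, None, None, None]
per_object = (obj_ids == masks[None, ...])             # (num_objects, T, H, W), bool

print(per_object.shape)   # (2, 2, 2, 3)
print(per_object[0, 0])   # boolean mask of object 1 in frame 0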
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/7528dbb1b6ce860d242aff71294a5fef12a41572.lock ADDED
File without changes
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cc6c13cb9acd48b061e2d2664a50963c338b4998.lock ADDED
File without changes
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e7dbc990f8ede75b1ad2fd17028fbd89a950286a.lock ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors.index.json ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/7528dbb1b6ce860d242aff71294a5fef12a41572 ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.31.0"
+ }
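For context, the blob above is the cached generation_config.json for zhiqiulin/clip-flant5-xxl. A hedged sketch of loading it through transformers, assuming hf_cache/ (the directory tracked in this commit) is the Hugging Face cache directory the repo points at:

from transformers import GenerationConfig

# Assumption: hf_cache/ is the cache directory this repo uses for HF downloads.
gen_cfg = GenerationConfig.from_pretrained(
    "zhiqiulin/clip-flant5-xxl",
    cache_dir="hf_cache",
)
print(gen_cfg.decoder_start_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)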