dianecy committed
Commit 3b5fc39 · verified · 1 Parent(s): 91e3dad

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. .history/datasets/ytvos_ref_20250113163402.py +249 -0
  2. .history/datasets/ytvos_ref_20250116073826.py +240 -0
  3. .history/mbench/gpt_ref-ytvos-cy_20250121151408.py +431 -0
  4. .history/mbench/gpt_ref-ytvos-cy_20250121155710.py +428 -0
  5. .history/mbench/gpt_ref-ytvos-revised_20250121155717.py +428 -0
  6. .history/mbench/gpt_ref-ytvos-revised_20250121155956.py +428 -0
  7. .history/mbench/gpt_ref-ytvos-revised_20250121160813.py +428 -0
  8. .history/mbench/gpt_ref-ytvos_20250119070213.py +277 -0
  9. .history/mbench/gpt_ref-ytvos_20250119070707.py +282 -0
  10. .history/mbench/gpt_ref-ytvos_20250119070824.py +286 -0
  11. .history/mbench/gpt_ref-ytvos_20250119071214.py +290 -0
  12. .history/mbench/gpt_ref-ytvos_20250119073250.py +292 -0
  13. .history/mbench/gpt_ref-ytvos_numbered_cy_20250130183735.py +0 -0
  14. .history/mbench/gpt_ref-ytvos_numbered_cy_20250130183916.py +199 -0
  15. .history/mbench/gpt_ref-ytvos_numbered_cy_20250130185048.py +422 -0
  16. .history/mbench/gpt_ref-ytvos_numbered_cy_20250130190055.py +428 -0
  17. .history/mbench/gpt_ref-ytvos_numbered_cy_20250130190447.py +430 -0
  18. .history/mbench/gpt_ref-ytvos_numbered_cy_20250130190713.py +430 -0
  19. .history/mbench/gpt_ref-ytvos_numbered_cy_20250131124156.py +427 -0
  20. .history/mbench/gpt_ref-ytvos_numbered_cy_20250201140343.py +460 -0
  21. .history/mbench/gpt_ref-ytvos_numbered_cy_20250201140413.py +460 -0
  22. .history/mbench/gpt_ref-ytvos_numbered_cy_20250201141847.py +460 -0
  23. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250206153011.py +644 -0
  24. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171300.py +644 -0
  25. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171416.py +644 -0
  26. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173350.py +677 -0
  27. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207184812.py +676 -0
  28. .history/mbench/make_ref-ytvos_json_20250113183250.py +103 -0
  29. .history/mbench/make_ref-ytvos_json_20250113183335.py +103 -0
  30. .history/mbench/make_ref-ytvos_json_20250113183413.py +103 -0
  31. .history/mbench/make_ref-ytvos_json_20250113195227.py +103 -0
  32. .history/mbench/make_ref-ytvos_json_20250116140938.py +103 -0
  33. .history/mbench/make_ref-ytvos_json_20250116141629.py +104 -0
  34. .history/mbench/make_ref-ytvos_json_20250117072647.py +107 -0
  35. .history/mbench/make_ref-ytvos_json_20250117074149.py +107 -0
  36. .history/mbench/make_ref-ytvos_json_20250118024354.py +108 -0
  37. .history/mbench/ytvos_ref_20250121140600.py +265 -0
  38. .history/mbench_a2d/gpt_a2d_numbered_20250205111521.py +0 -0
  39. .history/mbench_a2d/gpt_a2d_numbered_20250205151640.py +197 -0
  40. .history/mbench_a2d/gpt_a2d_numbered_20250205151759.py +199 -0
  41. .history/mbench_a2d/gpt_a2d_numbered_20250205151827.py +199 -0
  42. .history/mbench_a2d/gpt_a2d_numbered_20250205151833.py +199 -0
  43. .history/mbench_a2d/gpt_a2d_numbered_20250205152714.py +200 -0
  44. .history/mbench_a2d/gpt_a2d_numbered_20250206114221.py +205 -0
  45. .history/mbench_a2d/gpt_a2d_numbered_20250206114540.py +209 -0
  46. .history/mbench_a2d/gpt_a2d_numbered_20250206145656.py +209 -0
  47. .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185215.sh +18 -0
  48. .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207173418.sh +20 -0
  49. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b.lock +0 -0
  50. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b.lock +0 -0
.history/datasets/ytvos_ref_20250113163402.py ADDED
@@ -0,0 +1,249 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+ print(f"skipped {skip_vid_count} short videos")
105
+
106
+
107
+ @staticmethod
108
+ def bounding_box(img):
109
+ rows = np.any(img, axis=1)
110
+ cols = np.any(img, axis=0)
111
+ rmin, rmax = np.where(rows)[0][[0, -1]]
112
+ cmin, cmax = np.where(cols)[0][[0, -1]]
113
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
114
+
115
+ def __len__(self):
116
+ return len(self.metas)
117
+
118
+ def __getitem__(self, idx):
119
+ instance_check = False
120
+ while not instance_check:
121
+ meta = self.metas[idx] # dict
122
+
123
+
124
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
125
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
126
+
127
+
128
+ # clean up the caption
129
+ exp = " ".join(exp.lower().split())
130
+ category_id = category_dict[category]
131
+ vid_len = len(frames)
132
+
133
+ # num_frames = self.num_frames
134
+
135
+ # read frames and masks
136
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
137
+ for frame_indx in sample_frames_id:
138
+ frame_name = frames[frame_indx]
139
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
140
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
141
+ img = Image.open(img_path).convert('RGB')
142
+ mask = Image.open(mask_path).convert('P')
143
+
144
+ # create the target
145
+ label = torch.tensor(category_id)
146
+ mask = np.array(mask)
147
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
148
+ if (mask > 0).any():
149
+ y1, y2, x1, x2 = self.bounding_box(mask)
150
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
151
+ valid.append(1)
152
+ else: # some frame didn't contain the instance
153
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
154
+ valid.append(0)
155
+ mask = torch.from_numpy(mask)
156
+
157
+ # append
158
+ imgs.append(img)
159
+ labels.append(label)
160
+ masks.append(mask)
161
+ boxes.append(box)
162
+
163
+ # transform
164
+ w, h = img.size
165
+ labels = torch.stack(labels, dim=0)
166
+ boxes = torch.stack(boxes, dim=0)
167
+ boxes[:, 0::2].clamp_(min=0, max=w)
168
+ boxes[:, 1::2].clamp_(min=0, max=h)
169
+ masks = torch.stack(masks, dim=0)
170
+ target = {
171
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
172
+ 'labels': labels, # [T,]
173
+ 'boxes': boxes, # [T, 4], xyxy
174
+ 'masks': masks, # [T, H, W]
175
+ 'valid': torch.tensor(valid), # [T,]
176
+ 'caption': exp,
177
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
178
+ 'size': torch.as_tensor([int(h), int(w)])
179
+ }
180
+
181
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
182
+ if self._transforms:
183
+ imgs, target = self._transforms(imgs, target)
184
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
185
+ else:
186
+ imgs = np.array(imgs)
187
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
188
+
189
+
190
+ # FIXME: handle "valid", since some box may be removed due to random crop
191
+ if torch.any(target['valid'] == 1): # at least one instance
192
+ instance_check = True
193
+ else:
194
+ idx = random.randint(0, self.__len__() - 1)
195
+
196
+ return imgs, target
197
+
198
+
199
+ def make_coco_transforms(image_set, max_size=640):
200
+ normalize = T.Compose([
201
+ T.ToTensor(),
202
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
203
+ ])
204
+
205
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
206
+
207
+ if image_set == 'train':
208
+ return T.Compose([
209
+ T.RandomHorizontalFlip(),
210
+ T.PhotometricDistort(),
211
+ T.RandomSelect(
212
+ T.Compose([
213
+ T.RandomResize(scales, max_size=max_size),
214
+ T.Check(),
215
+ ]),
216
+ T.Compose([
217
+ T.RandomResize([400, 500, 600]),
218
+ T.RandomSizeCrop(384, 600),
219
+ T.RandomResize(scales, max_size=max_size),
220
+ T.Check(),
221
+ ])
222
+ ),
223
+ normalize,
224
+ ])
225
+
226
+ # we do not use the 'val' set since the annotations are inaccessible
227
+ if image_set == 'val':
228
+ return T.Compose([
229
+ T.RandomResize([360], max_size=640),
230
+ normalize,
231
+ ])
232
+
233
+ raise ValueError(f'unknown {image_set}')
234
+
235
+
236
+ def build(image_set, args):
237
+ root = Path(args.ytvos_path)
238
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
239
+ PATHS = {
240
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
241
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
242
+ }
243
+ img_folder, ann_file = PATHS[image_set]
244
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
245
+ # num_frames=args.num_frames, max_skip=args.max_skip)
246
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
247
+ num_frames=args.num_frames, max_skip=args.max_skip)
248
+ return dataset
249
+
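For reference, a minimal usage sketch for the loader above. The argument fields mirror what build() reads (args.ytvos_path, args.masks, args.num_frames, args.max_skip); the SimpleNamespace wrapper, the values, and the import path are placeholders, not part of this commit.

    from types import SimpleNamespace
    from datasets.ytvos_ref import build  # assumed import path for the non-snapshot copy of this module

    # Placeholder arguments matching the attributes build() reads.
    args = SimpleNamespace(ytvos_path='/path/to/ref-youtube-vos', masks=True, num_frames=4, max_skip=10)
    dataset = build('train', args)       # this snapshot constructs the dataset with transforms=None
    imgs, target = dataset[0]            # imgs: [T, 3, H, W] tensor; target holds boxes, masks, valid flags
    print(len(dataset), imgs.shape, target['caption'])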
.history/datasets/ytvos_ref_20250116073826.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins,
96
+ 'frames':vid_frames
97
+ }
98
+ obj_id_cat = {}
99
+ for exp_id, exp_dict in vid_data['expressions'].items():
100
+ obj_id = exp_dict['obj_id']
101
+ if obj_id not in obj_id_cat:
102
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
103
+ meta['obj_id_cat'] = obj_id_cat
104
+ self.metas.append(meta)
105
+
106
+ print(f"skipped {skip_vid_count} short videos")
107
+
108
+
109
+ @staticmethod
110
+ def bounding_box(img):
111
+ rows = np.any(img, axis=1)
112
+ cols = np.any(img, axis=0)
113
+ rmin, rmax = np.where(rows)[0][[0, -1]]
114
+ cmin, cmax = np.where(cols)[0][[0, -1]]
115
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
116
+
117
+ def __len__(self):
118
+ return len(self.metas)
119
+
120
+ def __getitem__(self, idx):
121
+ meta = self.metas[idx] # dict
122
+
123
+ video, sample_indx, bins, frames, obj_id_cat = \
124
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
125
+
126
+ # read frames and masks
127
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
128
+ for frame_indx in sample_indx:
129
+ frame_name = frames[frame_indx]
130
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
131
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
132
+ img = Image.open(img_path).convert('RGB')
133
+ imgs.append(img)
134
+
135
+ mask = Image.open(mask_path).convert('P')
136
+ mask = np.array(mask)
137
+ print(mask.dtype)
138
+
139
+ # create the target
140
+ for obj_id in list(obj_id_cat.keys()):
141
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
142
+ if (obj_mask > 0).any():
143
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
144
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
145
+ valid.append(1)
146
+ else: # some frame didn't contain the instance
147
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
148
+ valid.append(0)
149
+ obj_mask = torch.from_numpy(obj_mask)
150
+
151
+ # append
152
+ masks.append(obj_mask)
153
+ boxes.append(box)
154
+
155
+
156
+ # transform
157
+ w, h = img.size
158
+ boxes = torch.stack(boxes, dim=0)
159
+ boxes[:, 0::2].clamp_(min=0, max=w)
160
+ boxes[:, 1::2].clamp_(min=0, max=h)
161
+ masks = torch.stack(masks, dim=0)
162
+ target = {
163
+ 'frames_idx': sample_indx, # [T,]
164
+ 'boxes': boxes, # [T, 4], xyxy
165
+ 'masks': masks, # [T, H, W]
166
+ 'valid': torch.tensor(valid), # [T,]
167
+ 'obj_ids' : list(obj_id_cat.keys()),
168
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
169
+ 'size': torch.as_tensor([int(h), int(w)])
170
+ }
171
+
172
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
173
+ if self._transforms:
174
+ imgs, target = self._transforms(imgs, target)
175
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
176
+ else:
177
+ imgs = np.array(imgs)
178
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
179
+
180
+
181
+ # # FIXME: handle "valid", since some box may be removed due to random crop
182
+ # if torch.any(target['valid'] == 1): # at least one instance
183
+ # instance_check = True
184
+ # else:
185
+ # idx = random.randint(0, self.__len__() - 1)
186
+
187
+ return imgs, target
188
+
189
+
190
+ def make_coco_transforms(image_set, max_size=640):
191
+ normalize = T.Compose([
192
+ T.ToTensor(),
193
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
194
+ ])
195
+
196
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
197
+
198
+ if image_set == 'train':
199
+ return T.Compose([
200
+ T.RandomHorizontalFlip(),
201
+ T.PhotometricDistort(),
202
+ T.RandomSelect(
203
+ T.Compose([
204
+ T.RandomResize(scales, max_size=max_size),
205
+ T.Check(),
206
+ ]),
207
+ T.Compose([
208
+ T.RandomResize([400, 500, 600]),
209
+ T.RandomSizeCrop(384, 600),
210
+ T.RandomResize(scales, max_size=max_size),
211
+ T.Check(),
212
+ ])
213
+ ),
214
+ normalize,
215
+ ])
216
+
217
+ # we do not use the 'val' set since the annotations are inaccessible
218
+ if image_set == 'val':
219
+ return T.Compose([
220
+ T.RandomResize([360], max_size=640),
221
+ normalize,
222
+ ])
223
+
224
+ raise ValueError(f'unknown {image_set}')
225
+
226
+
227
+ def build(image_set, args):
228
+ root = Path(args.ytvos_path)
229
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
230
+ PATHS = {
231
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
232
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
233
+ }
234
+ img_folder, ann_file = PATHS[image_set]
235
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
236
+ # num_frames=args.num_frames, max_skip=args.max_skip)
237
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
238
+ num_frames=args.num_frames, max_skip=args.max_skip)
239
+ return dataset
240
+
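Both snapshots above share the same frame-sampling scheme: skip the first and last two frames, split the remaining range into four bins, and draw one random index per bin. A standalone sketch of that logic follows; the sample_four_frames name is hypothetical and only restates what prepare_metas does.

    import random

    def sample_four_frames(vid_len, seed=None):
        """Return (bins, sampled indices) following the binning used in prepare_metas above."""
        if seed is not None:
            random.seed(seed)
        start_idx, end_idx = 2, vid_len - 2          # exclude frames 0, 1 and the last two
        bin_size = (end_idx - start_idx) // 4
        bins = []
        for i in range(4):
            bin_start = start_idx + i * bin_size
            bin_end = bin_start + bin_size if i < 3 else end_idx
            bins.append((bin_start, bin_end))
        # one random index per bin, kept in temporal order
        sample_indx = sorted(random.randint(s, e - 1) for s, e in bins)
        return bins, sample_indx

    # e.g. a 30-frame clip: four bins over [2, 28) and one index drawn from each
    print(sample_four_frames(30, seed=0))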
.history/mbench/gpt_ref-ytvos-cy_20250121151408.py ADDED
@@ -0,0 +1,431 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+
47
+ # Captioner
48
+ ytvos_category_valid_list = [
49
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
50
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
51
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
52
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
53
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
54
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
55
+ ]
56
+ def getCaption(video_id, json_data):
57
+ # load data for this video
58
+ video_data = json_data[video_id]
59
+ frame_names = video_data['frame_names']
60
+ video_path = video_data['video_path']
61
+
62
+ cat_names = set()
63
+ all_captions = dict()
64
+ for obj_id in list(video_data['annotations'][0].keys()):
65
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
66
+
67
+ # cat_names : person, snowboard
68
+ # 1. ask GPT directly whether this category can be the subject of an action
69
+ # 2. keep only the category names we want to handle from the Ref-YouTube-VOS category list
70
+
71
+ for cat_name in list(cat_names) :
72
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
73
+ image_captions = {}
74
+
75
+ captioner = OpenAI()
76
+
77
+ # Step 0: can this category be the subject of an action?
78
+ is_movable = False
79
+ if cat_name in ytvos_category_valid_list :
80
+ is_movable = True
81
+
82
+ # response_check = captioner.chat.completions.create(
83
+ # model="gpt-4o",
84
+ # messages=[
85
+ # {
86
+ # "role": "user",
87
+ # "content": f"""
88
+ # Can a {cat_name} be a subject of distinct actions or movements?
89
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
90
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
91
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
92
+ # Answer only YES or NONE.
93
+ # """
94
+ # }
95
+ # ],
96
+ # )
97
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
98
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
99
+
100
+ # if response_check_content == "yes": is_movable = True
101
+
102
+ if not is_movable:
103
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
104
+ continue
105
+
106
+ for i in range(len(image_paths)):
107
+ image_path = image_paths[i]
108
+ frame_name = frame_names[i]
109
+ base64_image = encode_image(image_path)
110
+
111
+ # Step 1: filtering
112
+ print(cat_name, frame_name)
113
+ response1 = captioner.chat.completions.create(
114
+ model="gpt-4o",
115
+ messages=[
116
+ {
117
+ "role": "user",
118
+ "content": [
119
+ {
120
+ "type": "text",
121
+
122
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
123
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
124
+ Each action should be unique and clearly associated with a specific object.
125
+
126
+ Respond with YES if:
127
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
128
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
129
+
130
+ Respond with NONE if:
131
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
132
+ - Actions are ambiguous, minor, or not clearly visible.
133
+
134
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
135
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
136
+
137
+ Answer only YES or NONE."""
138
+
139
+ },
140
+ {
141
+ "type": "image_url",
142
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
143
+ },
144
+ ],
145
+ }
146
+ ],
147
+ )
148
+ response_content = response1.choices[0].message.content
149
+ should_caption = True if "yes" in response_content.lower() else False
150
+ print(f"are {cat_name}s distinguished by action: {response_content}")
151
+
152
+ # Step 2: generate a dense caption
153
+ if should_caption:
154
+ response2 = captioner.chat.completions.create(
155
+ model="gpt-4o-mini",
156
+ messages=[
157
+ {
158
+ "role": "user",
159
+ "content": [
160
+ {
161
+ "type": "text",
162
+
163
+ "text": f"""
164
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
165
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
166
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
167
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
168
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
169
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
170
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
171
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
172
+ 8. Include interactions with objects or other entities when they are prominent and observable.
173
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
174
+ Output only the caption.""",
175
+ },
176
+ {
177
+ "type": "image_url",
178
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
179
+ },
180
+ ],
181
+ }
182
+ ],
183
+ )
184
+
185
+ caption = response2.choices[0].message.content
186
+ print(f"{image_path} - {frame_name}: {caption}")
187
+ else:
188
+ caption = None
189
+
190
+ image_captions[frame_name] = caption
191
+ all_captions[cat_name] = image_captions
192
+
193
+ # final : also prepare valid object ids
194
+ valid_obj_ids = []
195
+ valid_cat_names = list(all_captions.keys())
196
+ for obj_id in list(video_data['annotations'][0].keys()):
197
+ cat = video_data['annotations'][0][obj_id]['category_name']
198
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
199
+
200
+ return all_captions, valid_obj_ids
201
+
202
+
203
+ # Referring expression generator and QA filter
204
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
205
+ # draw the object's bounding box on the image
206
+ video_data = json_data[video_id]
207
+ frame_names = video_data['frame_names']
208
+ video_path = video_data['video_path']
209
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
210
+ frame_indx = frame_names.index(frame_name)
211
+ obj_data = video_data['annotations'][frame_indx][obj_id]
212
+
213
+ bbox = obj_data['bbox']
214
+ cat_name = obj_data['category_name']
215
+ valid = obj_data['valid']
216
+
217
+ if valid == 0:
218
+ print("Object not in this frame!")
219
+ return {}
220
+
221
+
222
+ x_min, y_min, x_max, y_max = bbox
223
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
224
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
225
+ plt.figure()
226
+ plt.imshow(I)
227
+ plt.axis('off')
228
+ plt.show()
229
+
230
+ #cropped object for visibility check
231
+ cropped_I = I[y_min:y_max, x_min:x_max]
232
+ pil_cropped_I = Image.fromarray(cropped_I)
233
+ buff_crop = BytesIO()
234
+ pil_cropped_I.save(buff_crop, format='JPEG')
235
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
236
+
237
+ #entire image for referring expression generation
238
+ pil_I = Image.fromarray(I)
239
+ buff = BytesIO()
240
+ pil_I.save(buff, format='JPEG')
241
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
242
+
243
+ # check whether the object is clearly identifiable
244
+ generator = OpenAI()
245
+ response_check = generator.chat.completions.create(
246
+ model="chatgpt-4o-latest",
247
+ messages=[
248
+ {
249
+ "role": "user",
250
+ "content": [
251
+ {
252
+
253
+ "type": "text",
254
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
255
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
256
+
257
+ Guidelines:
258
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
259
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
260
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
261
+
262
+ Output only either YES or NONE.
263
+ """
264
+ },
265
+ {
266
+ "type": "image_url",
267
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
268
+ }
269
+ ]
270
+ },
271
+ ]
272
+ )
273
+
274
+ response_check_content = response_check.choices[0].message.content.strip().lower()
275
+ print(f"is object {obj_id} visible: {response_check_content}")
276
+
277
+ if "yes" not in response_check_content:
278
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
279
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
280
+
281
+ # generate the referring expression
282
+ # generator = OpenAI()
283
+ response = generator.chat.completions.create(
284
+ model="chatgpt-4o-latest",
285
+ messages=[
286
+ {
287
+ "role": "user",
288
+ "content": [
289
+ {
290
+ "type": "text",
291
+
292
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
293
+ Guidelines for creating the referring expression:
294
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
295
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
296
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
297
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
298
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
299
+ 6. Use '{cat_name}' as the noun for the referring expressions.
300
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
301
+
302
+ {caption}
303
+ """
304
+ },
305
+ {
306
+ "type": "image_url",
307
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
308
+ },
309
+ # {
310
+ # "type": "image_url",
311
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
312
+ # }
313
+ ],
314
+ }
315
+ ],
316
+ )
317
+
318
+ ref_exp = response.choices[0].message.content.strip()
319
+
320
+ #QA filtering
321
+ # QA1: does the expression describe the target object?
322
+ filter = OpenAI()
323
+ response1 = filter.chat.completions.create(
324
+ model="gpt-4o",
325
+ messages=[
326
+ {
327
+ "role": "user",
328
+ "content": [
329
+ {
330
+ "type": "text",
331
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
332
+ {ref_exp}""",
333
+ },
334
+ {
335
+ "type": "image_url",
336
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
337
+ },
338
+ ],
339
+ }
340
+ ],
341
+ )
342
+
343
+ response1_content = response1.choices[0].message.content
344
+ describesHighlighted = True if "yes" in response1_content.lower() else False
345
+
346
+ # QA2: does the expression avoid describing non-target objects?
347
+ response2 = filter.chat.completions.create(
348
+ model="gpt-4o-mini",
349
+ messages=[
350
+ {
351
+ "role": "user",
352
+ "content": [
353
+ {
354
+ "type": "text",
355
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
356
+ {ref_exp}""",
357
+ },
358
+ {
359
+ "type": "image_url",
360
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
361
+ },
362
+ ],
363
+ }
364
+ ],
365
+ )
366
+
367
+ response2_content = response2.choices[0].message.content
368
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
369
+
370
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
371
+
372
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
373
+
374
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
375
+
376
+
377
+
378
+ if __name__ == '__main__':
379
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
380
+ args = parser.parse_args()
381
+
382
+ #================== load data ===================
383
+ # # full dataset
384
+ # train_dataset = build_ytvos_ref(image_set = 'train', args = args)
385
+
386
+ # # metadata for the full dataset
387
+ # metas = train_dataset.metas
388
+
389
+ with open('mbench/sampled_frame3.json', 'r') as file:
390
+ data = json.load(file)
391
+
392
+ vid_ids = list(data.keys())
393
+
394
+ all_ref_exps = {}
395
+
396
+ #================== run GPT ===================
397
+ os.environ['OPENAI_API_KEY'] = '<REDACTED_OPENAI_API_KEY>' # do not hardcode secrets; supply the key via the environment
398
+
399
+ # for each vid_id in the dataset
400
+ for i in range(1):
401
+ vid_id = vid_ids[i]
402
+
403
+ #==== generate captions ====
404
+ captions, valid_obj_ids = getCaption(vid_id, data)
405
+ cats_in_vid = list(captions.keys())
406
+
407
+ #==== generate referring expressions and run QA filtering ====
408
+ ref_expressions = {}
409
+ # for each category
410
+ for cat_name in cats_in_vid:
411
+ if cat_name not in ref_expressions:
412
+ ref_expressions[cat_name] = {}
413
+
414
+ # for each video frame
415
+ for frame_name in data[vid_id]['frame_names']:
416
+
417
+ if frame_name not in ref_expressions[cat_name]:
418
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
419
+
420
+ caption = captions[cat_name][frame_name]
421
+
422
+ if not caption : continue
423
+ else :
424
+ # for each object id
425
+ for obj_id in valid_obj_ids:
426
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
427
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
428
+
429
+
430
+ all_ref_exps[vid_id] = ref_expressions
431
+
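Since the script relies on OPENAI_API_KEY, a safer pattern than assigning the key in code is to read it from the environment; the openai v1 client also picks the variable up automatically when no key is passed. A minimal sketch, assuming the key has been exported in the shell or injected by the job scheduler:

    import os
    from openai import OpenAI

    # Fail fast if the secret is missing instead of embedding it in the source.
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key is None:
        raise RuntimeError("Export OPENAI_API_KEY before running the captioning script.")
    client = OpenAI(api_key=api_key)   # OpenAI() with no argument reads the same variable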
.history/mbench/gpt_ref-ytvos-cy_20250121155710.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ # Captioner
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
55
+ def getCaption(video_id, json_data):
56
+ # load data for this video
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. ask GPT directly whether this category can be the subject of an action
68
+ # 2. keep only the category names we want to handle from the Ref-YouTube-VOS category list
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ # Step 0: can this category be the subject of an action?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ # Step 1: filtering
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ # Step 2: generate a dense caption
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
203
+
204
+ # 이미지에 해당 물체 바운딩 박스 그리기
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # check whether the object is clearly identifiable
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # generate the referring expression
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
318
+
319
+ #QA filtering
320
+ # QA1: does the expression describe the target object?
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ # QA2: does the expression avoid describing non-target objects?
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '<REDACTED_OPENAI_API_KEY>' # do not hardcode secrets; supply the key via the environment
386
+
387
+ # for each vid_id in the dataset
388
+ for i in range(50):
389
+ vid_id = vid_ids[i]
390
+
391
+ #==== generate captions ====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ #==== generate referring expressions and run QA filtering ====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # for each category
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # for each video frame
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # for each object id
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised.json', 'w') as file:
423
+ json.dump(all_ref_exps, file, indent=4)
424
+
425
+
426
+
427
+
428
+
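The captioner, the visibility check, the referring-expression generator, and both QA filters above all send the same [text, image_url] message layout with a base64-encoded frame. A compact helper capturing that pattern (the image_message name is hypothetical and not part of the commit):

    import base64
    from openai import OpenAI

    def image_message(prompt, image_path):
        """Build the [text, image_url] content list used by the chat calls above."""
        with open(image_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        return [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
        ]

    # Example call mirroring the YES/NONE filtering step.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": image_message("Answer only YES or NONE.", "frame.jpg")}],
    )
    print(response.choices[0].message.content)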
.history/mbench/gpt_ref-ytvos-revised_20250121155717.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
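+ # encode_image returns a bare base64 string; callers embed it in a data URL of the form
+ # f"data:image/jpeg;base64,{...}" before passing it to the vision-capable chat model.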
45
+
46
+ # Captioner
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
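+ # getCaption: for every category present in the first frame's annotations, (1) skip
+ # categories outside the valid list above, (2) ask the model per frame whether multiple
+ # instances perform distinct actions, and (3) if so, request a dense action-centric
+ # caption for that frame. Returns {category: {frame: caption}} plus the valid object ids.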
55
+ def getCaption(video_id, json_data):
56
+ # Fetch the data
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. Ask GPT directly whether the category can be the subject of an action
68
+ # 2. From the category list provided by ref-youtube-vos, keep only the category names we want to handle
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ # Step 0: can this category be the subject of an action?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ # Step 1: filtering
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ # Step 2: generate the dense caption
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
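+ # getRefExp below works frame-by-frame: draw the target's red box, verify the cropped
+ # object is identifiable, generate a box-grounded referring expression from the dense
+ # caption, then run two YES/NO checks (it describes the boxed object; it does not
+ # describe the non-boxed ones) to set isValid.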
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
203
+
204
+ # Draw the target object's bounding box on the image
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
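+ # Note: the frame is loaded as an RGB array, so the (R, G, B) tuple passed to
+ # cv2.rectangle below is interpreted as-is; on older scikit-image releases, skimage.io
+ # may additionally need an explicit `from skimage import io`.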
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # Check whether the object is clearly identifiable
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # Generate the referring expression
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
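+ # The expression is kept (isValid=True) only if QA1 answers YES (it matches the boxed
+ # object) and QA2 answers NO (it does not also fit a non-boxed instance of the same category).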
318
+
319
+ #QA filtering
320
+ # QA1: does the expression describe the intended object?
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ # QA2: does the expression avoid describing unintended objects?
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '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'
386
+
387
+ # For each vid_id in the whole dataset
388
+ for i in range(1):
389
+ vid_id = vid_ids[i]
390
+
391
+ #==== Generate captions ====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ #==== Generate referring expressions and run QA filtering ====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # For each category
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # For each video frame
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # For each object id
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised.json', 'w') as file:
423
+ json.dump(all_ref_exps, file, indent=4)
424
+
425
+
426
+
427
+
428
+
.history/mbench/gpt_ref-ytvos-revised_20250121155956.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ # Captioner
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
55
+ def getCaption(video_id, json_data):
56
+ #데이터 가져오기
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
68
+ # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ #0단계: action의 대상이 될 수 있는가?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ #1단계: 필터링
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ #2단계: dense caption 만들기
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
203
+
204
+ # 이미지에 해당 물체 바운딩 박스 그리기
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # 구분 가능 여부 확인
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # Referring expression 만들기
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
318
+
319
+ #QA filtering
320
+ #QA1: 원하는 물체를 설명하는지
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ #QA2: 원하지 않는 물체를 설명하지 않는지
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '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'
386
+
387
+ # 전체 데이터셋의 vid_id에 대해
388
+ for i in range(1):
389
+ vid_id = vid_ids[i]
390
+
391
+ #====캡션 만들기====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ #====referring expression 만들고 QA filtering====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # 각 카테고리별로
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # 각 비디오 프레임 별로
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # 각 obj id별로
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised.json', 'w') as file:
423
+ json.dump(all_ref_exps, file, indent=4)
424
+
425
+
426
+
427
+
428
+
.history/mbench/gpt_ref-ytvos-revised_20250121160813.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ # Captioner
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
55
+ def getCaption(video_id, json_data):
56
+ #데이터 가져오기
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
68
+ # 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ #0단계: action의 대상이 될 수 있는가?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ #1단계: 필터링
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ #2단계: dense caption 만들기
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
203
+
204
+ # 이미지에 해당 물체 바운딩 박스 그리기
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # 구분 가능 여부 확인
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # Referring expression 만들기
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
318
+
319
+ #QA filtering
320
+ #QA1: 원하는 물체를 설명하는지
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ #QA2: 원하지 않는 물체를 설명하지 않는지
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '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'
386
+
387
+ # 전체 데이터셋의 vid_id에 대해
388
+ for i in range(50):
389
+ vid_id = vid_ids[i]
390
+
391
+ #====캡션 만들기====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ #====referring expression 만들고 QA filtering====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # 각 카테고리별로
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # 각 비디오 프레임 별로
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # 각 obj id별로
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised.json', 'w') as file:
423
+ json.dump(all_ref_exps, file, indent=4)
424
+
425
+
426
+
427
+
428
+
.history/mbench/gpt_ref-ytvos_20250119070213.py ADDED
@@ -0,0 +1,277 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ import os
8
+ from os import path as osp
9
+ import skimage
10
+ from io import BytesIO
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import regex as re
15
+ import json
16
+
17
+ import cv2
18
+ from PIL import Image, ImageDraw
19
+ import torch
20
+ from torchvision.transforms import functional as F
21
+
22
+ from skimage import measure # (pip install scikit-image)
23
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
24
+
25
+ import matplotlib.pyplot as plt
26
+ import matplotlib.patches as patches
27
+ from matplotlib.collections import PatchCollection
28
+ from matplotlib.patches import Rectangle
29
+
30
+
31
+ import ipywidgets as widgets
32
+ from IPython.display import display, clear_output
33
+
34
+ from openai import OpenAI
35
+ import base64
36
+
37
+ os.environ['OPENAI_API_KEY'] = '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'
38
+
39
+ # Function to encode the image
40
+ def encode_image(image_path):
41
+ with open(image_path, "rb") as image_file:
42
+ return base64.b64encode(image_file.read()).decode("utf-8")
43
+
44
+ def getCaption(video_id, json_data):
45
+ # Fetch the data
46
+ video_data = json_data[video_id]
47
+ frame_names = video_data['frame_names']
48
+ video_path = video_data['video_path']
49
+
50
+ cat_names = set()
51
+ for obj_id in list(video_data['annotations'][0].keys()):
52
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
53
+
54
+ if len(cat_names) == 1:
55
+ cat_name = next(iter(cat_names))
56
+ else:
57
+ print("more than 2 categories")
58
+ return -1
59
+
60
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
61
+ image_captions = {}
62
+
63
+ captioner = OpenAI()
64
+ for i in range(len(image_paths)):
65
+ image_path = image_paths[i]
66
+ frame_name = frame_names[i]
67
+ base64_image = encode_image(image_path)
68
+
69
+ # Step 1: filtering
70
+ response1 = captioner.chat.completions.create(
71
+ model="gpt-4o-mini",
72
+ messages=[
73
+ {
74
+ "role": "user",
75
+ "content": [
76
+ {
77
+ "type": "text",
78
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
79
+ },
80
+ {
81
+ "type": "image_url",
82
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
83
+ },
84
+ ],
85
+ }
86
+ ],
87
+ )
88
+ response_content = response1.choices[0].message.content
89
+ should_caption = True if "yes" in response_content.lower() else False
90
+
91
+ # Step 2: generate the dense caption
92
+ if should_caption:
93
+ response2 = captioner.chat.completions.create(
94
+ model="gpt-4o-mini",
95
+ messages=[
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ {
100
+ "type": "text",
101
+ "text": f"""
102
+ Describe the image in detail focusing on the {cat_name}s' actions.
103
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
104
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
105
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
106
+ 4. Do not include actions that needs to be guessed or suggested.""",
107
+ },
108
+ {
109
+ "type": "image_url",
110
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
111
+ },
112
+ ],
113
+ }
114
+ ],
115
+ )
116
+
117
+ caption = response2.choices[0].message.content
118
+ else:
119
+ caption = None
120
+
121
+ image_captions[frame_name] = caption
122
+ return image_captions
123
+
124
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
125
+ # Draw the target object's bounding box on the image
126
+ video_data = json_data[video_id]
127
+ frame_names = video_data['frame_names']
128
+ video_path = video_data['video_path']
129
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
130
+ frame_indx = frame_names.index(frame_name)
131
+ obj_data = video_data['annotations'][frame_indx][obj_id]
132
+
133
+ bbox = obj_data['bbox']
134
+ cat_name = obj_data['category_name']
135
+ valid = obj_data['valid']
136
+
137
+ if valid == 0:
138
+ print("Object not in this frame!")
139
+ return {}
140
+
141
+
142
+ x_min, y_min, x_max, y_max = bbox
143
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
144
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
145
+ plt.figure()
146
+ plt.imshow(I)
147
+ plt.axis('off')
148
+ plt.show()
149
+ pil_I = Image.fromarray(I)
150
+ buff = BytesIO()
151
+ pil_I.save(buff, format='JPEG')
152
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
153
+
154
+ # Generate the referring expression
155
+ generator = OpenAI()
156
+ response = generator.chat.completions.create(
157
+ model="gpt-4o-mini",
158
+ messages=[
159
+ {
160
+ "role": "user",
161
+ "content": [
162
+ {
163
+ "type": "text",
164
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
165
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
166
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
167
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
168
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
169
+ 5. Use '{cat_name}' as the noun for the referring expressions.
170
+ Output only the referring expression.
171
+ {caption}""",
172
+ },
173
+ {
174
+ "type": "image_url",
175
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
176
+ },
177
+ ],
178
+ }
179
+ ],
180
+ )
181
+
182
+ ref_exp = response.choices[0].message.content
183
+
184
+ #QA filtering
185
+ # QA1: does the expression describe the intended object?
186
+ filter = OpenAI()
187
+ response1 = filter.chat.completions.create(
188
+ model="gpt-4o-mini",
189
+ messages=[
190
+ {
191
+ "role": "user",
192
+ "content": [
193
+ {
194
+ "type": "text",
195
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
196
+ {ref_exp}""",
197
+ },
198
+ {
199
+ "type": "image_url",
200
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
201
+ },
202
+ ],
203
+ }
204
+ ],
205
+ )
206
+
207
+ response1_content = response1.choices[0].message.content
208
+ describesHighlighted = True if "yes" in response1_content.lower() else False
209
+
210
+ # QA2: does the expression avoid describing unintended objects?
211
+ response2 = filter.chat.completions.create(
212
+ model="gpt-4o-mini",
213
+ messages=[
214
+ {
215
+ "role": "user",
216
+ "content": [
217
+ {
218
+ "type": "text",
219
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
220
+ {ref_exp}""",
221
+ },
222
+ {
223
+ "type": "image_url",
224
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
225
+ },
226
+ ],
227
+ }
228
+ ],
229
+ )
230
+
231
+ response2_content = response2.choices[0].message.content
232
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
233
+
234
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
235
+
236
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
237
+
238
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
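+ # Note: isValid is True only when QA1 confirms the expression describes the
+ # highlighted object and QA2 confirms it does not describe a non-highlighted one
+ # (the QA2 prompt above is phrased in terms of "the person" rather than the category name).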
239
+
240
+ def createRefExp(video_id, json_data):
241
+ video_data = json_data[video_id]
242
+ obj_ids = list(video_data['annotations'][0].keys())
243
+ frame_names = video_data['frame_names']
244
+
245
+ captions_per_frame = getCaption(video_id, json_data)
246
+
247
+ if captions_per_frame == -1:
248
+ print("There is more than one category")
249
+ return
250
+
251
+
252
+ video_ref_exps = {}
253
+
254
+ for frame_name in frame_names:
255
+ frame_caption = captions_per_frame[frame_name]
256
+
257
+ if frame_caption == None:
258
+ video_ref_exps[frame_name] = None
259
+
260
+ else:
261
+ frame_ref_exps = {}
262
+ for obj_id in obj_ids:
263
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
264
+ frame_ref_exps[obj_id] = exp_per_obj
265
+ video_ref_exps[frame_name] = frame_ref_exps
266
+
267
+ return video_ref_exps
268
+
269
+ if __name__ == '__main__':
270
+ with open('mbench/sampled_frame3.json', 'r') as file:
271
+ data = json.load(file)
272
+
273
+ all_video_refs = {}
274
+ for i in range(10):
275
+ video_id = list(data.keys())[i]
276
+ video_ref = createRefExp(video_id, data)
277
+ all_video_refs[video_id] = video_ref
.history/mbench/gpt_ref-ytvos_20250119070707.py ADDED
@@ -0,0 +1,282 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ import os
8
+ from os import path as osp
9
+ import skimage
10
+ from io import BytesIO
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import regex as re
15
+ import json
16
+
17
+ import cv2
18
+ from PIL import Image, ImageDraw
19
+ import torch
20
+ from torchvision.transforms import functional as F
21
+
22
+ from skimage import measure # (pip install scikit-image)
23
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
24
+
25
+ import matplotlib.pyplot as plt
26
+ import matplotlib.patches as patches
27
+ from matplotlib.collections import PatchCollection
28
+ from matplotlib.patches import Rectangle
29
+
30
+
31
+ import ipywidgets as widgets
32
+ from IPython.display import display, clear_output
33
+
34
+ from openai import OpenAI
35
+ import base64
36
+
37
+ os.environ['OPENAI_API_KEY'] = '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'
38
+
39
+ # Function to encode the image
40
+ def encode_image(image_path):
41
+ with open(image_path, "rb") as image_file:
42
+ return base64.b64encode(image_file.read()).decode("utf-8")
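+ # The base64 string returned here is embedded directly into the "image_url"
+ # payload of the chat-completion requests below, e.g.
+ # {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"}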
43
+
44
+ def getCaption(video_id, json_data):
45
+ # Load the data
46
+ video_data = json_data[video_id]
47
+ frame_names = video_data['frame_names']
48
+ video_path = video_data['video_path']
49
+
50
+ cat_names = set()
51
+ for obj_id in list(video_data['annotations'][0].keys()):
52
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
53
+
54
+ if len(cat_names) == 1:
55
+ cat_name = next(iter(cat_names))
56
+ else:
57
+ print("more than one category")
58
+ return -1
59
+
60
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
61
+ image_captions = {}
62
+
63
+ captioner = OpenAI()
64
+ for i in range(len(image_paths)):
65
+ image_path = image_paths[i]
66
+ frame_name = frame_names[i]
67
+ base64_image = encode_image(image_path)
68
+
69
+ # Step 1: filtering
70
+ response1 = captioner.chat.completions.create(
71
+ model="gpt-4o-mini",
72
+ messages=[
73
+ {
74
+ "role": "user",
75
+ "content": [
76
+ {
77
+ "type": "text",
78
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
79
+ },
80
+ {
81
+ "type": "image_url",
82
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
83
+ },
84
+ ],
85
+ }
86
+ ],
87
+ )
88
+ response_content = response1.choices[0].message.content
89
+ should_caption = True if "yes" in response_content.lower() else False
90
+
91
+ # Step 2: generate a dense caption
92
+ if should_caption:
93
+ response2 = captioner.chat.completions.create(
94
+ model="gpt-4o-mini",
95
+ messages=[
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ {
100
+ "type": "text",
101
+ "text": f"""
102
+ Describe the image in detail focusing on the {cat_name}s' actions.
103
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
104
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
105
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
106
+ 4. Do not include actions that need to be guessed or suggested.""",
107
+ },
108
+ {
109
+ "type": "image_url",
110
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
111
+ },
112
+ ],
113
+ }
114
+ ],
115
+ )
116
+
117
+ caption = response2.choices[0].message.content
118
+ else:
119
+ caption = None
120
+
121
+ image_captions[frame_name] = caption
122
+ return image_captions
123
+
124
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
125
+ # Draw the object's bounding box on the image
126
+ video_data = json_data[video_id]
127
+ frame_names = video_data['frame_names']
128
+ video_path = video_data['video_path']
129
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
130
+ frame_indx = frame_names.index(frame_name)
131
+ obj_data = video_data['annotations'][frame_indx][obj_id]
132
+
133
+ bbox = obj_data['bbox']
134
+ cat_name = obj_data['category_name']
135
+ valid = obj_data['valid']
136
+
137
+ if valid == 0:
138
+ print("Object not in this frame!")
139
+ return {}
140
+
141
+
142
+ x_min, y_min, x_max, y_max = bbox
143
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
144
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
145
+ plt.figure()
146
+ plt.imshow(I)
147
+ plt.axis('off')
148
+ plt.show()
149
+ pil_I = Image.fromarray(I)
150
+ buff = BytesIO()
151
+ pil_I.save(buff, format='JPEG')
152
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
153
+
154
+ # Generate the referring expression
155
+ generator = OpenAI()
156
+ response = generator.chat.completions.create(
157
+ model="gpt-4o-mini",
158
+ messages=[
159
+ {
160
+ "role": "user",
161
+ "content": [
162
+ {
163
+ "type": "text",
164
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
165
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
166
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
167
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
168
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
169
+ 5. Use '{cat_name}' as the noun for the referring expressions.
170
+ Output only the referring expression.
171
+ {caption}""",
172
+ },
173
+ {
174
+ "type": "image_url",
175
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
176
+ },
177
+ ],
178
+ }
179
+ ],
180
+ )
181
+
182
+ ref_exp = response.choices[0].message.content
183
+
184
+ #QA filtering
185
+ # QA1: does the expression describe the intended object?
186
+ filter = OpenAI()
187
+ response1 = filter.chat.completions.create(
188
+ model="gpt-4o-mini",
189
+ messages=[
190
+ {
191
+ "role": "user",
192
+ "content": [
193
+ {
194
+ "type": "text",
195
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
196
+ {ref_exp}""",
197
+ },
198
+ {
199
+ "type": "image_url",
200
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
201
+ },
202
+ ],
203
+ }
204
+ ],
205
+ )
206
+
207
+ response1_content = response1.choices[0].message.content
208
+ describesHighlighted = True if "yes" in response1_content.lower() else False
209
+
210
+ # QA2: does the expression avoid describing unintended objects?
211
+ response2 = filter.chat.completions.create(
212
+ model="gpt-4o-mini",
213
+ messages=[
214
+ {
215
+ "role": "user",
216
+ "content": [
217
+ {
218
+ "type": "text",
219
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
220
+ {ref_exp}""",
221
+ },
222
+ {
223
+ "type": "image_url",
224
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
225
+ },
226
+ ],
227
+ }
228
+ ],
229
+ )
230
+
231
+ response2_content = response2.choices[0].message.content
232
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
233
+
234
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
235
+
236
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
237
+
238
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
239
+
240
+ def createRefExp(video_id, json_data):
241
+ video_data = json_data[video_id]
242
+ obj_ids = list(video_data['annotations'][0].keys())
243
+ frame_names = video_data['frame_names']
244
+
245
+ captions_per_frame = getCaption(video_id, json_data)
246
+
247
+ if captions_per_frame == -1:
248
+ print("There is more than one category")
249
+ return
250
+
251
+
252
+ video_ref_exps = {}
253
+
254
+ for frame_name in frame_names:
255
+ frame_caption = captions_per_frame[frame_name]
256
+
257
+ if frame_caption == None:
258
+ video_ref_exps[frame_name] = None
259
+
260
+ else:
261
+ frame_ref_exps = {}
262
+ for obj_id in obj_ids:
263
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
264
+ frame_ref_exps[obj_id] = exp_per_obj
265
+ video_ref_exps[frame_name] = frame_ref_exps
266
+
267
+ return video_ref_exps
268
+
269
+ if __name__ == '__main__':
270
+ with open('mbench/sampled_frame3.json', 'r') as file:
271
+ data = json.load(file)
272
+
273
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
274
+ manual_select = list(file)
275
+ for frame in manual_select:
276
+ result = json.loads(frame)
277
+
278
+ all_video_refs = {}
279
+ for i in range(10):
280
+ video_id = list(data.keys())[i]
281
+ video_ref = createRefExp(video_id, data)
282
+ all_video_refs[video_id] = video_ref
.history/mbench/gpt_ref-ytvos_20250119070824.py ADDED
@@ -0,0 +1,286 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ import os
8
+ from os import path as osp
9
+ import skimage
10
+ from io import BytesIO
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import regex as re
15
+ import json
16
+
17
+ import cv2
18
+ from PIL import Image, ImageDraw
19
+ import torch
20
+ from torchvision.transforms import functional as F
21
+
22
+ from skimage import measure # (pip install scikit-image)
23
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
24
+
25
+ import matplotlib.pyplot as plt
26
+ import matplotlib.patches as patches
27
+ from matplotlib.collections import PatchCollection
28
+ from matplotlib.patches import Rectangle
29
+
30
+
31
+ import ipywidgets as widgets
32
+ from IPython.display import display, clear_output
33
+
34
+ from openai import OpenAI
35
+ import base64
36
+
37
+ os.environ['OPENAI_API_KEY'] = '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'
38
+
39
+ # Function to encode the image
40
+ def encode_image(image_path):
41
+ with open(image_path, "rb") as image_file:
42
+ return base64.b64encode(image_file.read()).decode("utf-8")
43
+
44
+ def getCaption(video_id, json_data):
45
+ # Load the data
46
+ video_data = json_data[video_id]
47
+ frame_names = video_data['frame_names']
48
+ video_path = video_data['video_path']
49
+
50
+ cat_names = set()
51
+ for obj_id in list(video_data['annotations'][0].keys()):
52
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
53
+
54
+ if len(cat_names) == 1:
55
+ cat_name = next(iter(cat_names))
56
+ else:
57
+ print("more than one category")
58
+ return -1
59
+
60
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
61
+ image_captions = {}
62
+
63
+ captioner = OpenAI()
64
+ for i in range(len(image_paths)):
65
+ image_path = image_paths[i]
66
+ frame_name = frame_names[i]
67
+ base64_image = encode_image(image_path)
68
+
69
+ # Step 1: filtering
70
+ response1 = captioner.chat.completions.create(
71
+ model="gpt-4o-mini",
72
+ messages=[
73
+ {
74
+ "role": "user",
75
+ "content": [
76
+ {
77
+ "type": "text",
78
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
79
+ },
80
+ {
81
+ "type": "image_url",
82
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
83
+ },
84
+ ],
85
+ }
86
+ ],
87
+ )
88
+ response_content = response1.choices[0].message.content
89
+ should_caption = True if "yes" in response_content.lower() else False
90
+
91
+ # Step 2: generate a dense caption
92
+ if should_caption:
93
+ response2 = captioner.chat.completions.create(
94
+ model="gpt-4o-mini",
95
+ messages=[
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ {
100
+ "type": "text",
101
+ "text": f"""
102
+ Describe the image in detail focusing on the {cat_name}s' actions.
103
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
104
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
105
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
106
+ 4. Do not include actions that need to be guessed or suggested.""",
107
+ },
108
+ {
109
+ "type": "image_url",
110
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
111
+ },
112
+ ],
113
+ }
114
+ ],
115
+ )
116
+
117
+ caption = response2.choices[0].message.content
118
+ else:
119
+ caption = None
120
+
121
+ image_captions[frame_name] = caption
122
+ return image_captions
123
+
124
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
125
+ # Draw the object's bounding box on the image
126
+ video_data = json_data[video_id]
127
+ frame_names = video_data['frame_names']
128
+ video_path = video_data['video_path']
129
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
130
+ frame_indx = frame_names.index(frame_name)
131
+ obj_data = video_data['annotations'][frame_indx][obj_id]
132
+
133
+ bbox = obj_data['bbox']
134
+ cat_name = obj_data['category_name']
135
+ valid = obj_data['valid']
136
+
137
+ if valid == 0:
138
+ print("Object not in this frame!")
139
+ return {}
140
+
141
+
142
+ x_min, y_min, x_max, y_max = bbox
143
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
144
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
145
+ plt.figure()
146
+ plt.imshow(I)
147
+ plt.axis('off')
148
+ plt.show()
149
+ pil_I = Image.fromarray(I)
150
+ buff = BytesIO()
151
+ pil_I.save(buff, format='JPEG')
152
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
153
+
154
+ # Generate the referring expression
155
+ generator = OpenAI()
156
+ response = generator.chat.completions.create(
157
+ model="gpt-4o-mini",
158
+ messages=[
159
+ {
160
+ "role": "user",
161
+ "content": [
162
+ {
163
+ "type": "text",
164
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
165
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
166
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
167
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
168
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
169
+ 5. Use '{cat_name}' as the noun for the referring expressions.
170
+ Output only the referring expression.
171
+ {caption}""",
172
+ },
173
+ {
174
+ "type": "image_url",
175
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
176
+ },
177
+ ],
178
+ }
179
+ ],
180
+ )
181
+
182
+ ref_exp = response.choices[0].message.content
183
+
184
+ #QA filtering
185
+ # QA1: does the expression describe the intended object?
186
+ filter = OpenAI()
187
+ response1 = filter.chat.completions.create(
188
+ model="gpt-4o-mini",
189
+ messages=[
190
+ {
191
+ "role": "user",
192
+ "content": [
193
+ {
194
+ "type": "text",
195
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
196
+ {ref_exp}""",
197
+ },
198
+ {
199
+ "type": "image_url",
200
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
201
+ },
202
+ ],
203
+ }
204
+ ],
205
+ )
206
+
207
+ response1_content = response1.choices[0].message.content
208
+ describesHighlighted = True if "yes" in response1_content.lower() else False
209
+
210
+ # QA2: does the expression avoid describing unintended objects?
211
+ response2 = filter.chat.completions.create(
212
+ model="gpt-4o-mini",
213
+ messages=[
214
+ {
215
+ "role": "user",
216
+ "content": [
217
+ {
218
+ "type": "text",
219
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
220
+ {ref_exp}""",
221
+ },
222
+ {
223
+ "type": "image_url",
224
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
225
+ },
226
+ ],
227
+ }
228
+ ],
229
+ )
230
+
231
+ response2_content = response2.choices[0].message.content
232
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
233
+
234
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
235
+
236
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
237
+
238
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
239
+
240
+ def createRefExp(video_id, json_data):
241
+ video_data = json_data[video_id]
242
+ obj_ids = list(video_data['annotations'][0].keys())
243
+ frame_names = video_data['frame_names']
244
+
245
+ captions_per_frame = getCaption(video_id, json_data)
246
+
247
+ if captions_per_frame == -1:
248
+ print("There is more than one category")
249
+ return
250
+
251
+
252
+ video_ref_exps = {}
253
+
254
+ for frame_name in frame_names:
255
+ frame_caption = captions_per_frame[frame_name]
256
+
257
+ if frame_caption == None:
258
+ video_ref_exps[frame_name] = None
259
+
260
+ else:
261
+ frame_ref_exps = {}
262
+ for obj_id in obj_ids:
263
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
264
+ frame_ref_exps[obj_id] = exp_per_obj
265
+ video_ref_exps[frame_name] = frame_ref_exps
266
+
267
+ return video_ref_exps
268
+
269
+ if __name__ == '__main__':
270
+ with open('mbench/sampled_frame3.json', 'r') as file:
271
+ data = json.load(file)
272
+
273
+ videos = set()
274
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
275
+ manual_select = list(file)
276
+ for frame in manual_select:
277
+ result = json.loads(frame)
278
+ videos.add(result['video'])
279
+ videos = list(videos)
280
+
281
+
282
+ all_video_refs = {}
283
+ for i in range(1):
284
+ video_id = videos[i]
285
+ video_ref = createRefExp(video_id, data)
286
+ all_video_refs[video_id] = video_ref
.history/mbench/gpt_ref-ytvos_20250119071214.py ADDED
@@ -0,0 +1,290 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ import os
8
+ from os import path as osp
9
+ import skimage
10
+ from io import BytesIO
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import regex as re
15
+ import json
16
+
17
+ import cv2
18
+ from PIL import Image, ImageDraw
19
+ import torch
20
+ from torchvision.transforms import functional as F
21
+
22
+ from skimage import measure # (pip install scikit-image)
23
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
24
+
25
+ import matplotlib.pyplot as plt
26
+ import matplotlib.patches as patches
27
+ from matplotlib.collections import PatchCollection
28
+ from matplotlib.patches import Rectangle
29
+
30
+
31
+ import ipywidgets as widgets
32
+ from IPython.display import display, clear_output
33
+
34
+ from openai import OpenAI
35
+ import base64
36
+
37
+ os.environ['OPENAI_API_KEY'] = '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'
38
+
39
+ # Function to encode the image
40
+ def encode_image(image_path):
41
+ with open(image_path, "rb") as image_file:
42
+ return base64.b64encode(image_file.read()).decode("utf-8")
43
+
44
+ def getCaption(video_id, json_data):
45
+ # Load the data
46
+ video_data = json_data[video_id]
47
+ frame_names = video_data['frame_names']
48
+ video_path = video_data['video_path']
49
+
50
+ cat_names = set()
51
+ for obj_id in list(video_data['annotations'][0].keys()):
52
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
53
+
54
+ if len(cat_names) == 1:
55
+ cat_name = next(iter(cat_names))
56
+ else:
57
+ print("more than one category")
58
+ return -1
59
+
60
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
61
+ image_captions = {}
62
+
63
+ captioner = OpenAI()
64
+ for i in range(len(image_paths)):
65
+ image_path = image_paths[i]
66
+ frame_name = frame_names[i]
67
+ base64_image = encode_image(image_path)
68
+
69
+ # Step 1: filtering
70
+ response1 = captioner.chat.completions.create(
71
+ model="gpt-4o-mini",
72
+ messages=[
73
+ {
74
+ "role": "user",
75
+ "content": [
76
+ {
77
+ "type": "text",
78
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
79
+ },
80
+ {
81
+ "type": "image_url",
82
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
83
+ },
84
+ ],
85
+ }
86
+ ],
87
+ )
88
+ response_content = response1.choices[0].message.content
89
+ should_caption = True if "yes" in response_content.lower() else False
90
+
91
+ # Step 2: generate a dense caption
92
+ if should_caption:
93
+ response2 = captioner.chat.completions.create(
94
+ model="gpt-4o-mini",
95
+ messages=[
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ {
100
+ "type": "text",
101
+ "text": f"""
102
+ Describe the image in detail focusing on the {cat_name}s' actions.
103
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
104
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
105
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
106
+ 4. Do not include actions that need to be guessed or suggested.""",
107
+ },
108
+ {
109
+ "type": "image_url",
110
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
111
+ },
112
+ ],
113
+ }
114
+ ],
115
+ )
116
+
117
+ caption = response2.choices[0].message.content
118
+ else:
119
+ caption = None
120
+
121
+ image_captions[frame_name] = caption
122
+ return image_captions
123
+
124
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
125
+ # Draw the object's bounding box on the image
126
+ video_data = json_data[video_id]
127
+ frame_names = video_data['frame_names']
128
+ video_path = video_data['video_path']
129
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
130
+ frame_indx = frame_names.index(frame_name)
131
+ obj_data = video_data['annotations'][frame_indx][obj_id]
132
+
133
+ bbox = obj_data['bbox']
134
+ cat_name = obj_data['category_name']
135
+ valid = obj_data['valid']
136
+
137
+ if valid == 0:
138
+ print("Object not in this frame!")
139
+ return {}
140
+
141
+
142
+ x_min, y_min, x_max, y_max = bbox
143
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
144
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
145
+ plt.figure()
146
+ plt.imshow(I)
147
+ plt.axis('off')
148
+ plt.show()
149
+ pil_I = Image.fromarray(I)
150
+ buff = BytesIO()
151
+ pil_I.save(buff, format='JPEG')
152
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
153
+
154
+ # Generate the referring expression
155
+ generator = OpenAI()
156
+ response = generator.chat.completions.create(
157
+ model="gpt-4o-mini",
158
+ messages=[
159
+ {
160
+ "role": "user",
161
+ "content": [
162
+ {
163
+ "type": "text",
164
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
165
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
166
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
167
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
168
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
169
+ 5. Use '{cat_name}' as the noun for the referring expressions.
170
+ Output only the referring expression.
171
+ {caption}""",
172
+ },
173
+ {
174
+ "type": "image_url",
175
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
176
+ },
177
+ ],
178
+ }
179
+ ],
180
+ )
181
+
182
+ ref_exp = response.choices[0].message.content
183
+
184
+ #QA filtering
185
+ # QA1: does the expression describe the intended object?
186
+ filter = OpenAI()
187
+ response1 = filter.chat.completions.create(
188
+ model="gpt-4o-mini",
189
+ messages=[
190
+ {
191
+ "role": "user",
192
+ "content": [
193
+ {
194
+ "type": "text",
195
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
196
+ {ref_exp}""",
197
+ },
198
+ {
199
+ "type": "image_url",
200
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
201
+ },
202
+ ],
203
+ }
204
+ ],
205
+ )
206
+
207
+ response1_content = response1.choices[0].message.content
208
+ describesHighlighted = True if "yes" in response1_content.lower() else False
209
+
210
+ # QA2: does the expression avoid describing unintended objects?
211
+ response2 = filter.chat.completions.create(
212
+ model="gpt-4o-mini",
213
+ messages=[
214
+ {
215
+ "role": "user",
216
+ "content": [
217
+ {
218
+ "type": "text",
219
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
220
+ {ref_exp}""",
221
+ },
222
+ {
223
+ "type": "image_url",
224
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
225
+ },
226
+ ],
227
+ }
228
+ ],
229
+ )
230
+
231
+ response2_content = response2.choices[0].message.content
232
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
233
+
234
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
235
+
236
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
237
+
238
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
239
+
240
+ def createRefExp(video_id, json_data):
241
+ video_data = json_data[video_id]
242
+ obj_ids = list(video_data['annotations'][0].keys())
243
+ frame_names = video_data['frame_names']
244
+
245
+ captions_per_frame = getCaption(video_id, json_data)
246
+
247
+ if captions_per_frame == -1:
248
+ print("There is more than one category")
249
+ return
250
+
251
+
252
+ video_ref_exps = {}
253
+
254
+ for frame_name in frame_names:
255
+ frame_caption = captions_per_frame[frame_name]
256
+
257
+ if frame_caption == None:
258
+ video_ref_exps[frame_name] = None
259
+
260
+ else:
261
+ frame_ref_exps = {}
262
+ for obj_id in obj_ids:
263
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
264
+ frame_ref_exps[obj_id] = exp_per_obj
265
+ video_ref_exps[frame_name] = frame_ref_exps
266
+
267
+ return video_ref_exps
268
+
269
+ if __name__ == '__main__':
270
+ with open('mbench/sampled_frame3.json', 'r') as file:
271
+ data = json.load(file)
272
+
273
+ videos = set()
274
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
275
+ manual_select = list(file)
276
+ for frame in manual_select:
277
+ result = json.loads(frame)
278
+ videos.add(result['video'])
279
+ videos = list(videos)
280
+
281
+
282
+ all_video_refs = {}
283
+ for i in range(1):
284
+ video_id = videos[i]
285
+ video_ref = createRefExp(video_id, data)
286
+ all_video_refs[video_id] = video_ref
287
+
288
+ json_obj = json.dumps(all_video_refs, indent=4)
289
+ with open('mbench/result.json', 'w') as file:
290
+ file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119073250.py ADDED
@@ -0,0 +1,292 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ def getCaption(video_id, json_data):
47
+ # Load the data
48
+ video_data = json_data[video_id]
49
+ frame_names = video_data['frame_names']
50
+ video_path = video_data['video_path']
51
+
52
+ cat_names = set()
53
+ for obj_id in list(video_data['annotations'][0].keys()):
54
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
55
+
56
+ if len(cat_names) == 1:
57
+ cat_name = next(iter(cat_names))
58
+ else:
59
+ print("more than one category")
60
+ return -1
61
+
62
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
63
+ image_captions = {}
64
+
65
+ captioner = OpenAI()
66
+ for i in range(len(image_paths)):
67
+ image_path = image_paths[i]
68
+ frame_name = frame_names[i]
69
+ base64_image = encode_image(image_path)
70
+
71
+ # Step 1: filtering
72
+ response1 = captioner.chat.completions.create(
73
+ model="gpt-4o-mini",
74
+ messages=[
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "type": "text",
80
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
81
+ },
82
+ {
83
+ "type": "image_url",
84
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
85
+ },
86
+ ],
87
+ }
88
+ ],
89
+ )
90
+ response_content = response1.choices[0].message.content
91
+ should_caption = True if "yes" in response_content.lower() else False
92
+
93
+ # Step 2: generate a dense caption
94
+ if should_caption:
95
+ response2 = captioner.chat.completions.create(
96
+ model="gpt-4o-mini",
97
+ messages=[
98
+ {
99
+ "role": "user",
100
+ "content": [
101
+ {
102
+ "type": "text",
103
+ "text": f"""
104
+ Describe the image in detail focusing on the {cat_name}s' actions.
105
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
106
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
107
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
108
+ 4. Do not include actions that need to be guessed or suggested.""",
109
+ },
110
+ {
111
+ "type": "image_url",
112
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
113
+ },
114
+ ],
115
+ }
116
+ ],
117
+ )
118
+
119
+ caption = response2.choices[0].message.content
120
+ else:
121
+ caption = None
122
+
123
+ image_captions[frame_name] = caption
124
+ return image_captions
125
+
126
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
127
+ # Draw the object's bounding box on the image
128
+ video_data = json_data[video_id]
129
+ frame_names = video_data['frame_names']
130
+ video_path = video_data['video_path']
131
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
132
+ frame_indx = frame_names.index(frame_name)
133
+ obj_data = video_data['annotations'][frame_indx][obj_id]
134
+
135
+ bbox = obj_data['bbox']
136
+ cat_name = obj_data['category_name']
137
+ valid = obj_data['valid']
138
+
139
+ if valid == 0:
140
+ print("Object not in this frame!")
141
+ return {}
142
+
143
+
144
+ x_min, y_min, x_max, y_max = bbox
145
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
146
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
147
+ # plt.figure()
148
+ # plt.imshow(I)
149
+ # plt.axis('off')
150
+ # plt.show()
151
+ pil_I = Image.fromarray(I)
152
+ buff = BytesIO()
153
+ pil_I.save(buff, format='JPEG')
154
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
155
+
156
+ # Generate the referring expression
157
+ generator = OpenAI()
158
+ response = generator.chat.completions.create(
159
+ model="gpt-4o-mini",
160
+ messages=[
161
+ {
162
+ "role": "user",
163
+ "content": [
164
+ {
165
+ "type": "text",
166
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
167
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
168
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
169
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
170
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
171
+ 5. Use '{cat_name}' as the noun for the referring expressions.
172
+ Output only the referring expression.
173
+ {caption}""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ ref_exp = response.choices[0].message.content
185
+
186
+ #QA filtering
187
+ # QA1: does the expression describe the intended object?
188
+ filter = OpenAI()
189
+ response1 = filter.chat.completions.create(
190
+ model="gpt-4o-mini",
191
+ messages=[
192
+ {
193
+ "role": "user",
194
+ "content": [
195
+ {
196
+ "type": "text",
197
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
198
+ {ref_exp}""",
199
+ },
200
+ {
201
+ "type": "image_url",
202
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
203
+ },
204
+ ],
205
+ }
206
+ ],
207
+ )
208
+
209
+ response1_content = response1.choices[0].message.content
210
+ describesHighlighted = True if "yes" in response1_content.lower() else False
211
+
212
+ # QA2: does the expression avoid describing unintended objects?
213
+ response2 = filter.chat.completions.create(
214
+ model="gpt-4o-mini",
215
+ messages=[
216
+ {
217
+ "role": "user",
218
+ "content": [
219
+ {
220
+ "type": "text",
221
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
222
+ {ref_exp}""",
223
+ },
224
+ {
225
+ "type": "image_url",
226
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
227
+ },
228
+ ],
229
+ }
230
+ ],
231
+ )
232
+
233
+ response2_content = response2.choices[0].message.content
234
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
235
+
236
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
237
+
238
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
239
+
240
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
241
+
242
+ def createRefExp(video_id, json_data):
243
+ video_data = json_data[video_id]
244
+ obj_ids = list(video_data['annotations'][0].keys())
245
+ frame_names = video_data['frame_names']
246
+
247
+ captions_per_frame = getCaption(video_id, json_data)
248
+
249
+ if captions_per_frame == -1:
250
+ print("There is more than one category")
251
+ return None
252
+
253
+
254
+ video_ref_exps = {}
255
+
256
+ for frame_name in frame_names:
257
+ frame_caption = captions_per_frame[frame_name]
258
+
259
+ if frame_caption == None:
260
+ video_ref_exps[frame_name] = None
261
+
262
+ else:
263
+ frame_ref_exps = {}
264
+ for obj_id in obj_ids:
265
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
266
+ frame_ref_exps[obj_id] = exp_per_obj
267
+ video_ref_exps[frame_name] = frame_ref_exps
268
+
269
+ return video_ref_exps
270
+
271
+ if __name__ == '__main__':
272
+ with open('mbench/sampled_frame3.json', 'r') as file:
273
+ data = json.load(file)
274
+
275
+ videos = set()
276
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
277
+ manual_select = list(file)
278
+ for frame in manual_select:
279
+ result = json.loads(frame)
280
+ videos.add(result['video'])
281
+ videos = list(videos)
282
+
283
+
284
+ all_video_refs = {}
285
+ for i in range(10):
286
+ video_id = videos[i]
287
+ video_ref = createRefExp(video_id, data)
288
+ all_video_refs[video_id] = video_ref
289
+
290
+ json_obj = json.dumps(all_video_refs, indent=4)
291
+ with open('mbench/result.json', 'w') as file:
292
+ file.write(json_obj)
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183735.py ADDED
File without changes
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183916.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ import sys
3
+ from os import path as osp
4
+ from io import BytesIO
5
+
6
+ from mbench.ytvos_ref import build as build_ytvos_ref
7
+ import argparse
8
+ import opts
9
+
10
+ import sys
11
+ from pathlib import Path
12
+ import os
13
+ from os import path as osp
14
+ import skimage
15
+ from io import BytesIO
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+ import regex as re
20
+ import json
21
+
22
+ import cv2
23
+ from PIL import Image, ImageDraw
24
+ import torch
25
+ from torchvision.transforms import functional as F
26
+
27
+ from skimage import measure # (pip install scikit-image)
28
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
29
+
30
+ import matplotlib.pyplot as plt
31
+ import matplotlib.patches as patches
32
+ from matplotlib.collections import PatchCollection
33
+ from matplotlib.patches import Rectangle
34
+ import textwrap
35
+
36
+
37
+ import ipywidgets as widgets
38
+ from IPython.display import display, clear_output
39
+
40
+ from openai import OpenAI
41
+ import base64
42
+
43
+ def number_objects_and_encode(idx, color_mask=False):
44
+ encoded_frames = {}
45
+ contoured_frames = {} # New dictionary for original images
46
+ vid_cat_cnts = {}
47
+
48
+ vid_meta = metas[idx]
49
+ vid_data = train_dataset[idx]
50
+ vid_id = vid_meta['video']
51
+ frame_indx = vid_meta['sample_indx']
52
+ cat_names = set(vid_meta['obj_id_cat'].values())
53
+ imgs = vid_data[0]
54
+
55
+ for cat in cat_names:
56
+ cat_frames = []
57
+ contour_frames = []
58
+ frame_cat_cnts = {}
59
+
60
+ for i in range(imgs.size(0)):
61
+ frame_name = frame_indx[i]
62
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
63
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
64
+
65
+ frame_data = vid_data[2][frame_name]
66
+ obj_ids = list(frame_data.keys())
67
+
68
+ cat_cnt = 0
69
+
70
+ for j in range(len(obj_ids)):
71
+ obj_id = obj_ids[j]
72
+ obj_data = frame_data[obj_id]
73
+ obj_bbox = obj_data['bbox']
74
+ obj_valid = obj_data['valid']
75
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
76
+ obj_cat = obj_data['category_name']
77
+
78
+ if obj_cat == cat and obj_valid:
79
+ cat_cnt += 1
80
+
81
+ if color_mask == False:
82
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
83
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
84
+ for i, contour in enumerate(contours):
85
+ # Compute the contour center
86
+ moments = cv2.moments(contour)
87
+ if moments["m00"] != 0: # check whether the center can be computed
88
+ cx = int(moments["m10"] / moments["m00"])
89
+ cy = int(moments["m01"] / moments["m00"])
90
+ else:
91
+ cx, cy = contour[0][0] # fall back to a contour point when the center cannot be computed
92
+
93
+ # Text background (prepare a black backdrop)
94
+ font = cv2.FONT_HERSHEY_SIMPLEX
95
+ text = obj_id
96
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
97
+ text_w, text_h = text_size
98
+
99
+ # Draw the text background (black rectangle)
100
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
101
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
102
+
103
+ # Draw the text (white)
104
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
105
+ font, 1, (255, 255, 255), 2)
106
+
107
+ else:
108
+ alpha = 0.08
109
+
110
+ colored_obj_mask = np.zeros_like(frame)
111
+ colored_obj_mask[obj_mask == 1] = colors[j]
112
+ frame[obj_mask == 1] = (
113
+ (1 - alpha) * frame[obj_mask == 1]
114
+ + alpha * colored_obj_mask[obj_mask == 1]
115
+ )
116
+
117
+
118
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
119
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
120
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
121
+
122
+
123
+
124
+ if len(contours) > 0:
125
+ largest_contour = max(contours, key=cv2.contourArea)
126
+ M = cv2.moments(largest_contour)
127
+ if M["m00"] != 0:
128
+ center_x = int(M["m10"] / M["m00"])
129
+ center_y = int(M["m01"] / M["m00"])
130
+ else:
131
+ center_x, center_y = 0, 0
132
+
133
+ font = cv2.FONT_HERSHEY_SIMPLEX
134
+ text = obj_id
135
+
136
+ font_scale = 0.9
137
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
138
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
139
+ text_y = center_y
140
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
141
+
142
+ # Compute the coordinates of the text background rectangle
143
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
144
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
145
+ rect_end = (text_x + text_size[0] + 5, text_y)
146
+
147
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
148
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
149
+
150
+ plt.figure(figsize=(12, 8))
151
+ plt.imshow(frame)
152
+ plt.title(f"frame {frame_name}")
153
+ plt.tight_layout()
154
+ plt.axis('off')
155
+ plt.show()
156
+
157
+ buffer = BytesIO()
158
+ frame = Image.fromarray(frame)
159
+ frame.save(buffer, format='jpeg')
160
+ buffer.seek(0)
161
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
162
+ frame_cat_cnts[frame_name] = cat_cnt
163
+
164
+ buffer.seek(0) # Reuse buffer instead of creating a new one
165
+ buffer.truncate()
166
+ frame_for_contour = Image.fromarray(frame_for_contour)
167
+ frame_for_contour.save(buffer, format='jpeg')
168
+ buffer.seek(0)
169
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
170
+
171
+ encoded_frames[cat] = cat_frames
172
+ contoured_frames[cat] = contour_frames
173
+ vid_cat_cnts[cat] = frame_cat_cnts
174
+
175
+ return encoded_frames, vid_cat_cnts, contoured_frames
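+ # Return structure, as built above:
+ # encoded_frames[cat] : per-frame base64 JPEGs with numbered ID overlays for that category
+ # vid_cat_cnts[cat][frame_name] : number of valid objects of that category in the frame
+ # contoured_frames[cat] : the corresponding frames without the numeric ID labels (contours only when color_mask is True)
+ # A minimal usage sketch (assuming `train_dataset`, `metas` and `colors` are defined
+ # at module scope, as in the __main__ block below):
+ # frames_b64, cat_cnts, contours_b64 = number_objects_and_encode(0, color_mask=True)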
176
+
177
+ if __name__ == '__main__':
178
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
179
+ args = parser.parse_args()
180
+
181
+ #================== Load the data ===================
182
+ # Full dataset
183
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
184
+
185
+ # Metadata for the full dataset
186
+ metas = train_dataset.metas
187
+
188
+ # Eight candidate colors (RGB format)
189
+ colors = [
190
+ (255, 0, 0), # Red
191
+ (0, 255, 0), # Green
192
+ (0, 0, 255), # Blue
193
+ (255, 255, 0), # Yellow
194
+ (255, 0, 255), # Magenta
195
+ (0, 255, 255), # Cyan
196
+ (128, 0, 128), # Purple
197
+ (255, 165, 0) # Orange
198
+ ]
199
+
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130185048.py ADDED
@@ -0,0 +1,422 @@
1
+ import os
2
+ import sys
3
+ from os import path as osp
4
+ from io import BytesIO
5
+
6
+ from mbench.ytvos_ref import build as build_ytvos_ref
7
+ import argparse
8
+ import opts
9
+
10
+ import sys
11
+ from pathlib import Path
12
+ import os
13
+ from os import path as osp
14
+ import skimage
15
+ from io import BytesIO
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+ import regex as re
20
+ import json
21
+
22
+ import cv2
23
+ from PIL import Image, ImageDraw
24
+ import torch
25
+ from torchvision.transforms import functional as F
26
+
27
+ from skimage import measure # (pip install scikit-image)
28
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
29
+
30
+ import matplotlib.pyplot as plt
31
+ import matplotlib.patches as patches
32
+ from matplotlib.collections import PatchCollection
33
+ from matplotlib.patches import Rectangle
34
+ import textwrap
35
+
36
+
37
+ import ipywidgets as widgets
38
+ from IPython.display import display, clear_output
39
+
40
+ from openai import OpenAI
41
+ import base64
42
+ import json
43
+
44
+ def number_objects_and_encode(idx, color_mask=False):
45
+ encoded_frames = {}
46
+ contoured_frames = {} # New dictionary for original images
47
+ vid_cat_cnts = {}
48
+
49
+ vid_meta = metas[idx]
50
+ vid_data = train_dataset[idx]
51
+ vid_id = vid_meta['video']
52
+ frame_indx = vid_meta['sample_indx']
53
+ cat_names = set(vid_meta['obj_id_cat'].values())
54
+ imgs = vid_data[0]
55
+
56
+ for cat in cat_names:
57
+ cat_frames = []
58
+ contour_frames = []
59
+ frame_cat_cnts = {}
60
+
61
+ for i in range(imgs.size(0)):
62
+ frame_name = frame_indx[i]
63
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
64
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
65
+
66
+ frame_data = vid_data[2][frame_name]
67
+ obj_ids = list(frame_data.keys())
68
+
69
+ cat_cnt = 0
70
+
71
+ for j in range(len(obj_ids)):
72
+ obj_id = obj_ids[j]
73
+ obj_data = frame_data[obj_id]
74
+ obj_bbox = obj_data['bbox']
75
+ obj_valid = obj_data['valid']
76
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
77
+ obj_cat = obj_data['category_name']
78
+
79
+ if obj_cat == cat and obj_valid:
80
+ cat_cnt += 1
81
+
82
+ if color_mask == False:
83
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
84
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
85
+ for i, contour in enumerate(contours):
86
+ # Compute the contour center
87
+ moments = cv2.moments(contour)
88
+ if moments["m00"] != 0: # check whether the center can be computed
89
+ cx = int(moments["m10"] / moments["m00"])
90
+ cy = int(moments["m01"] / moments["m00"])
91
+ else:
92
+ cx, cy = contour[0][0] # fall back to a contour point when the center cannot be computed
93
+
94
+ # Text background (prepare a black backdrop)
95
+ font = cv2.FONT_HERSHEY_SIMPLEX
96
+ text = obj_id
97
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
98
+ text_w, text_h = text_size
99
+
100
+ # Draw the text background (black rectangle)
101
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
102
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
103
+
104
+ # Draw the text (white)
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
140
+ text_y = center_y
141
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
142
+
143
+ # Compute the coordinates of the text background rectangle
144
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
145
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
146
+ rect_end = (text_x + text_size[0] + 5, text_y)
147
+
148
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
149
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
150
+
151
+ # plt.figure(figsize=(12, 8))
152
+ # plt.imshow(frame)
153
+ # plt.title(f"frame {frame_name}")
154
+ # plt.tight_layout()
155
+ # plt.axis('off')
156
+ # plt.show()
157
+
158
+ buffer = BytesIO()
159
+ frame = Image.fromarray(frame)
160
+ frame.save(buffer, format='jpeg')
161
+ buffer.seek(0)
162
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
163
+ frame_cat_cnts[frame_name] = cat_cnt
164
+
165
+ buffer.seek(0) # Reuse buffer instead of creating a new one
166
+ buffer.truncate()
167
+ frame_for_contour = Image.fromarray(frame_for_contour)
168
+ frame_for_contour.save(buffer, format='jpeg')
169
+ buffer.seek(0)
170
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
171
+
172
+ encoded_frames[cat] = cat_frames
173
+ contoured_frames[cat] = contour_frames
174
+ vid_cat_cnts[cat] = frame_cat_cnts
175
+
176
+ return encoded_frames, vid_cat_cnts, contoured_frames
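+ # Note: getCaption below consumes these encodings; it only queries GPT for a
+ # category and frame when vid_cat_cnts reports at least two valid objects of
+ # that category in the frame (see the frame_cat_cnts >= 2 check further down).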
177
+
178
+
179
+ def getCaption(idx, color_mask=True):
180
+ vid_meta = metas[idx]
181
+ vid_data = train_dataset[idx]
182
+ vid_id = vid_meta['video']
183
+ print(f"vid id: {vid_id}\n")
184
+
185
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
186
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
187
+ all_captions = dict()
188
+
189
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
190
+ marked = "mask with boundary" if color_mask else "boundary"
191
+
192
+ for cat_name in list(cat_names) :
193
+
194
+ is_movable = False
195
+ if cat_name in ytvos_category_valid_list :
196
+ is_movable = True
197
+
198
+ if not is_movable:
199
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
200
+
201
+
202
+ image_captions = {}
203
+ captioner = OpenAI()
204
+ cat_base64_frames = base64_frames[cat_name]
205
+ cont_base64_frames = contoured_frames[cat_name]
206
+
207
+ for i in range(len(cat_base64_frames)):
208
+ frame_name = frame_indx[i]
209
+ cont_base64_image = cont_base64_frames[i]
210
+ base64_image = cat_base64_frames[i]
211
+ should_filter = False
212
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
213
+
214
+ if frame_cat_cnts >= 2:
215
+ should_filter = True
216
+ else:
217
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
218
+
219
+ if is_movable and should_filter:
220
+ # Step 1: filtering
221
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
222
+ caption_filter_text = f"""
223
+ You are a visual assistant analyzing a single frame from a video.
224
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
225
+
226
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
227
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
228
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
229
+
230
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
231
+
232
+ - Respond with "YES" if:
233
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
234
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
235
+ 3) Each action is unambiguously recognizable and distinct.
236
+
237
+ - Respond with "NONE" if:
238
+ 1) The actions or pose are not clearly differentiable or too similar.
239
+ 2) They show no noticeable action beyond standing or minor movements.
240
+
241
+ Answer strictly with either "YES" or "NONE".
242
+ """
243
+
244
+
245
+ response1 = captioner.chat.completions.create(
246
+ model="chatgpt-4o-latest",
247
+ messages=[
248
+ {
249
+ "role": "user",
250
+ "content": [
251
+ {
252
+ "type": "text",
253
+ "text": caption_filter_text,
254
+ },
255
+ {
256
+ "type": "image_url",
257
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
258
+ }
259
+ ],
260
+ }
261
+ ],
262
+ )
263
+ response_content = response1.choices[0].message.content
264
+ should_caption = True if "yes" in response_content.lower() else False
265
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
266
+
267
+ else:
268
+ should_caption = False
269
+
270
+ # Step 2: build dense captions
271
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
272
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
273
+ I want to use your expressions to create a action-centric referring expression dataset.
274
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
275
+
276
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
277
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
278
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
279
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
280
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
281
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
282
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
283
+ 8. Include interactions with objects or other entities when they are prominent and observable.
284
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
285
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
286
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
287
+ 12. Do not mention object IDs.
288
+ 13. Use '{cat_name}' as the noun for the referring expressions.
289
+
290
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
291
+ Output referring expressions for each object id.
292
+ """
293
+
294
+ dense_caption_prompt = f"""
295
+ You are a visual assistant analyzing a single frame of a video.
296
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
297
+ I want to use your expressions to create a action-centric referring expression dataset.
298
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
299
+
300
+ ## Guidelines:
301
+ 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
302
+ 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
303
+ 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
304
+ 4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
305
+ Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
306
+ 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
307
+ 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
308
+ 7. Base your description on the following action definitions:
309
+ - Facial with object manipulation
310
+ - General body movement, body position or pattern
311
+ - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
312
+ - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
313
+
314
+ ## Output Format:
315
+ - For each labeled {cat_name}, output one line in the format:
316
+ ID. action-oriented description
317
+
318
+ Example:
319
+ 1. a bear grasping the edge of a wood with its front paws
320
+ 2. the bear pushing another bear, leaning forward
321
+
322
+ **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
323
+ **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
324
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
325
+ For each labeled {cat_name}, output referring expressions for each object id.
326
+ """
327
+ if should_caption:
328
+ response2 = captioner.chat.completions.create(
329
+ model="chatgpt-4o-latest",
330
+ messages=[
331
+ {
332
+ "role": "user",
333
+ "content": [
334
+ {
335
+ "type": "text",
336
+ "text": dense_caption_prompt,
337
+ },
338
+ {
339
+ "type": "image_url",
340
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
341
+ },
342
+ ],
343
+ }
344
+ ],
345
+ )
346
+
347
+ caption = response2.choices[0].message.content
348
+ #print(f"{image_path} - {frame_name}: {caption}")
349
+ else:
350
+ caption = None
351
+
352
+ image_captions[frame_name] = caption
353
+ all_captions[cat_name] = image_captions
354
+
355
+ # final : also prepare valid object ids
356
+ valid_obj_ids = dict()
357
+
358
+ for cat in cat_names:
359
+ if cat in ytvos_category_valid_list:
360
+ obj_id_cat = vid_meta['obj_id_cat']
361
+ valid_cat_ids = []
362
+ for obj_id in list(obj_id_cat.keys()):
363
+ if obj_id_cat[obj_id] == cat:
364
+ valid_cat_ids.append(obj_id)
365
+ valid_obj_ids[cat] = valid_cat_ids
366
+
367
+ return vid_id, all_captions, valid_obj_ids
368
+
369
+
370
+ if __name__ == '__main__':
371
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
372
+ args = parser.parse_args()
373
+
374
+ #================== Load data ===================
375
+ # Full dataset
376
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
377
+
378
+ # Metadata for the full dataset
379
+ metas = train_dataset.metas
380
+
381
+ # Eight candidate colors (RGB format)
382
+ colors = [
383
+ (255, 0, 0), # Red
384
+ (0, 255, 0), # Green
385
+ (0, 0, 255), # Blue
386
+ (255, 255, 0), # Yellow
387
+ (255, 0, 255), # Magenta
388
+ (0, 255, 255), # Cyan
389
+ (128, 0, 128), # Purple
390
+ (255, 165, 0) # Orange
391
+ ]
392
+
393
+ ytvos_category_valid_list = [
394
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
395
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
396
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
397
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
398
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
399
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
400
+ ]
401
+
402
+ #================== Run GPT ===================
403
+ os.environ['OPENAI_API_KEY'] = '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'
404
+
405
+ result_captions = {}
406
+ result_valid_obj_ids = {}
407
+
408
+ for i in range(370):
409
+ vid_id, all_captions, valid_obj_ids = getCaption(i, True)
410
+
411
+ if vid_id not in result_captions:
412
+ result_captions[vid_id] = all_captions
413
+ if vid_id not in result_valid_obj_ids:
414
+ result_valid_obj_ids[vid_id] = valid_obj_ids
415
+
416
+ print("Finished!", flush=True)
417
+
418
+ with open("mbench/numbered_captions.json", "w") as file:
419
+ json.dump(result_captions, file, indent=4)
420
+
421
+ with open("mbench/numbered_valid_obj_ids.json", "w") as file:
422
+ json.dump(result_valid_obj_ids, file, indent=4)
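The dense-caption prompt above instructs the model to answer one line per object in the form "ID. action-oriented description", but this script stores each reply as a raw string. A minimal sketch of how such a reply could be split into a per-object mapping is shown below; the helper name parse_referring_expressions and its regex are illustrative assumptions, not part of this commit.

import re

def parse_referring_expressions(caption):
    # Turn a reply such as "1. a bear grasping the edge of a wood" into {"1": "a bear grasping the edge of a wood"}.
    expressions = {}
    if caption is None:
        return expressions
    for line in caption.splitlines():
        match = re.match(r"\s*(\d+)\.\s*(.+)", line)
        if match:
            expressions[match.group(1)] = match.group(2).strip()
    return expressions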
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190055.py ADDED
@@ -0,0 +1,428 @@
1
+ import os
2
+ import sys
3
+ from os import path as osp
4
+ from io import BytesIO
5
+
6
+ from mbench.ytvos_ref import build as build_ytvos_ref
7
+ import argparse
8
+ import opts
9
+
10
+ import sys
11
+ from pathlib import Path
12
+ import os
13
+ from os import path as osp
14
+ import skimage
15
+ from io import BytesIO
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+ import regex as re
20
+ import json
21
+
22
+ import cv2
23
+ from PIL import Image, ImageDraw
24
+ import torch
25
+ from torchvision.transforms import functional as F
26
+
27
+ from skimage import measure # (pip install scikit-image)
28
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
29
+
30
+ import matplotlib.pyplot as plt
31
+ import matplotlib.patches as patches
32
+ from matplotlib.collections import PatchCollection
33
+ from matplotlib.patches import Rectangle
34
+ import textwrap
35
+
36
+
37
+ import ipywidgets as widgets
38
+ from IPython.display import display, clear_output
39
+
40
+ from openai import OpenAI
41
+ import base64
42
+ import json
43
+
44
+ def number_objects_and_encode(idx, color_mask=False):
45
+ encoded_frames = {}
46
+ contoured_frames = {} # New dictionary for original images
47
+ vid_cat_cnts = {}
48
+
49
+ vid_meta = metas[idx]
50
+ vid_data = train_dataset[idx]
51
+ vid_id = vid_meta['video']
52
+ frame_indx = vid_meta['sample_indx']
53
+ cat_names = set(vid_meta['obj_id_cat'].values())
54
+ imgs = vid_data[0]
55
+
56
+ for cat in cat_names:
57
+ cat_frames = []
58
+ contour_frames = []
59
+ frame_cat_cnts = {}
60
+
61
+ for i in range(imgs.size(0)):
62
+ frame_name = frame_indx[i]
63
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
64
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
65
+
66
+ frame_data = vid_data[2][frame_name]
67
+ obj_ids = list(frame_data.keys())
68
+
69
+ cat_cnt = 0
70
+
71
+ for j in range(len(obj_ids)):
72
+ obj_id = obj_ids[j]
73
+ obj_data = frame_data[obj_id]
74
+ obj_bbox = obj_data['bbox']
75
+ obj_valid = obj_data['valid']
76
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
77
+ obj_cat = obj_data['category_name']
78
+
79
+ if obj_cat == cat and obj_valid:
80
+ cat_cnt += 1
81
+
82
+ if color_mask == False:
83
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
84
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
85
+ for i, contour in enumerate(contours):
86
+ # Compute the contour center
87
+ moments = cv2.moments(contour)
88
+ if moments["m00"] != 0: # check whether the center can be computed
89
+ cx = int(moments["m10"] / moments["m00"])
90
+ cy = int(moments["m01"] / moments["m00"])
91
+ else:
92
+ cx, cy = contour[0][0] # fall back to a contour point if the center cannot be computed
93
+
94
+ # Text background (create a black backdrop)
95
+ font = cv2.FONT_HERSHEY_SIMPLEX
96
+ text = obj_id
97
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
98
+ text_w, text_h = text_size
99
+
100
+ # Draw the text background (black)
101
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
102
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
103
+
104
+ # Draw the text (white)
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
140
+ text_y = center_y
141
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
142
+
143
+ # Compute the text background rectangle coordinates
144
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
145
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
146
+ rect_end = (text_x + text_size[0] + 5, text_y)
147
+
148
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
149
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
150
+
151
+ # plt.figure(figsize=(12, 8))
152
+ # plt.imshow(frame)
153
+ # plt.title(f"frame {frame_name}")
154
+ # plt.tight_layout()
155
+ # plt.axis('off')
156
+ # plt.show()
157
+
158
+ buffer = BytesIO()
159
+ frame = Image.fromarray(frame)
160
+ frame.save(buffer, format='jpeg')
161
+ buffer.seek(0)
162
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
163
+ frame_cat_cnts[frame_name] = cat_cnt
164
+
165
+ buffer.seek(0) # Reuse buffer instead of creating a new one
166
+ buffer.truncate()
167
+ frame_for_contour = Image.fromarray(frame_for_contour)
168
+ frame_for_contour.save(buffer, format='jpeg')
169
+ buffer.seek(0)
170
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
171
+
172
+ encoded_frames[cat] = cat_frames
173
+ contoured_frames[cat] = contour_frames
174
+ vid_cat_cnts[cat] = frame_cat_cnts
175
+
176
+ return encoded_frames, vid_cat_cnts, contoured_frames
177
+
178
+
179
+ def getCaption(idx, color_mask=True):
180
+ vid_meta = metas[idx]
181
+ vid_data = train_dataset[idx]
182
+ vid_id = vid_meta['video']
183
+ print(f"vid id: {vid_id}\n")
184
+
185
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
186
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
187
+ all_captions = dict()
188
+
189
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
190
+ marked = "mask with boundary" if color_mask else "boundary"
191
+
192
+ for cat_name in list(cat_names) :
193
+
194
+ is_movable = False
195
+ if cat_name in ytvos_category_valid_list :
196
+ is_movable = True
197
+
198
+ if not is_movable:
199
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
200
+
201
+
202
+ image_captions = {}
203
+ captioner = OpenAI()
204
+ cat_base64_frames = base64_frames[cat_name]
205
+ cont_base64_frames = contoured_frames[cat_name]
206
+
207
+ for i in range(len(cat_base64_frames)):
208
+ frame_name = frame_indx[i]
209
+ cont_base64_image = cont_base64_frames[i]
210
+ base64_image = cat_base64_frames[i]
211
+ should_filter = False
212
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
213
+
214
+ if frame_cat_cnts >= 2:
215
+ should_filter = True
216
+ else:
217
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
218
+
219
+ if is_movable and should_filter:
220
+ # Step 1: filtering
221
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
222
+ caption_filter_text = f"""
223
+ You are a visual assistant analyzing a single frame from a video.
224
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
225
+
226
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
227
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
228
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
229
+
230
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
231
+
232
+ - Respond with "YES" if:
233
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
234
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
235
+ 3) Each action is unambiguously recognizable and distinct.
236
+
237
+ - Respond with "NONE" if:
238
+ 1) The actions or pose are not clearly differentiable or too similar.
239
+ 2) They show no noticeable action beyond standing or minor movements.
240
+
241
+ Answer strictly with either "YES" or "NONE".
242
+ """
243
+
244
+
245
+ response1 = captioner.chat.completions.create(
246
+ model="chatgpt-4o-latest",
247
+ messages=[
248
+ {
249
+ "role": "user",
250
+ "content": [
251
+ {
252
+ "type": "text",
253
+ "text": caption_filter_text,
254
+ },
255
+ {
256
+ "type": "image_url",
257
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
258
+ }
259
+ ],
260
+ }
261
+ ],
262
+ )
263
+ response_content = response1.choices[0].message.content
264
+ should_caption = True if "yes" in response_content.lower() else False
265
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
266
+
267
+ else:
268
+ should_caption = False
269
+
270
+ # Step 2: build dense captions
271
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
272
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
273
+ I want to use your expressions to create a action-centric referring expression dataset.
274
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
275
+
276
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
277
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
278
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
279
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
280
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
281
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
282
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
283
+ 8. Include interactions with objects or other entities when they are prominent and observable.
284
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
285
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
286
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
287
+ 12. Do not mention object IDs.
288
+ 13. Use '{cat_name}' as the noun for the referring expressions.
289
+
290
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
291
+ Output referring expressions for each object id.
292
+ """
293
+
294
+ dense_caption_prompt = f"""
295
+ You are a visual assistant analyzing a single frame of a video.
296
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
297
+ I want to use your expressions to create a action-centric referring expression dataset.
298
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
299
+
300
+ ## Guidelines:
301
+ 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
302
+ 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
303
+ 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
304
+ 4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
305
+ Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
306
+ 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
307
+ 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
308
+ 7. Base your description on the following action definitions:
309
+ - Facial with object manipulation
310
+ - General body movement, body position or pattern
311
+ - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
312
+ - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
313
+
314
+ ## Output Format:
315
+ - For each labeled {cat_name}, output one line in the format:
316
+ ID. action-oriented description
317
+
318
+ Example:
319
+ 1. a bear grasping the edge of a wood with its front paws
320
+ 2. the bear pushing another bear, leaning forward
321
+
322
+ **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
323
+ **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
324
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
325
+ For each labeled {cat_name}, output referring expressions for each object id.
326
+ """
327
+ if should_caption:
328
+ response2 = captioner.chat.completions.create(
329
+ model="chatgpt-4o-latest",
330
+ messages=[
331
+ {
332
+ "role": "user",
333
+ "content": [
334
+ {
335
+ "type": "text",
336
+ "text": dense_caption_prompt,
337
+ },
338
+ {
339
+ "type": "image_url",
340
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
341
+ },
342
+ ],
343
+ }
344
+ ],
345
+ )
346
+
347
+ caption = response2.choices[0].message.content
348
+ #print(f"{image_path} - {frame_name}: {caption}")
349
+ else:
350
+ caption = None
351
+
352
+ image_captions[frame_name] = caption
353
+ all_captions[cat_name] = image_captions
354
+
355
+ # final : also prepare valid object ids
356
+ valid_obj_ids = dict()
357
+
358
+ for cat in cat_names:
359
+ if cat in ytvos_category_valid_list:
360
+ obj_id_cat = vid_meta['obj_id_cat']
361
+ valid_cat_ids = []
362
+ for obj_id in list(obj_id_cat.keys()):
363
+ if obj_id_cat[obj_id] == cat:
364
+ valid_cat_ids.append(obj_id)
365
+ valid_obj_ids[cat] = valid_cat_ids
366
+
367
+ return vid_id, all_captions, valid_obj_ids
368
+
369
+
370
+ if __name__ == '__main__':
371
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
372
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
373
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
374
+
375
+ args = parser.parse_args()
376
+
377
+ print(args.save_caption_path, flush=True)
378
+ print(args.save_valid_obj_ids_path, flush=True)
379
+
380
+ #================== Load data ===================
381
+ # Full dataset
382
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
383
+
384
+ # Metadata for the full dataset
385
+ metas = train_dataset.metas
386
+
387
+ # Eight candidate colors (RGB format)
388
+ colors = [
389
+ (255, 0, 0), # Red
390
+ (0, 255, 0), # Green
391
+ (0, 0, 255), # Blue
392
+ (255, 255, 0), # Yellow
393
+ (255, 0, 255), # Magenta
394
+ (0, 255, 255), # Cyan
395
+ (128, 0, 128), # Purple
396
+ (255, 165, 0) # Orange
397
+ ]
398
+
399
+ ytvos_category_valid_list = [
400
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
401
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
402
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
403
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
404
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
405
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
406
+ ]
407
+
408
+ #================== Run GPT ===================
409
+ os.environ['OPENAI_API_KEY'] = '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'
410
+
411
+ result_captions = {}
412
+ result_valid_obj_ids = {}
413
+
414
+ for i in range(370):
415
+ vid_id, all_captions, valid_obj_ids = getCaption(i, True)
416
+
417
+ if vid_id not in result_captions:
418
+ result_captions[vid_id] = all_captions
419
+ if vid_id not in result_valid_obj_ids:
420
+ result_valid_obj_ids[vid_id] = valid_obj_ids
421
+
422
+ print("Finished!", flush=True)
423
+
424
+ with open(args.save_caption_path, "w") as file:
425
+ json.dump(result_captions, file, indent=4)
426
+
427
+ with open(args.save_valid_obj_ids_path, "w") as file:
428
+ json.dump(result_valid_obj_ids, file, indent=4)
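Both snapshots so far loop over 370 videos and only write their JSON outputs after the last iteration, so one failed API call late in the run discards everything collected before it. Below is a hedged sketch of incremental, atomic checkpointing that could be called once per video; the file path and helper name are assumptions for illustration only.

import json
import os

def save_checkpoint(data, path="mbench/numbered_captions_partial.json"):
    # Write to a temporary file first, then atomically replace the target,
    # so an interrupted run never leaves a truncated JSON file behind.
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(data, f, indent=4)
    os.replace(tmp_path, path)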
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190447.py ADDED
@@ -0,0 +1,430 @@
1
+ import os
2
+ print(os.getcwd()) # print the current working directory
3
+
4
+ import sys
5
+ from os import path as osp
6
+ from io import BytesIO
7
+
8
+ from mbench.ytvos_ref import build as build_ytvos_ref
9
+ import argparse
10
+ import opts
11
+
12
+ import sys
13
+ from pathlib import Path
14
+ import os
15
+ from os import path as osp
16
+ import skimage
17
+ from io import BytesIO
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import regex as re
22
+ import json
23
+
24
+ import cv2
25
+ from PIL import Image, ImageDraw
26
+ import torch
27
+ from torchvision.transforms import functional as F
28
+
29
+ from skimage import measure # (pip install scikit-image)
30
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
31
+
32
+ import matplotlib.pyplot as plt
33
+ import matplotlib.patches as patches
34
+ from matplotlib.collections import PatchCollection
35
+ from matplotlib.patches import Rectangle
36
+ import textwrap
37
+
38
+
39
+ import ipywidgets as widgets
40
+ from IPython.display import display, clear_output
41
+
42
+ from openai import OpenAI
43
+ import base64
44
+ import json
45
+
46
+ def number_objects_and_encode(idx, color_mask=False):
47
+ encoded_frames = {}
48
+ contoured_frames = {} # New dictionary for original images
49
+ vid_cat_cnts = {}
50
+
51
+ vid_meta = metas[idx]
52
+ vid_data = train_dataset[idx]
53
+ vid_id = vid_meta['video']
54
+ frame_indx = vid_meta['sample_indx']
55
+ cat_names = set(vid_meta['obj_id_cat'].values())
56
+ imgs = vid_data[0]
57
+
58
+ for cat in cat_names:
59
+ cat_frames = []
60
+ contour_frames = []
61
+ frame_cat_cnts = {}
62
+
63
+ for i in range(imgs.size(0)):
64
+ frame_name = frame_indx[i]
65
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
66
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+
68
+ frame_data = vid_data[2][frame_name]
69
+ obj_ids = list(frame_data.keys())
70
+
71
+ cat_cnt = 0
72
+
73
+ for j in range(len(obj_ids)):
74
+ obj_id = obj_ids[j]
75
+ obj_data = frame_data[obj_id]
76
+ obj_bbox = obj_data['bbox']
77
+ obj_valid = obj_data['valid']
78
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
79
+ obj_cat = obj_data['category_name']
80
+
81
+ if obj_cat == cat and obj_valid:
82
+ cat_cnt += 1
83
+
84
+ if color_mask == False:
85
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
86
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
87
+ for i, contour in enumerate(contours):
88
+ # 윤곽선 중심 계산
89
+ moments = cv2.moments(contour)
90
+ if moments["m00"] != 0: # 중심 계산 가능 여부 확인
91
+ cx = int(moments["m10"] / moments["m00"])
92
+ cy = int(moments["m01"] / moments["m00"])
93
+ else:
94
+ cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
95
+
96
+ # 텍스트 배경 (검은색 배경 만들기)
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ # 텍스트 배경 그리기 (검은색 배경)
103
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
104
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
105
+
106
+ # 텍스트 그리기 (흰색 텍스트)
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+
126
+
127
+ if len(contours) > 0:
128
+ largest_contour = max(contours, key=cv2.contourArea)
129
+ M = cv2.moments(largest_contour)
130
+ if M["m00"] != 0:
131
+ center_x = int(M["m10"] / M["m00"])
132
+ center_y = int(M["m01"] / M["m00"])
133
+ else:
134
+ center_x, center_y = 0, 0
135
+
136
+ font = cv2.FONT_HERSHEY_SIMPLEX
137
+ text = obj_id
138
+
139
+ font_scale = 0.9
140
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
141
+ text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
142
+ text_y = center_y
143
+ # text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
144
+
145
+ # 텍스트 배경 사각형 좌표 계산
146
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
147
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
148
+ rect_end = (text_x + text_size[0] + 5, text_y)
149
+
150
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
151
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
152
+
153
+ # plt.figure(figsize=(12, 8))
154
+ # plt.imshow(frame)
155
+ # plt.title(f"frame {frame_name}")
156
+ # plt.tight_layout()
157
+ # plt.axis('off')
158
+ # plt.show()
159
+
160
+ buffer = BytesIO()
161
+ frame = Image.fromarray(frame)
162
+ frame.save(buffer, format='jpeg')
163
+ buffer.seek(0)
164
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
165
+ frame_cat_cnts[frame_name] = cat_cnt
166
+
167
+ buffer.seek(0) # Reuse buffer instead of creating a new one
168
+ buffer.truncate()
169
+ frame_for_contour = Image.fromarray(frame_for_contour)
170
+ frame_for_contour.save(buffer, format='jpeg')
171
+ buffer.seek(0)
172
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
173
+
174
+ encoded_frames[cat] = cat_frames
175
+ contoured_frames[cat] = contour_frames
176
+ vid_cat_cnts[cat] = frame_cat_cnts
177
+
178
+ return encoded_frames, vid_cat_cnts, contoured_frames
179
+
180
+
181
+ def getCaption(idx, color_mask=True):
182
+ vid_meta = metas[idx]
183
+ vid_data = train_dataset[idx]
184
+ vid_id = vid_meta['video']
185
+ print(f"vid id: {vid_id}\n")
186
+
187
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
188
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
189
+ all_captions = dict()
190
+
191
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
192
+ marked = "mask with boundary" if color_mask else "boundary"
193
+
194
+ for cat_name in list(cat_names) :
195
+
196
+ is_movable = False
197
+ if cat_name in ytvos_category_valid_list :
198
+ is_movable = True
199
+
200
+ if not is_movable:
201
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
202
+
203
+
204
+ image_captions = {}
205
+ captioner = OpenAI()
206
+ cat_base64_frames = base64_frames[cat_name]
207
+ cont_base64_frames = contoured_frames[cat_name]
208
+
209
+ for i in range(len(cat_base64_frames)):
210
+ frame_name = frame_indx[i]
211
+ cont_base64_image = cont_base64_frames[i]
212
+ base64_image = cat_base64_frames[i]
213
+ should_filter = False
214
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
215
+
216
+ if frame_cat_cnts >= 2:
217
+ should_filter = True
218
+ else:
219
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
220
+
221
+ if is_movable and should_filter:
222
+ #1단계: 필터링
223
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
224
+ caption_filter_text = f"""
225
+ You are a visual assistant analyzing a single frame from a video.
226
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
227
+
228
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
229
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
230
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
231
+
232
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
233
+
234
+ - Respond with "YES" if:
235
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
236
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
237
+ 3) Each action is unambiguously recognizable and distinct.
238
+
239
+ - Respond with "NONE" if:
240
+ 1) The actions or pose are not clearly differentiable or too similar.
241
+ 2) They show no noticeable action beyond standing or minor movements.
242
+
243
+ Answer strictly with either "YES" or "NONE".
244
+ """
245
+
246
+
247
+ response1 = captioner.chat.completions.create(
248
+ model="chatgpt-4o-latest",
249
+ messages=[
250
+ {
251
+ "role": "user",
252
+ "content": [
253
+ {
254
+ "type": "text",
255
+ "text": caption_filter_text,
256
+ },
257
+ {
258
+ "type": "image_url",
259
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
260
+ }
261
+ ],
262
+ }
263
+ ],
264
+ )
265
+ response_content = response1.choices[0].message.content
266
+ should_caption = True if "yes" in response_content.lower() else False
267
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
268
+
269
+ else:
270
+ should_caption = False
271
+
272
+ #2단계: dense caption 만들기
273
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
274
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
275
+ I want to use your expressions to create a action-centric referring expression dataset.
276
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
277
+
278
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
279
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
280
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
281
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
282
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
283
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
284
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
285
+ 8. Include interactions with objects or other entities when they are prominent and observable.
286
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
287
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
288
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
289
+ 12. Do not mention object IDs.
290
+ 13. Use '{cat_name}' as the noun for the referring expressions.
291
+
292
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
293
+ Output referring expressions for each object id.
294
+ """
295
+
296
+ dense_caption_prompt = f"""
297
+ You are a visual assistant analyzing a single frame of a video.
298
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
299
+ I want to use your expressions to create a action-centric referring expression dataset.
300
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
301
+
302
+ ## Guidelines:
303
+ 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
304
+ 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
305
+ 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
306
+ 4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
307
+ Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
308
+ 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
309
+ 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
310
+ 7. Base your description on the following action definitions:
311
+ - Facial with object manipulation
312
+ - General body movement, body position or pattern
313
+ - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
314
+ - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
315
+
316
+ ## Output Format:
317
+ - For each labeled {cat_name}, output one line in the format:
318
+ ID. action-oriented description
319
+
320
+ Example:
321
+ 1. a bear grasping the edge of a wood with its front paws
322
+ 2. the bear pushing another bear, leaning forward
323
+
324
+ **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
325
+ **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
326
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
327
+ For each labeled {cat_name}, output referring expressions for each object id.
328
+ """
329
+ if should_caption:
330
+ response2 = captioner.chat.completions.create(
331
+ model="chatgpt-4o-latest",
332
+ messages=[
333
+ {
334
+ "role": "user",
335
+ "content": [
336
+ {
337
+ "type": "text",
338
+ "text": dense_caption_prompt,
339
+ },
340
+ {
341
+ "type": "image_url",
342
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
343
+ },
344
+ ],
345
+ }
346
+ ],
347
+ )
348
+
349
+ caption = response2.choices[0].message.content
350
+ #print(f"{image_path} - {frame_name}: {caption}")
351
+ else:
352
+ caption = None
353
+
354
+ image_captions[frame_name] = caption
355
+ all_captions[cat_name] = image_captions
356
+
357
+ # final : also prepare valid object ids
358
+ valid_obj_ids = dict()
359
+
360
+ for cat in cat_names:
361
+ if cat in ytvos_category_valid_list:
362
+ obj_id_cat = vid_meta['obj_id_cat']
363
+ valid_cat_ids = []
364
+ for obj_id in list(obj_id_cat.keys()):
365
+ if obj_id_cat[obj_id] == cat:
366
+ valid_cat_ids.append(obj_id)
367
+ valid_obj_ids[cat] = valid_cat_ids
368
+
369
+ return vid_id, all_captions, valid_obj_ids
370
+
371
+
372
+ if __name__ == '__main__':
373
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
374
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
375
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
376
+
377
+ args = parser.parse_args()
378
+
379
+ print(args.save_caption_path, flush=True)
380
+ print(args.save_valid_obj_ids_path, flush=True)
381
+
382
+ #==================데이터 불러오기===================
383
+ # 전체 데이터셋
384
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
385
+
386
+ # 전체 데이터셋 메타데이터
387
+ metas = train_dataset.metas
388
+
389
+ # 색상 후보 8개 (RGB 형식)
390
+ colors = [
391
+ (255, 0, 0), # Red
392
+ (0, 255, 0), # Green
393
+ (0, 0, 255), # Blue
394
+ (255, 255, 0), # Yellow
395
+ (255, 0, 255), # Magenta
396
+ (0, 255, 255), # Cyan
397
+ (128, 0, 128), # Purple
398
+ (255, 165, 0) # Orange
399
+ ]
400
+
401
+ ytvos_category_valid_list = [
402
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
403
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
404
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
405
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
406
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
407
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
408
+ ]
409
+
410
+ #==================gpt 돌리기===================
411
+ os.environ['OPENAI_API_KEY'] = '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'
412
+
413
+ result_captions = {}
414
+ result_valid_obj_ids = {}
415
+
416
+ for i in range(370):
417
+ vid_id, all_captions, valid_obj_ids = getCaption(i, True)
418
+
419
+ if vid_id not in result_captions:
420
+ result_captions[vid_id] = all_captions
421
+ if vid_id not in result_valid_obj_ids:
422
+ result_valid_obj_ids[vid_id] = valid_obj_ids
423
+
424
+ print("Finished!", flush=True)
425
+
426
+ with open(args.save_caption_path, "w") as file:
427
+ json.dump(result_captions, file, indent=4)
428
+
429
+ with open(args.save_valid_obj_ids_path, "w") as file:
430
+ json.dump(result_valid_obj_ids, file, indent=4)
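getCaption calls captioner.chat.completions.create up to twice per frame with no error handling, so a single transient API error aborts the whole run. A minimal retry wrapper sketch is shown below; the retry count and backoff values are arbitrary assumptions, and the wrapper simply forwards the same keyword arguments the snapshots already use.

import time

def create_with_retry(client, retries=3, delay=5.0, **kwargs):
    # Retry the chat completion request with a simple linear backoff.
    for attempt in range(retries):
        try:
            return client.chat.completions.create(**kwargs)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay * (attempt + 1))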
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190713.py ADDED
@@ -0,0 +1,430 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from os import path as osp
6
+ from io import BytesIO
7
+
8
+ from mbench.ytvos_ref import build as build_ytvos_ref
9
+ import argparse
10
+ import opts
11
+
12
+ import sys
13
+ from pathlib import Path
14
+ import os
15
+ from os import path as osp
16
+ import skimage
17
+ from io import BytesIO
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import regex as re
22
+ import json
23
+
24
+ import cv2
25
+ from PIL import Image, ImageDraw
26
+ import torch
27
+ from torchvision.transforms import functional as F
28
+
29
+ from skimage import measure # (pip install scikit-image)
30
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
31
+
32
+ import matplotlib.pyplot as plt
33
+ import matplotlib.patches as patches
34
+ from matplotlib.collections import PatchCollection
35
+ from matplotlib.patches import Rectangle
36
+ import textwrap
37
+
38
+
39
+ import ipywidgets as widgets
40
+ from IPython.display import display, clear_output
41
+
42
+ from openai import OpenAI
43
+ import base64
44
+ import json
45
+
46
+ def number_objects_and_encode(idx, color_mask=False):
47
+ encoded_frames = {}
48
+ contoured_frames = {} # New dictionary for original images
49
+ vid_cat_cnts = {}
50
+
51
+ vid_meta = metas[idx]
52
+ vid_data = train_dataset[idx]
53
+ vid_id = vid_meta['video']
54
+ frame_indx = vid_meta['sample_indx']
55
+ cat_names = set(vid_meta['obj_id_cat'].values())
56
+ imgs = vid_data[0]
57
+
58
+ for cat in cat_names:
59
+ cat_frames = []
60
+ contour_frames = []
61
+ frame_cat_cnts = {}
62
+
63
+ for i in range(imgs.size(0)):
64
+ frame_name = frame_indx[i]
65
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
66
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+
68
+ frame_data = vid_data[2][frame_name]
69
+ obj_ids = list(frame_data.keys())
70
+
71
+ cat_cnt = 0
72
+
73
+ for j in range(len(obj_ids)):
74
+ obj_id = obj_ids[j]
75
+ obj_data = frame_data[obj_id]
76
+ obj_bbox = obj_data['bbox']
77
+ obj_valid = obj_data['valid']
78
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
79
+ obj_cat = obj_data['category_name']
80
+
81
+ if obj_cat == cat and obj_valid:
82
+ cat_cnt += 1
83
+
84
+ if color_mask == False:
85
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
86
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
87
+ for i, contour in enumerate(contours):
88
+ # 윤곽선 중심 계산
89
+ moments = cv2.moments(contour)
90
+ if moments["m00"] != 0: # 중심 계산 가능 여부 확인
91
+ cx = int(moments["m10"] / moments["m00"])
92
+ cy = int(moments["m01"] / moments["m00"])
93
+ else:
94
+ cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
95
+
96
+ # 텍스트 배경 (검은색 배경 만들기)
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ # 텍스트 배경 그리기 (검은색 배경)
103
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
104
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
105
+
106
+ # 텍스트 그리기 (흰색 텍스트)
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+
126
+
127
+ if len(contours) > 0:
128
+ largest_contour = max(contours, key=cv2.contourArea)
129
+ M = cv2.moments(largest_contour)
130
+ if M["m00"] != 0:
131
+ center_x = int(M["m10"] / M["m00"])
132
+ center_y = int(M["m01"] / M["m00"])
133
+ else:
134
+ center_x, center_y = 0, 0
135
+
136
+ font = cv2.FONT_HERSHEY_SIMPLEX
137
+ text = obj_id
138
+
139
+ font_scale = 0.9
140
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
141
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
142
+ text_y = center_y
143
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
144
+
145
+ # Compute the text background rectangle coordinates
146
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
147
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
148
+ rect_end = (text_x + text_size[0] + 5, text_y)
149
+
150
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
151
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
152
+
153
+ # plt.figure(figsize=(12, 8))
154
+ # plt.imshow(frame)
155
+ # plt.title(f"frame {frame_name}")
156
+ # plt.tight_layout()
157
+ # plt.axis('off')
158
+ # plt.show()
159
+
160
+ buffer = BytesIO()
161
+ frame = Image.fromarray(frame)
162
+ frame.save(buffer, format='jpeg')
163
+ buffer.seek(0)
164
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
165
+ frame_cat_cnts[frame_name] = cat_cnt
166
+
167
+ buffer.seek(0) # Reuse buffer instead of creating a new one
168
+ buffer.truncate()
169
+ frame_for_contour = Image.fromarray(frame_for_contour)
170
+ frame_for_contour.save(buffer, format='jpeg')
171
+ buffer.seek(0)
172
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
173
+
174
+ encoded_frames[cat] = cat_frames
175
+ contoured_frames[cat] = contour_frames
176
+ vid_cat_cnts[cat] = frame_cat_cnts
177
+
178
+ return encoded_frames, vid_cat_cnts, contoured_frames
179
+
180
+
181
+ def getCaption(idx, color_mask=True):
182
+ vid_meta = metas[idx]
183
+ vid_data = train_dataset[idx]
184
+ vid_id = vid_meta['video']
185
+ print(f"vid id: {vid_id}\n")
186
+
187
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
188
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
189
+ all_captions = dict()
190
+
191
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
192
+ marked = "mask with boundary" if color_mask else "boundary"
193
+
194
+ for cat_name in list(cat_names) :
195
+
196
+ is_movable = False
197
+ if cat_name in ytvos_category_valid_list :
198
+ is_movable = True
199
+
200
+ if not is_movable:
201
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
202
+
203
+
204
+ image_captions = {}
205
+ captioner = OpenAI()
206
+ cat_base64_frames = base64_frames[cat_name]
207
+ cont_base64_frames = contoured_frames[cat_name]
208
+
209
+ for i in range(len(cat_base64_frames)):
210
+ frame_name = frame_indx[i]
211
+ cont_base64_image = cont_base64_frames[i]
212
+ base64_image = cat_base64_frames[i]
213
+ should_filter = False
214
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
215
+
216
+ if frame_cat_cnts >= 2:
217
+ should_filter = True
218
+ else:
219
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
220
+
221
+ if is_movable and should_filter:
222
+ # Step 1: filtering
223
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
224
+ caption_filter_text = f"""
225
+ You are a visual assistant analyzing a single frame from a video.
226
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
227
+
228
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
229
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
230
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
231
+
232
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
233
+
234
+ - Respond with "YES" if:
235
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
236
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
237
+ 3) Each action is unambiguously recognizable and distinct.
238
+
239
+ - Respond with "NONE" if:
240
+ 1) The actions or pose are not clearly differentiable or too similar.
241
+ 2) They show no noticeable action beyond standing or minor movements.
242
+
243
+ Answer strictly with either "YES" or "NONE".
244
+ """
245
+
246
+
247
+ response1 = captioner.chat.completions.create(
248
+ model="chatgpt-4o-latest",
249
+ messages=[
250
+ {
251
+ "role": "user",
252
+ "content": [
253
+ {
254
+ "type": "text",
255
+ "text": caption_filter_text,
256
+ },
257
+ {
258
+ "type": "image_url",
259
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
260
+ }
261
+ ],
262
+ }
263
+ ],
264
+ )
265
+ response_content = response1.choices[0].message.content
266
+ should_caption = True if "yes" in response_content.lower() else False
267
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
268
+
269
+ else:
270
+ should_caption = False
271
+
272
+ # Step 2: build dense captions
273
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
274
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
275
+ I want to use your expressions to create an action-centric referring expression dataset.
+ Therefore, your expressions for these {cat_name}s should describe the unique action of each object.
277
+
278
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
279
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
280
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
281
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
282
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
283
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
284
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
285
+ 8. Include interactions with objects or other entities when they are prominent and observable.
286
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
287
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
288
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
289
+ 12. Do not mention object IDs.
290
+ 13. Use '{cat_name}' as the noun for the referring expressions.
291
+
292
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
293
+ Output referring expressions for each object id.
294
+ """
295
+
296
+ dense_caption_prompt = f"""
297
+ You are a visual assistant analyzing a single frame of a video.
298
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
299
+ I want to use your expressions to create an action-centric referring expression dataset.
300
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
301
+
302
+ ## Guidelines:
303
+ 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
304
+ 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
305
+ 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
306
+ 4. Do not use vague expressions like "interacting with something" or "engaging with another object."
307
+ Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
308
+ 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
309
+ 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
310
+ 7. Base your description on the following action definitions:
311
+ - Facial with object manipulation
312
+ - General body movement, body position or pattern
313
+ - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
314
+ - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
315
+
316
+ ## Output Format:
317
+ - For each labeled {cat_name}, output one line in the format:
318
+ ID. action-oriented description
319
+
320
+ Example:
321
+ 1. a bear grasping the edge of a wood with its front paws
322
+ 2. the bear pushing another bear, leaning forward
323
+
324
+ **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
325
+ **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
326
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
327
+ For each labeled {cat_name}, output referring expressions for each object id.
328
+ """
329
+ if should_caption:
330
+ response2 = captioner.chat.completions.create(
331
+ model="chatgpt-4o-latest",
332
+ messages=[
333
+ {
334
+ "role": "user",
335
+ "content": [
336
+ {
337
+ "type": "text",
338
+ "text": dense_caption_prompt,
339
+ },
340
+ {
341
+ "type": "image_url",
342
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
343
+ },
344
+ ],
345
+ }
346
+ ],
347
+ )
348
+
349
+ caption = response2.choices[0].message.content
350
+ #print(f"{image_path} - {frame_name}: {caption}")
351
+ else:
352
+ caption = None
353
+
354
+ image_captions[frame_name] = caption
355
+ all_captions[cat_name] = image_captions
356
+
357
+ # final : also prepare valid object ids
358
+ valid_obj_ids = dict()
359
+
360
+ for cat in cat_names:
361
+ if cat in ytvos_category_valid_list:
362
+ obj_id_cat = vid_meta['obj_id_cat']
363
+ valid_cat_ids = []
364
+ for obj_id in list(obj_id_cat.keys()):
365
+ if obj_id_cat[obj_id] == cat:
366
+ valid_cat_ids.append(obj_id)
367
+ valid_obj_ids[cat] = valid_cat_ids
368
+
369
+ return vid_id, all_captions, valid_obj_ids
370
+
371
+
372
+ if __name__ == '__main__':
373
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
374
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
375
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
376
+
377
+ args = parser.parse_args()
378
+
379
+ print(args.save_caption_path, flush=True)
380
+ print(args.save_valid_obj_ids_path, flush=True)
381
+
382
+ #================== load the data ===================
+ # full dataset
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+
+ # metadata for the full dataset
+ metas = train_dataset.metas
+
+ # 8 candidate colors (RGB)
390
+ colors = [
391
+ (255, 0, 0), # Red
392
+ (0, 255, 0), # Green
393
+ (0, 0, 255), # Blue
394
+ (255, 255, 0), # Yellow
395
+ (255, 0, 255), # Magenta
396
+ (0, 255, 255), # Cyan
397
+ (128, 0, 128), # Purple
398
+ (255, 165, 0) # Orange
399
+ ]
400
+
401
+ ytvos_category_valid_list = [
402
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
403
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
404
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
405
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
406
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
407
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
408
+ ]
409
+
410
+ #================== run gpt ===================
411
+ os.environ['OPENAI_API_KEY'] = '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'
412
+
413
+ result_captions = {}
414
+ result_valid_obj_ids = {}
415
+
416
+ for i in range(370):
417
+ vid_id, all_captions, valid_obj_ids = getCaption(i, True)
418
+
419
+ if vid_id not in result_captions:
420
+ result_captions[vid_id] = all_captions
421
+ if vid_id not in result_valid_obj_ids:
422
+ result_valid_obj_ids[vid_id] = valid_obj_ids
423
+
424
+ print("Finished!", flush=True)
425
+
426
+ with open(args.save_caption_path, "w") as file:
427
+ json.dump(result_captions, file, indent=4)
428
+
429
+ with open(args.save_valid_obj_ids_path, "w") as file:
430
+ json.dump(result_valid_obj_ids, file, indent=4)
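A quick way to sanity-check the two JSON files written above: result_captions is nested as video id -> category -> frame index -> caption (null when a frame was skipped), and result_valid_obj_ids as video id -> category -> list of object ids. The short loader below is only an illustrative sketch, not part of the commit, and assumes the script's default --save_caption_path / --save_valid_obj_ids_path values:

import json

with open("mbench/numbered_captions.json") as f:
    captions = json.load(f)        # {vid_id: {category: {frame_idx: caption or None}}}
with open("mbench/numbered_valid_obj_ids.json") as f:
    valid_ids = json.load(f)       # {vid_id: {category: [obj_id, ...]}}

for vid_id, per_cat in captions.items():
    for cat, per_frame in per_cat.items():
        kept = {fn: c for fn, c in per_frame.items() if c is not None}
        print(vid_id, cat,
              f"{len(kept)}/{len(per_frame)} frames captioned,",
              "valid obj ids:", valid_ids.get(vid_id, {}).get(cat, []))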
.history/mbench/gpt_ref-ytvos_numbered_cy_20250131124156.py ADDED
@@ -0,0 +1,427 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from os import path as osp
6
+ from io import BytesIO
7
+
8
+ from mbench.ytvos_ref import build as build_ytvos_ref
9
+ import argparse
10
+ import opts
11
+
12
+ import sys
13
+ from pathlib import Path
14
+ import os
15
+ from os import path as osp
16
+ import skimage
17
+ from io import BytesIO
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import regex as re
22
+ import json
23
+
24
+ import cv2
25
+ from PIL import Image, ImageDraw
26
+ import torch
27
+ from torchvision.transforms import functional as F
28
+
29
+ from skimage import measure # (pip install scikit-image)
30
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
31
+
32
+ import matplotlib.pyplot as plt
33
+ import matplotlib.patches as patches
34
+ from matplotlib.collections import PatchCollection
35
+ from matplotlib.patches import Rectangle
36
+ import textwrap
37
+
38
+
39
+ import ipywidgets as widgets
40
+ from IPython.display import display, clear_output
41
+
42
+ from openai import OpenAI
43
+ import base64
44
+ import json
45
+
46
+ def number_objects_and_encode(idx, color_mask=False):
47
+ encoded_frames = {}
48
+ contoured_frames = {} # New dictionary for original images
49
+ vid_cat_cnts = {}
50
+
51
+ vid_meta = metas[idx]
52
+ vid_data = train_dataset[idx]
53
+ vid_id = vid_meta['video']
54
+ frame_indx = vid_meta['sample_indx']
55
+ cat_names = set(vid_meta['obj_id_cat'].values())
56
+ imgs = vid_data[0]
57
+
58
+ for cat in cat_names:
59
+ cat_frames = []
60
+ contour_frames = []
61
+ frame_cat_cnts = {}
62
+
63
+ for i in range(imgs.size(0)):
64
+ frame_name = frame_indx[i]
65
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
66
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+
68
+ frame_data = vid_data[2][frame_name]
69
+ obj_ids = list(frame_data.keys())
70
+
71
+ cat_cnt = 0
72
+
73
+ for j in range(len(obj_ids)):
74
+ obj_id = obj_ids[j]
75
+ obj_data = frame_data[obj_id]
76
+ obj_bbox = obj_data['bbox']
77
+ obj_valid = obj_data['valid']
78
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
79
+ obj_cat = obj_data['category_name']
80
+
81
+ if obj_cat == cat and obj_valid:
82
+ cat_cnt += 1
83
+
84
+ if color_mask == False:
85
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
86
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
87
+ for contour in contours:
+ # compute the contour center
+ moments = cv2.moments(contour)
+ if moments["m00"] != 0: # check whether the center can be computed
+ cx = int(moments["m10"] / moments["m00"])
+ cy = int(moments["m01"] / moments["m00"])
+ else:
+ cx, cy = contour[0][0] # use a fallback coordinate when the center cannot be computed
+
+ # text background (make a black background)
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ text = obj_id
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
+ text_w, text_h = text_size
+
+ # draw the text background (black)
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+ # draw the text (white)
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+
126
+
127
+ if len(contours) > 0:
128
+ largest_contour = max(contours, key=cv2.contourArea)
129
+ M = cv2.moments(largest_contour)
130
+ if M["m00"] != 0:
131
+ center_x = int(M["m10"] / M["m00"])
132
+ center_y = int(M["m01"] / M["m00"])
133
+ else:
134
+ center_x, center_y = 0, 0
135
+
136
+ font = cv2.FONT_HERSHEY_SIMPLEX
137
+ text = obj_id
138
+
139
+ font_scale = 0.9
140
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
141
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
+ text_y = center_y
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
+
+ # compute the coordinates of the text background rectangle
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left of the background rectangle
147
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
148
+ rect_end = (text_x + text_size[0] + 5, text_y)
149
+
150
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
151
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
152
+
153
+ # plt.figure(figsize=(12, 8))
154
+ # plt.imshow(frame)
155
+ # plt.title(f"frame {frame_name}")
156
+ # plt.tight_layout()
157
+ # plt.axis('off')
158
+ # plt.show()
159
+
160
+ buffer = BytesIO()
161
+ frame = Image.fromarray(frame)
162
+ frame.save(buffer, format='jpeg')
163
+ buffer.seek(0)
164
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
165
+ frame_cat_cnts[frame_name] = cat_cnt
166
+
167
+ buffer.seek(0) # Reuse buffer instead of creating a new one
168
+ buffer.truncate()
169
+ frame_for_contour = Image.fromarray(frame_for_contour)
170
+ frame_for_contour.save(buffer, format='jpeg')
171
+ buffer.seek(0)
172
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
173
+
174
+ encoded_frames[cat] = cat_frames
175
+ contoured_frames[cat] = contour_frames
176
+ vid_cat_cnts[cat] = frame_cat_cnts
177
+
178
+ return encoded_frames, vid_cat_cnts, contoured_frames
179
+
180
+
181
+ def getCaption(idx, color_mask=True):
182
+ vid_meta = metas[idx]
183
+ vid_data = train_dataset[idx]
184
+ vid_id = vid_meta['video']
185
+ print(f"vid id: {vid_id}\n")
186
+
187
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
188
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
189
+ all_captions = dict()
190
+
191
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
192
+ marked = "mask with boundary" if color_mask else "boundary"
193
+
194
+ for cat_name in list(cat_names) :
195
+
196
+ is_movable = False
197
+ if cat_name in ytvos_category_valid_list :
198
+ is_movable = True
199
+
200
+ if not is_movable:
201
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
202
+
203
+
204
+ image_captions = {}
205
+ captioner = OpenAI()
206
+ cat_base64_frames = base64_frames[cat_name]
207
+ cont_base64_frames = contoured_frames[cat_name]
208
+
209
+ for i in range(len(cat_base64_frames)):
210
+ frame_name = frame_indx[i]
211
+ cont_base64_image = cont_base64_frames[i]
212
+ base64_image = cat_base64_frames[i]
213
+ should_filter = False
214
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
215
+
216
+ if frame_cat_cnts >= 2:
217
+ should_filter = True
218
+ else:
219
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
220
+
221
+ if is_movable and should_filter:
222
+ # Step 1: filtering
223
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
224
+ caption_filter_text = f"""
225
+ You are a visual assistant analyzing a single frame from a video.
226
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
227
+
228
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
229
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
230
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
231
+
232
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
233
+
234
+ - Respond with "YES" if:
235
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
236
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
237
+ 3) Each action is unambiguously recognizable and distinct.
238
+
239
+ - Respond with "NONE" if:
240
+ 1) The actions or pose are not clearly differentiable or too similar.
241
+ 2) They show no noticeable action beyond standing or minor movements.
242
+
243
+ Answer strictly with either "YES" or "NONE".
244
+ """
245
+
246
+
247
+ response1 = captioner.chat.completions.create(
248
+ model="chatgpt-4o-latest",
249
+ messages=[
250
+ {
251
+ "role": "user",
252
+ "content": [
253
+ {
254
+ "type": "text",
255
+ "text": caption_filter_text,
256
+ },
257
+ {
258
+ "type": "image_url",
259
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
260
+ }
261
+ ],
262
+ }
263
+ ],
264
+ )
265
+ response_content = response1.choices[0].message.content
266
+ should_caption = True if "yes" in response_content.lower() else False
267
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
268
+
269
+ else:
270
+ should_caption = False
271
+
272
+ # Step 2: build the dense captions
273
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
274
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
275
+ I want to use your expressions to create an action-centric referring expression dataset.
+ Therefore, your expressions for these {cat_name}s should describe the unique action of each object.
277
+
278
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
279
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
280
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
281
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
282
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
283
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
284
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
285
+ 8. Include interactions with objects or other entities when they are prominent and observable.
286
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
287
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
288
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
289
+ 12. Do not mention object IDs.
290
+ 13. Use '{cat_name}' as the noun for the referring expressions.
291
+
292
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
293
+ Output referring expressions for each object id.
294
+ """
295
+
296
+ dense_caption_prompt = f"""
297
+ You are a visual assistant analyzing a single frame of a video.
298
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
299
+ I want to use your expressions to create an action-centric referring expression dataset.
300
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
301
+
302
+ ## Guidelines:
303
+ 1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
304
+ 2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
305
+ 3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
306
+ 4. Do not use vague expressions like "interacting with something" or "engaging with another object."
307
+ Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
308
+ 5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
309
+ 6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
310
+ 7. Base your description on the following action definitions:
311
+ - Facial with object manipulation
312
+ - General body movement, body position or pattern
313
+ - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
314
+ - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
315
+
316
+ ## Output Format:
317
+ - For each labeled {cat_name}, output one line in the format:
318
+ ID. action-oriented description
319
+
320
+ Example:
321
+ 1. a bear grasping the edge of a wood with its front paws
322
+ 2. the bear pushing another bear, leaning forward
323
+
324
+ **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
325
+ **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
326
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
327
+ For each labeled {cat_name}, output referring expressions for each object id.
328
+ """
329
+ if should_caption:
330
+ response2 = captioner.chat.completions.create(
331
+ model="chatgpt-4o-latest",
332
+ messages=[
333
+ {
334
+ "role": "user",
335
+ "content": [
336
+ {
337
+ "type": "text",
338
+ "text": dense_caption_prompt,
339
+ },
340
+ {
341
+ "type": "image_url",
342
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
343
+ },
344
+ ],
345
+ }
346
+ ],
347
+ )
348
+
349
+ caption = response2.choices[0].message.content
350
+ #print(f"{image_path} - {frame_name}: {caption}")
351
+ else:
352
+ caption = None
353
+
354
+ image_captions[frame_name] = caption
355
+ all_captions[cat_name] = image_captions
356
+
357
+ # final : also prepare valid object ids
358
+ valid_obj_ids = dict()
359
+
360
+ for cat in cat_names:
361
+ if cat in ytvos_category_valid_list:
362
+ obj_id_cat = vid_meta['obj_id_cat']
363
+ valid_cat_ids = []
364
+ for obj_id in list(obj_id_cat.keys()):
365
+ if obj_id_cat[obj_id] == cat:
366
+ valid_cat_ids.append(obj_id)
367
+ valid_obj_ids[cat] = valid_cat_ids
368
+
369
+ return vid_id, all_captions, valid_obj_ids
370
+
371
+
372
+ if __name__ == '__main__':
373
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
374
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
375
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
376
+
377
+ args = parser.parse_args()
378
+
379
+ #================== load the data ===================
+ # full dataset
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+
+ # metadata for the full dataset
+ metas = train_dataset.metas
+
+ # 8 candidate colors (RGB)
387
+ colors = [
388
+ (255, 0, 0), # Red
389
+ (0, 255, 0), # Green
390
+ (0, 0, 255), # Blue
391
+ (255, 255, 0), # Yellow
392
+ (255, 0, 255), # Magenta
393
+ (0, 255, 255), # Cyan
394
+ (128, 0, 128), # Purple
395
+ (255, 165, 0) # Orange
396
+ ]
397
+
398
+ ytvos_category_valid_list = [
399
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
400
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
401
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
402
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
403
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
404
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
405
+ ]
406
+
407
+ #================== run gpt ===================
408
+ os.environ['OPENAI_API_KEY'] = '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'
409
+
410
+ result_captions = {}
411
+ result_valid_obj_ids = {}
412
+
413
+ for i in range(370):
414
+ vid_id, all_captions, valid_obj_ids = getCaption(i, True)
415
+
416
+ if vid_id not in result_captions:
417
+ result_captions[vid_id] = all_captions
418
+ if vid_id not in result_valid_obj_ids:
419
+ result_valid_obj_ids[vid_id] = valid_obj_ids
420
+
421
+ print("Finished!", flush=True)
422
+
423
+ with open(args.save_caption_path, "w") as file:
424
+ json.dump(result_captions, file, indent=4)
425
+
426
+ with open(args.save_valid_obj_ids_path, "w") as file:
427
+ json.dump(result_valid_obj_ids, file, indent=4)
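For reference, number_objects_and_encode above returns base64-encoded JPEG strings (one per sampled frame and category), produced via PIL and a BytesIO buffer. The sketch below shows how one of those strings could be decoded back into an image for visual inspection; the helper name decode_b64_jpeg and the example keys are illustrative, not part of the commit:

import base64
from io import BytesIO
from PIL import Image

def decode_b64_jpeg(b64_string):
    # invert the BytesIO / base64 encoding used in number_objects_and_encode
    return Image.open(BytesIO(base64.b64decode(b64_string)))

# illustrative usage (the category keys depend on the video):
# encoded_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(0, color_mask=True)
# decode_b64_jpeg(encoded_frames["person"][0]).save("check_person_frame0.jpg")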
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140343.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from os import path as osp
6
+ from io import BytesIO
7
+
8
+ from mbench.ytvos_ref import build as build_ytvos_ref
9
+ import argparse
10
+ import opts
11
+
12
+ import sys
13
+ from pathlib import Path
14
+ import os
15
+ from os import path as osp
16
+ import skimage
17
+ from io import BytesIO
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import regex as re
22
+ import json
23
+
24
+ import cv2
25
+ from PIL import Image, ImageDraw
26
+ import torch
27
+ from torchvision.transforms import functional as F
28
+
29
+ from skimage import measure # (pip install scikit-image)
30
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
31
+
32
+ import matplotlib.pyplot as plt
33
+ import matplotlib.patches as patches
34
+ from matplotlib.collections import PatchCollection
35
+ from matplotlib.patches import Rectangle
36
+ import textwrap
37
+
38
+
39
+ import ipywidgets as widgets
40
+ from IPython.display import display, clear_output
41
+
42
+ from openai import OpenAI
43
+ import base64
44
+ import json
+ import time # used by the retry loop in getCaption (time.sleep)
45
+
46
+ def number_objects_and_encode(idx, color_mask=False):
47
+ encoded_frames = {}
48
+ contoured_frames = {} # New dictionary for original images
49
+ vid_cat_cnts = {}
50
+
51
+ vid_meta = metas[idx]
52
+ vid_data = train_dataset[idx]
53
+ vid_id = vid_meta['video']
54
+ frame_indx = vid_meta['sample_indx']
55
+ cat_names = set(vid_meta['obj_id_cat'].values())
56
+ imgs = vid_data[0]
57
+
58
+ for cat in cat_names:
59
+ cat_frames = []
60
+ contour_frames = []
61
+ frame_cat_cnts = {}
62
+
63
+ for i in range(imgs.size(0)):
64
+ frame_name = frame_indx[i]
65
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
66
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+
68
+ frame_data = vid_data[2][frame_name]
69
+ obj_ids = list(frame_data.keys())
70
+
71
+ cat_cnt = 0
72
+
73
+ for j in range(len(obj_ids)):
74
+ obj_id = obj_ids[j]
75
+ obj_data = frame_data[obj_id]
76
+ obj_bbox = obj_data['bbox']
77
+ obj_valid = obj_data['valid']
78
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
79
+ obj_cat = obj_data['category_name']
80
+
81
+ if obj_cat == cat and obj_valid:
82
+ cat_cnt += 1
83
+
84
+ if color_mask == False:
85
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
86
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
87
+ for contour in contours:
+ # compute the contour center
+ moments = cv2.moments(contour)
+ if moments["m00"] != 0: # check whether the center can be computed
+ cx = int(moments["m10"] / moments["m00"])
+ cy = int(moments["m01"] / moments["m00"])
+ else:
+ cx, cy = contour[0][0] # use a fallback coordinate when the center cannot be computed
+
+ # text background (make a black background)
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ text = obj_id
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
+ text_w, text_h = text_size
+
+ # draw the text background (black)
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+ # draw the text (white)
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+
126
+
127
+ if len(contours) > 0:
128
+ largest_contour = max(contours, key=cv2.contourArea)
129
+ M = cv2.moments(largest_contour)
130
+ if M["m00"] != 0:
131
+ center_x = int(M["m10"] / M["m00"])
132
+ center_y = int(M["m01"] / M["m00"])
133
+ else:
134
+ center_x, center_y = 0, 0
135
+
136
+ font = cv2.FONT_HERSHEY_SIMPLEX
137
+ text = obj_id
138
+
139
+ font_scale = 0.9
140
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
141
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
+ text_y = center_y
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
+
+ # compute the coordinates of the text background rectangle
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left of the background rectangle
147
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
148
+ rect_end = (text_x + text_size[0] + 5, text_y)
149
+
150
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
151
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
152
+
153
+ # plt.figure(figsize=(12, 8))
154
+ # plt.imshow(frame)
155
+ # plt.title(f"frame {frame_name}")
156
+ # plt.tight_layout()
157
+ # plt.axis('off')
158
+ # plt.show()
159
+
160
+ buffer = BytesIO()
161
+ frame = Image.fromarray(frame)
162
+ frame.save(buffer, format='jpeg')
163
+ buffer.seek(0)
164
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
165
+ frame_cat_cnts[frame_name] = cat_cnt
166
+
167
+ buffer.seek(0) # Reuse buffer instead of creating a new one
168
+ buffer.truncate()
169
+ frame_for_contour = Image.fromarray(frame_for_contour)
170
+ frame_for_contour.save(buffer, format='jpeg')
171
+ buffer.seek(0)
172
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
173
+
174
+ encoded_frames[cat] = cat_frames
175
+ contoured_frames[cat] = contour_frames
176
+ vid_cat_cnts[cat] = frame_cat_cnts
177
+
178
+ return encoded_frames, vid_cat_cnts, contoured_frames
179
+
180
+
181
+ def getCaption(idx, model='gpt-4o-mini', color_mask=True):
182
+ vid_meta = metas[idx]
183
+ vid_data = train_dataset[idx]
184
+ vid_id = vid_meta['video']
185
+ print(f"vid id: {vid_id}\n")
186
+
187
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
188
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
189
+ all_captions = dict()
190
+
191
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
192
+ marked = "mask with boundary" if color_mask else "boundary"
193
+
194
+ for cat_name in list(cat_names) :
195
+
196
+ is_movable = False
197
+ if cat_name in ytvos_category_valid_list :
198
+ is_movable = True
199
+
200
+ if not is_movable:
201
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
202
+
203
+
204
+ image_captions = {}
205
+ captioner = OpenAI()
206
+ cat_base64_frames = base64_frames[cat_name]
207
+ cont_base64_frames = contoured_frames[cat_name]
208
+
209
+ for i in range(len(cat_base64_frames)):
210
+ frame_name = frame_indx[i]
211
+ cont_base64_image = cont_base64_frames[i]
212
+ base64_image = cat_base64_frames[i]
213
+ should_filter = False
214
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
215
+
216
+ if frame_cat_cnts >= 2:
217
+ should_filter = True
218
+ else:
219
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
220
+
221
+ if is_movable and should_filter:
222
+ #1단계: 필터링
223
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
224
+ caption_filter_text = f"""
225
+ You are a visual assistant analyzing a single frame from a video.
226
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
227
+
228
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
229
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
230
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
231
+
232
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
233
+
234
+ - Respond with "YES" if:
235
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
236
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
237
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
238
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
239
+
240
+ - Respond with "NONE" if:
241
+ 1) The actions or pose are not clearly differentiable or too similar.
242
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
243
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
244
+
245
+ Answer strictly with either "YES" or "NONE".
246
+ """
247
+
248
+ response1 = captioner.chat.completions.create(
249
+ # model="chatgpt-4o-latest",
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = True if "yes" in response_content.lower() else False
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ #2단계: dense caption 만들기
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create a action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
316
+ - details such as motion and intention, facial with object manipulation
317
+ - movements with objects or other entities when they are prominent and observable. expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
323
+ object id. using {cat_name} as subject noun, action-oriented description
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return all_captions, valid_obj_ids
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #==================데이터 불러오기===================
413
+ # 전체 데이터셋
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # 전체 데이터셋 메타데이터
417
+ metas = train_dataset.metas
418
+
419
+ # 색상 후보 8개 (RGB 형식)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #==================gpt 돌리기===================
441
+ os.environ['OPENAI_API_KEY'] = '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'
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(370):
+ # getCaption in this revision returns (all_captions, valid_obj_ids) and takes model as its second argument
+ vid_id = metas[i]['video']
+ all_captions, valid_obj_ids = getCaption(i, color_mask=True)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
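The main change in this revision is the retry loop around the dense-caption request: a reply is accepted only if it starts with "1." and is not an apology or refusal, otherwise the request is re-sent up to MAX_RETRIES times. The same logic can be factored into a standalone helper; the sketch below is illustrative only (the helper name and signature are not part of the commit), and it assumes the same OpenAI client and data-URL image format used above:

import time
from openai import OpenAI

def request_caption_with_retry(client, model, prompt, base64_image, max_retries=2):
    """Re-ask the model until the reply looks like a numbered list and is not a refusal."""
    for attempt in range(max_retries):
        resp = client.chat.completions.create(
            model=model,
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            }],
        )
        caption = resp.choices[0].message.content.strip()
        lowered = caption.lower()
        if lowered.startswith("1.") and not any(
            p in lowered for p in ("i'm sorry", "please", "can't help")
        ):
            return caption
        print(f"Retrying caption generation... ({attempt + 1}/{max_retries})")
        time.sleep(2)
    return None  # give up after max_retries unusable replies

# illustrative call:
# caption = request_caption_with_retry(OpenAI(), "gpt-4o-mini", dense_caption_prompt, base64_image)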
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140413.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from os import path as osp
6
+ from io import BytesIO
7
+
8
+ from mbench.ytvos_ref import build as build_ytvos_ref
9
+ import argparse
10
+ import opts
11
+
12
+ import sys
13
+ from pathlib import Path
14
+ import os
15
+ from os import path as osp
16
+ import skimage
17
+ from io import BytesIO
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+ import regex as re
22
+ import json
23
+
24
+ import cv2
25
+ from PIL import Image, ImageDraw
26
+ import torch
27
+ from torchvision.transforms import functional as F
28
+
29
+ from skimage import measure # (pip install scikit-image)
30
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
31
+
32
+ import matplotlib.pyplot as plt
33
+ import matplotlib.patches as patches
34
+ from matplotlib.collections import PatchCollection
35
+ from matplotlib.patches import Rectangle
36
+ import textwrap
37
+
38
+
39
+ import ipywidgets as widgets
40
+ from IPython.display import display, clear_output
41
+
42
+ from openai import OpenAI
43
+ import base64
44
+ import json
45
+
46
+ def number_objects_and_encode(idx, color_mask=False):
47
+ encoded_frames = {}
48
+ contoured_frames = {} # New dictionary for original images
49
+ vid_cat_cnts = {}
50
+
51
+ vid_meta = metas[idx]
52
+ vid_data = train_dataset[idx]
53
+ vid_id = vid_meta['video']
54
+ frame_indx = vid_meta['sample_indx']
55
+ cat_names = set(vid_meta['obj_id_cat'].values())
56
+ imgs = vid_data[0]
57
+
58
+ for cat in cat_names:
59
+ cat_frames = []
60
+ contour_frames = []
61
+ frame_cat_cnts = {}
62
+
63
+ for i in range(imgs.size(0)):
64
+ frame_name = frame_indx[i]
65
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
66
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+
68
+ frame_data = vid_data[2][frame_name]
69
+ obj_ids = list(frame_data.keys())
70
+
71
+ cat_cnt = 0
72
+
73
+ for j in range(len(obj_ids)):
74
+ obj_id = obj_ids[j]
75
+ obj_data = frame_data[obj_id]
76
+ obj_bbox = obj_data['bbox']
77
+ obj_valid = obj_data['valid']
78
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
79
+ obj_cat = obj_data['category_name']
80
+
81
+ if obj_cat == cat and obj_valid:
82
+ cat_cnt += 1
83
+
84
+ if color_mask == False:
85
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
86
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
87
+ for contour in contours:
+ # compute the contour center
+ moments = cv2.moments(contour)
+ if moments["m00"] != 0: # check whether the center can be computed
+ cx = int(moments["m10"] / moments["m00"])
+ cy = int(moments["m01"] / moments["m00"])
+ else:
+ cx, cy = contour[0][0] # use a fallback coordinate when the center cannot be computed
+
+ # text background (make a black background)
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ text = obj_id
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
+ text_w, text_h = text_size
+
+ # draw the text background (black)
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+ # draw the text (white)
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+
126
+
127
+ if len(contours) > 0:
128
+ largest_contour = max(contours, key=cv2.contourArea)
129
+ M = cv2.moments(largest_contour)
130
+ if M["m00"] != 0:
131
+ center_x = int(M["m10"] / M["m00"])
132
+ center_y = int(M["m01"] / M["m00"])
133
+ else:
134
+ center_x, center_y = 0, 0
135
+
136
+ font = cv2.FONT_HERSHEY_SIMPLEX
137
+ text = obj_id
138
+
139
+ font_scale = 0.9
140
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
141
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
+ text_y = center_y
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
+
+ # compute the coordinates of the text background rectangle
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left of the background rectangle
147
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
148
+ rect_end = (text_x + text_size[0] + 5, text_y)
149
+
150
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
151
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
152
+
153
+ # plt.figure(figsize=(12, 8))
154
+ # plt.imshow(frame)
155
+ # plt.title(f"frame {frame_name}")
156
+ # plt.tight_layout()
157
+ # plt.axis('off')
158
+ # plt.show()
159
+
160
+ buffer = BytesIO()
161
+ frame = Image.fromarray(frame)
162
+ frame.save(buffer, format='jpeg')
163
+ buffer.seek(0)
164
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
165
+ frame_cat_cnts[frame_name] = cat_cnt
166
+
167
+ buffer.seek(0) # Reuse buffer instead of creating a new one
168
+ buffer.truncate()
169
+ frame_for_contour = Image.fromarray(frame_for_contour)
170
+ frame_for_contour.save(buffer, format='jpeg')
171
+ buffer.seek(0)
172
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
173
+
174
+ encoded_frames[cat] = cat_frames
175
+ contoured_frames[cat] = contour_frames
176
+ vid_cat_cnts[cat] = frame_cat_cnts
177
+
178
+ return encoded_frames, vid_cat_cnts, contoured_frames
179
+
180
+
181
+ def getCaption(idx, model='gpt-4o', color_mask=True):
182
+ vid_meta = metas[idx]
183
+ vid_data = train_dataset[idx]
184
+ vid_id = vid_meta['video']
185
+ print(f"vid id: {vid_id}\n")
186
+
187
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
188
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
189
+ all_captions = dict()
190
+
191
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
192
+ marked = "mask with boundary" if color_mask else "boundary"
193
+
194
+ for cat_name in list(cat_names) :
195
+
196
+ is_movable = False
197
+ if cat_name in ytvos_category_valid_list :
198
+ is_movable = True
199
+
200
+ if not is_movable:
201
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
202
+
203
+
204
+ image_captions = {}
205
+ captioner = OpenAI()
206
+ cat_base64_frames = base64_frames[cat_name]
207
+ cont_base64_frames = contoured_frames[cat_name]
208
+
209
+ for i in range(len(cat_base64_frames)):
210
+ frame_name = frame_indx[i]
211
+ cont_base64_image = cont_base64_frames[i]
212
+ base64_image = cat_base64_frames[i]
213
+ should_filter = False
214
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
215
+
216
+ if frame_cat_cnts >= 2:
217
+ should_filter = True
218
+ else:
219
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
220
+
221
+ if is_movable and should_filter:
222
+ #1단계: 필터링
223
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
224
+ caption_filter_text = f"""
225
+ You are a visual assistant analyzing a single frame from a video.
226
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
227
+
228
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
229
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
230
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
231
+
232
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
233
+
234
+ - Respond with "YES" if:
235
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
236
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
237
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
238
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
239
+
240
+ - Respond with "NONE" if:
241
+ 1) The actions or pose are not clearly differentiable or too similar.
242
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
243
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
244
+
245
+ Answer strictly with either "YES" or "NONE".
246
+ """
247
+
248
+ response1 = captioner.chat.completions.create(
249
+ # model="chatgpt-4o-latest",
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = True if "yes" in response_content.lower() else False
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ # Step 2: build the dense captions
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create a action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
316
+ - details such as motion and intention, facial with object manipulation
317
+ - movements with objects or other entities when they are prominent and observable. expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
323
+ object id. using {cat_name} as subject noun, action-oriented description
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids # also return the video id; the caller unpacks three values
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #==================Load data===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Metadata for the full dataset
417
+ metas = train_dataset.metas
418
+
419
+ # 8 candidate colors (RGB format)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #==================Run GPT===================
441
+ os.environ['OPENAI_API_KEY'] = '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'
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(370):
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i, True)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
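The main loop above saves result_captions as {video_id: {category: {frame_index: caption}}}, where each non-empty caption is expected to be a numbered list with one expression per object ID (the format requested by the dense-caption prompt). Below is a minimal sketch of how that JSON could be parsed downstream; the output path and helper name are assumptions, not part of this commit:

# Hedged sketch (not part of the committed scripts): parse the numbered caption
# strings saved by the script into per-object expressions.
# The JSON layout and the "1. ..." line format follow the prompts above.
import json
import re

def split_caption(caption):
    """Split a numbered caption block into {object_id: expression}."""
    expressions = {}
    if not caption:
        return expressions
    for line in caption.splitlines():
        match = re.match(r"\s*(\d+)\.\s*(.+)", line)
        if match:
            expressions[match.group(1)] = match.group(2).strip()
    return expressions

with open("mbench/numbered_captions.json") as f:  # path is an assumption
    captions = json.load(f)

for vid_id, per_cat in captions.items():
    for cat, per_frame in per_cat.items():
        for frame_idx, caption in per_frame.items():
            print(vid_id, cat, frame_idx, split_caption(caption))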
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141847.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+
9
+ from mbench.ytvos_ref import build as build_ytvos_ref
10
+ import argparse
11
+ import opts
12
+
13
+ import sys
14
+ from pathlib import Path
15
+ import os
16
+ from os import path as osp
17
+ import skimage
18
+ from io import BytesIO
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import regex as re
23
+ import json
24
+
25
+ import cv2
26
+ from PIL import Image, ImageDraw
27
+ import torch
28
+ from torchvision.transforms import functional as F
29
+
30
+ from skimage import measure # (pip install scikit-image)
31
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
32
+
33
+ import matplotlib.pyplot as plt
34
+ import matplotlib.patches as patches
35
+ from matplotlib.collections import PatchCollection
36
+ from matplotlib.patches import Rectangle
37
+ import textwrap
38
+
39
+
40
+ import ipywidgets as widgets
41
+ from IPython.display import display, clear_output
42
+
43
+ from openai import OpenAI
44
+ import base64
45
+ import json
46
+
47
+ def number_objects_and_encode(idx, color_mask=False):
48
+ encoded_frames = {}
49
+ contoured_frames = {} # New dictionary for original images
50
+ vid_cat_cnts = {}
51
+
52
+ vid_meta = metas[idx]
53
+ vid_data = train_dataset[idx]
54
+ vid_id = vid_meta['video']
55
+ frame_indx = vid_meta['sample_indx']
56
+ cat_names = set(vid_meta['obj_id_cat'].values())
57
+ imgs = vid_data[0]
58
+
59
+ for cat in cat_names:
60
+ cat_frames = []
61
+ contour_frames = []
62
+ frame_cat_cnts = {}
63
+
64
+ for i in range(imgs.size(0)):
65
+ frame_name = frame_indx[i]
66
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+
69
+ frame_data = vid_data[2][frame_name]
70
+ obj_ids = list(frame_data.keys())
71
+
72
+ cat_cnt = 0
73
+
74
+ for j in range(len(obj_ids)):
75
+ obj_id = obj_ids[j]
76
+ obj_data = frame_data[obj_id]
77
+ obj_bbox = obj_data['bbox']
78
+ obj_valid = obj_data['valid']
79
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
80
+ obj_cat = obj_data['category_name']
81
+
82
+ if obj_cat == cat and obj_valid:
83
+ cat_cnt += 1
84
+
85
+ if color_mask == False:
86
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
87
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
88
+ for contour in contours: # index unused; avoids shadowing the outer loop variable i
89
+ # Compute the contour center
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0: # Check whether the center can be computed
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0] # Fall back to a contour point when the center cannot be computed
96
+
97
+ # Text background (create a black box behind the label)
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ # Draw the text background (black)
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ # Draw the text (white)
108
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
109
+ font, 1, (255, 255, 255), 2)
110
+
111
+ else:
112
+ alpha = 0.08
113
+
114
+ colored_obj_mask = np.zeros_like(frame)
115
+ colored_obj_mask[obj_mask == 1] = colors[j]
116
+ frame[obj_mask == 1] = (
117
+ (1 - alpha) * frame[obj_mask == 1]
118
+ + alpha * colored_obj_mask[obj_mask == 1]
119
+ )
120
+
121
+
122
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
123
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
124
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
125
+
126
+
127
+
128
+ if len(contours) > 0:
129
+ largest_contour = max(contours, key=cv2.contourArea)
130
+ M = cv2.moments(largest_contour)
131
+ if M["m00"] != 0:
132
+ center_x = int(M["m10"] / M["m00"])
133
+ center_y = int(M["m01"] / M["m00"])
134
+ else:
135
+ center_x, center_y = 0, 0
136
+
137
+ font = cv2.FONT_HERSHEY_SIMPLEX
138
+ text = obj_id
139
+
140
+ font_scale = 0.9
141
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
142
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
143
+ text_y = center_y
144
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
145
+
146
+ # Compute the text background rectangle coordinates
147
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
148
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
149
+ rect_end = (text_x + text_size[0] + 5, text_y)
150
+
151
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
152
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
153
+
154
+ # plt.figure(figsize=(12, 8))
155
+ # plt.imshow(frame)
156
+ # plt.title(f"frame {frame_name}")
157
+ # plt.tight_layout()
158
+ # plt.axis('off')
159
+ # plt.show()
160
+
161
+ buffer = BytesIO()
162
+ frame = Image.fromarray(frame)
163
+ frame.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+ frame_cat_cnts[frame_name] = cat_cnt
167
+
168
+ buffer.seek(0) # Reuse buffer instead of creating a new one
169
+ buffer.truncate()
170
+ frame_for_contour = Image.fromarray(frame_for_contour)
171
+ frame_for_contour.save(buffer, format='jpeg')
172
+ buffer.seek(0)
173
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
174
+
175
+ encoded_frames[cat] = cat_frames
176
+ contoured_frames[cat] = contour_frames
177
+ vid_cat_cnts[cat] = frame_cat_cnts
178
+
179
+ return encoded_frames, vid_cat_cnts, contoured_frames
180
+
181
+
182
+ def getCaption(idx, model='gpt-4o', color_mask=True):
183
+ vid_meta = metas[idx]
184
+ vid_data = train_dataset[idx]
185
+ vid_id = vid_meta['video']
186
+ print(f"vid id: {vid_id}\n")
187
+
188
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
189
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
190
+ all_captions = dict()
191
+
192
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
193
+ #marked = "mask with boundary" if color_mask else "boundary"
194
+
195
+ for cat_name in list(cat_names) :
196
+
197
+ is_movable = False
198
+ if cat_name in ytvos_category_valid_list :
199
+ is_movable = True
200
+
201
+ if not is_movable:
202
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
203
+
204
+
205
+ image_captions = {}
206
+ captioner = OpenAI()
207
+ cat_base64_frames = base64_frames[cat_name]
208
+ cont_base64_frames = contoured_frames[cat_name]
209
+
210
+ for i in range(len(cat_base64_frames)):
211
+ frame_name = frame_indx[i]
212
+ cont_base64_image = cont_base64_frames[i]
213
+ base64_image = cat_base64_frames[i]
214
+ should_filter = False
215
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
216
+
217
+ if frame_cat_cnts >= 2:
218
+ should_filter = True
219
+ else:
220
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
221
+
222
+ if is_movable and should_filter:
223
+ # Step 1: filtering
224
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
225
+ caption_filter_text = f"""
226
+ You are a visual assistant analyzing a single frame from a video.
227
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
228
+
229
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
230
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
231
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
232
+
233
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
234
+
235
+ - Respond with "YES" if:
236
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
237
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
238
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
239
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
240
+
241
+ - Respond with "NONE" if:
242
+ 1) The actions or pose are not clearly differentiable or too similar.
243
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
244
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
245
+
246
+ Answer strictly with either "YES" or "NONE".
247
+ """
248
+
249
+ response1 = captioner.chat.completions.create(
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = True if "yes" in response_content.lower() else False
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ # Step 2: create dense captions
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create an action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
316
+ - details such as motion and intention, facial with object manipulation
317
+ - movements with objects or other entities when they are prominent and observable. expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
323
+ object id. using {cat_name} as subject noun, action-oriented description
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids # also return the video id; the caller unpacks three values
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #==================Load data===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Metadata for the full dataset
417
+ metas = train_dataset.metas
418
+
419
+ # 8 candidate colors (RGB format)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #==================Run GPT===================
441
+ os.environ['OPENAI_API_KEY'] = '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'
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(370):
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
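The retry loop in this script only accepts a reply that starts with "1." and contains no refusal phrases. A stricter, purely illustrative check would also require one numbered line per labeled object, using the frame_cat_cnts value already computed inside getCaption; the helper below is hypothetical and not used by the committed code:

# Hedged sketch (illustrative only): verify that a caption contains exactly one
# numbered line per labeled object before accepting it.
import re

def caption_matches_count(caption, frame_cat_cnts):
    if caption is None:
        return False
    numbered = [l for l in caption.splitlines() if re.match(r"\s*\d+\.", l)]
    return len(numbered) == frame_cat_cnts

# Example: a reply for a frame with 2 labeled bears.
reply = "1. the bear is leaning forward to catch prey\n2. a bear is standing upright"
print(caption_matches_count(reply, 2))  # True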
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250206153011.py ADDED
@@ -0,0 +1,644 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+
48
+ def number_objects_and_encode_old(idx, color_mask=False):
49
+ encoded_frames = {}
50
+ contoured_frames = {} # New dictionary for original images
51
+ vid_cat_cnts = {}
52
+
53
+ vid_meta = metas[idx]
54
+ vid_data = train_dataset[idx]
55
+ vid_id = vid_meta['video']
56
+ frame_indx = vid_meta['sample_indx']
57
+ cat_names = set(vid_meta['obj_id_cat'].values())
58
+ imgs = vid_data[0]
59
+
60
+ for cat in cat_names:
61
+ cat_frames = []
62
+ contour_frames = []
63
+ frame_cat_cnts = {}
64
+
65
+ for i in range(imgs.size(0)):
66
+ frame_name = frame_indx[i]
67
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+
70
+ frame_data = vid_data[2][frame_name]
71
+ obj_ids = list(frame_data.keys())
72
+
73
+ cat_cnt = 0
74
+
75
+ for j in range(len(obj_ids)):
76
+ obj_id = obj_ids[j]
77
+ obj_data = frame_data[obj_id]
78
+ obj_bbox = obj_data['bbox']
79
+ obj_valid = obj_data['valid']
80
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
81
+ obj_cat = obj_data['category_name']
82
+
83
+ if obj_cat == cat and obj_valid:
84
+ cat_cnt += 1
85
+
86
+ if color_mask == False:
87
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
88
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
89
+ for contour in contours: # index unused; avoids shadowing the outer loop variable i
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0:
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0]
96
+
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
103
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
104
+
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+ if len(contours) > 0:
124
+ largest_contour = max(contours, key=cv2.contourArea)
125
+ M = cv2.moments(largest_contour)
126
+ if M["m00"] != 0:
127
+ center_x = int(M["m10"] / M["m00"])
128
+ center_y = int(M["m01"] / M["m00"])
129
+ else:
130
+ center_x, center_y = 0, 0
131
+
132
+ font = cv2.FONT_HERSHEY_SIMPLEX
133
+ text = obj_id
134
+
135
+ font_scale = 0.9
136
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
137
+ text_x = center_x - text_size[0] // 1
138
+ text_y = center_y
139
+
140
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
141
+ rect_end = (text_x + text_size[0] + 5, text_y)
142
+
143
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
144
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
145
+
146
+ # plt.figure(figsize=(12, 8))
147
+ # plt.imshow(frame)
148
+ # plt.title(f"frame {frame_name}")
149
+ # plt.tight_layout()
150
+ # plt.axis('off')
151
+ # plt.show()
152
+
153
+ buffer = BytesIO()
154
+ frame = Image.fromarray(frame)
155
+ frame.save(buffer, format='jpeg')
156
+ buffer.seek(0)
157
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
158
+ frame_cat_cnts[frame_name] = cat_cnt
159
+
160
+ buffer.seek(0) # Reuse buffer instead of creating a new one
161
+ buffer.truncate()
162
+ frame_for_contour = Image.fromarray(frame_for_contour)
163
+ frame_for_contour.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+
167
+ encoded_frames[cat] = cat_frames
168
+ contoured_frames[cat] = contour_frames
169
+ vid_cat_cnts[cat] = frame_cat_cnts
170
+
171
+ return encoded_frames, contoured_frames, vid_cat_cnts
172
+
173
+
174
+ def number_objects_and_encode(idx, color_mask=False):
175
+ encoded_frames = {}
176
+ contoured_frames = {} # New dictionary for original images
177
+ vid_cat_cnts = {}
178
+
179
+ vid_meta = metas[idx]
180
+ vid_data = train_dataset[idx]
181
+ vid_id = vid_meta['video']
182
+ frame_indx = vid_meta['sample_indx']
183
+ cat_names = set(vid_meta['obj_id_cat'].values())
184
+ imgs = vid_data[0]
185
+
186
+ for cat in cat_names:
187
+ cat_frames = []
188
+ contour_frames = []
189
+ frame_cat_cnts = {}
190
+
191
+ for i in range(imgs.size(0)):
192
+ frame_name = frame_indx[i]
193
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
194
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+
196
+ frame_data = vid_data[2][frame_name]
197
+ obj_ids = list(frame_data.keys())
198
+
199
+ cat_cnt = 0
200
+
201
+ for j in range(len(obj_ids)):
202
+ obj_id = obj_ids[j]
203
+ obj_data = frame_data[obj_id]
204
+ obj_bbox = obj_data['bbox']
205
+ obj_valid = obj_data['valid']
206
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
207
+ obj_cat = obj_data['category_name']
208
+
209
+ if obj_cat == cat and obj_valid:
210
+ cat_cnt += 1
211
+
212
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
213
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
214
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
215
+
216
+ if len(contours) > 0:
217
+ largest_contour = max(contours, key=cv2.contourArea)
218
+ M = cv2.moments(largest_contour)
219
+ if M["m00"] != 0:
220
+ center_x = int(M["m10"] / M["m00"])
221
+ center_y = int(M["m01"] / M["m00"])
222
+ else:
223
+ center_x, center_y = 0, 0
224
+
225
+ font = cv2.FONT_HERSHEY_SIMPLEX
226
+ text = obj_id
227
+ font_scale = 1.2
228
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
229
+ text_x = center_x - text_size[0] // 1
230
+ text_y = center_y
231
+
232
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
233
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
234
+
235
+ contour_thickness = 1
236
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
237
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
238
+
239
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
240
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
241
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
242
+
243
+
244
+ if color_mask:
245
+ alpha = 0.08
246
+ colored_obj_mask = np.zeros_like(frame)
247
+ colored_obj_mask[obj_mask == 1] = colors[j]
248
+ frame[obj_mask == 1] = (
249
+ (1 - alpha) * frame[obj_mask == 1]
250
+ + alpha * colored_obj_mask[obj_mask == 1]
251
+ )
252
+
253
+ # plt.figure(figsize=(12, 8))
254
+ # plt.imshow(frame)
255
+ # plt.title(f"frame {frame_name}")
256
+ # plt.tight_layout()
257
+ # plt.axis('off')
258
+ # plt.show()
259
+
260
+ buffer = BytesIO()
261
+ frame = Image.fromarray(frame)
262
+ frame.save(buffer, format='jpeg')
263
+ buffer.seek(0)
264
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
265
+ frame_cat_cnts[frame_name] = cat_cnt
266
+
267
+ buffer.seek(0) # Reuse buffer instead of creating a new one
268
+ buffer.truncate()
269
+ frame_for_contour = Image.fromarray(frame_for_contour)
270
+ frame_for_contour.save(buffer, format='jpeg')
271
+ buffer.seek(0)
272
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
273
+
274
+ encoded_frames[cat] = cat_frames
275
+ contoured_frames[cat] = contour_frames
276
+ vid_cat_cnts[cat] = frame_cat_cnts
277
+
278
+ return encoded_frames, contoured_frames, vid_cat_cnts
279
+
280
+
281
+
282
+ def getCaption(idx, model='gpt-4o'):
283
+ vid_meta = metas[idx]
284
+ vid_data = train_dataset[idx]
285
+ vid_id = vid_meta['video']
286
+ print(f"vid id: {vid_id}\n")
287
+
288
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
289
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
290
+ all_captions = dict()
291
+
292
+ # color_mask = random.choice([True, False])
293
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
294
+
295
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
296
+ #marked = "mask with boundary" if color_mask else "boundary"
297
+
298
+ for cat_name in list(cat_names) :
299
+
300
+ is_movable = False
301
+ if cat_name in ytvos_category_valid_list :
302
+ is_movable = True
303
+
304
+ if not is_movable:
305
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
306
+
307
+
308
+ image_captions = {}
309
+ captioner = OpenAI()
310
+ cat_base64_frames = base64_frames[cat_name]
311
+ # cont_base64_frames = contoured_frames[cat_name]
312
+
313
+ for i in range(len(cat_base64_frames)):
314
+ frame_name = frame_indx[i]
315
+ # cont_base64_image = cont_base64_frames[i]
316
+ base64_image = cat_base64_frames[i]
317
+ should_filter = False
318
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
319
+
320
+ if frame_cat_cnts >= 2:
321
+ should_filter = True
322
+ else:
323
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
324
+
325
+
326
+ if is_movable and should_filter:
327
+ # Step 1: filtering
328
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
329
+ caption_filter_text = f"""
330
+ You are a visual assistant analyzing a single frame from a video.
331
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
332
+
333
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
334
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
335
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
336
+
337
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
338
+
339
+ - Respond with "YES" if:
340
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
341
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
342
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
343
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
344
+
345
+ - Respond with "NONE" if:
346
+ 1) The actions or pose are not clearly differentiable or too similar.
347
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
348
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
349
+
350
+ Answer strictly with either "YES" or "NONE".
351
+ """
352
+
353
+ response1 = captioner.chat.completions.create(
354
+ model=model,
355
+ messages=[
356
+ {
357
+ "role": "user",
358
+ "content": [
359
+ {
360
+ "type": "text",
361
+ "text": caption_filter_text,
362
+ },
363
+ {
364
+ "type": "image_url",
365
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
366
+ }
367
+ ],
368
+ }
369
+ ],
370
+ )
371
+ response_content = response1.choices[0].message.content
372
+ should_caption = True if "yes" in response_content.lower() else False
373
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
374
+
375
+ else:
376
+ should_caption = False
377
+
378
+ # Step 2: create dense captions
379
+ dense_caption_prompt_1 = f"""
380
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
381
+
382
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
383
+
384
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
385
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
386
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
387
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
388
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
389
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
390
+ - expressions like 'seems to be', 'appears to be' are BANNED!
391
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
392
+ 8. Include interactions with objects or other entities when they are prominent and observable.
393
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
394
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
395
+ 11. Do not mention object IDs.
396
+ 12. Use '{cat_name}' as the noun for the referring expressions.
397
+
398
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
399
+
400
+ - Your answer should contain details, and follow the following format:
401
+ object id. action-oriented description
402
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
403
+ 2. a person bending over and touching his boots to tie the shoelace.)
404
+ - for action-oriented description, use {cat_name} as subject noun
405
+
406
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
407
+ Please pay attention to the categories of these objects and don’t change them.
408
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
409
+ Output referring expressions for each object id. Please start your answer:"""
410
+
411
+
412
+ dense_caption_prompt_2 = f"""
413
+ You are an advanced visual language model analyzing a video frame.
414
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
415
+
416
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
417
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
418
+
419
+ ---
420
+ ## Key Guidelines:
421
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
422
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
423
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
424
+
425
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
426
+ - (X) "A large brown bear standing on the left"
427
+ - (O) "The bear is lifting its front paws and swiping forward."
428
+
429
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
430
+ - (O) "The giraffe is tilting its head and sniffing the ground."
431
+ - (X) "The giraffe is near a tree and looking around."
432
+
433
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
434
+ - (X) "The person seems excited" / "The person might be preparing to jump."
435
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
436
+
437
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
438
+ - expressions like 'seems to be', 'appears to be' are BANNED!
439
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
440
+
441
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
442
+ - **Each object should have a unique, descriptive action.**
443
+ - (X) "Two dogs are running."
444
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
445
+ 2. The other dog is looking back while speeding up."
446
+
447
+ ---
448
+ ## Output Format:
449
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
450
+ - Format: `ID. {cat_name} + action-based description`
451
+ - (O) Example:
452
+ ```
453
+ 1. The person is leaning forward while opening a bag with both hands.
454
+ 2. The person is holding onto a rope and pulling themselves up.
455
+ ```
456
+ - **Ensure that each object is described individually.**
457
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
458
+
459
+ ---
460
+ ## Additional Instructions:
461
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
462
+ - **Do NOT** mention object IDs in the description (only use the provided format).
463
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
464
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
465
+
466
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
467
+ """
468
+
469
+
470
+ dense_caption_prompt = f"""
471
+ You are a visual assistant analyzing a single frame of a video.
472
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
473
+
474
+ I am building an **action-centric referring expression** dataset.
475
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
476
+
477
+ ---
478
+ ## Guidelines:
479
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
480
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
481
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
482
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
483
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
484
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
485
+ 7. Base your descriptions on these principles:
486
+ - **Avoid words like 'minimal' or 'slightly'.**
487
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
488
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
489
+ - **Specify actions with other objects or entities** only when they are clear and observable.
490
+ - (O) "pushing another person"
491
+ - (X) "interacting with another object"
492
+
493
+ ---
494
+ ## Output Format:
495
+ - Each labeled **{cat_name}** must have **exactly one line**.
496
+ - Format: `ID. {cat_name} + action-based description`
497
+ - (O) Example:
498
+ ```
499
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
500
+ 2. The person is pulling a baby carriage while smiling.
501
+ ```
502
+ - **Ensure each object is described individually.**
503
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
504
+
505
+ ---
506
+ ## Example:
507
+ If the frame has two labeled **bears**, your output should be:
508
+ ```
509
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
510
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
511
+ ```
512
+
513
+ ---
514
+ ## Additional Instructions:
515
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
516
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
517
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
518
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
519
+
520
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
521
+
522
+
523
+ MAX_RETRIES = 3
524
+ retry_count = 0
525
+
526
+ if should_caption:
527
+ while retry_count < MAX_RETRIES:
528
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
529
+
530
+ response2 = captioner.chat.completions.create(
531
+ model=model,
532
+ messages=[
533
+ {
534
+ "role": "user",
535
+ "content": [
536
+ {
537
+ "type": "text",
538
+ "text": selected_prompt,
539
+ },
540
+ {
541
+ "type": "image_url",
542
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
543
+ },
544
+ ],
545
+ }
546
+ ],
547
+ )
548
+
549
+ # caption = response2.choices[0].message.content
550
+ #print(f"{image_path} - {frame_name}: {caption}")
551
+
552
+ caption = response2.choices[0].message.content.strip()
553
+ caption_lower = caption.lower().lstrip()
554
+
555
+ if caption_lower.startswith("1.") and not any(
556
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
557
+ ):
558
+ break
559
+
560
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
561
+ retry_count += 1
562
+ time.sleep(2)
563
+
564
+ if retry_count == MAX_RETRIES:
565
+ caption = None
566
+ print("Max retries reached. Caption generation failed.")
567
+
568
+ else:
569
+ caption = None
570
+
571
+ image_captions[frame_name] = caption
572
+ all_captions[cat_name] = image_captions
573
+
574
+ # final : also prepare valid object ids
575
+ valid_obj_ids = dict()
576
+
577
+ for cat in cat_names:
578
+ if cat in ytvos_category_valid_list:
579
+ obj_id_cat = vid_meta['obj_id_cat']
580
+ valid_cat_ids = []
581
+ for obj_id in list(obj_id_cat.keys()):
582
+ if obj_id_cat[obj_id] == cat:
583
+ valid_cat_ids.append(obj_id)
584
+ valid_obj_ids[cat] = valid_cat_ids
585
+
586
+ return vid_id, all_captions, valid_obj_ids
587
+
588
+
589
+ if __name__ == '__main__':
590
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
591
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
592
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
593
+
594
+ args = parser.parse_args()
595
+
596
+ #==================Load data===================
597
+ # Full dataset
598
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
599
+
600
+ # Metadata for the full dataset
601
+ metas = train_dataset.metas
602
+
603
+ # 8 candidate colors (RGB format)
604
+ colors = [
605
+ (255, 0, 0), # Red
606
+ (0, 255, 0), # Green
607
+ (0, 0, 255), # Blue
608
+ (255, 255, 0), # Yellow
609
+ (255, 0, 255), # Magenta
610
+ (0, 255, 255), # Cyan
611
+ (128, 0, 128), # Purple
612
+ (255, 165, 0) # Orange
613
+ ]
614
+
615
+ ytvos_category_valid_list = [
616
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
617
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
618
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
619
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
620
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
621
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
622
+ ]
623
+
624
+ #==================Run GPT===================
625
+ os.environ['OPENAI_API_KEY'] = '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'
626
+
627
+ result_captions = {}
628
+ result_valid_obj_ids = {}
629
+
630
+ for i in range(370):
631
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
632
+
633
+ if vid_id not in result_captions:
634
+ result_captions[vid_id] = all_captions
635
+ if vid_id not in result_valid_obj_ids:
636
+ result_valid_obj_ids[vid_id] = valid_obj_ids
637
+
638
+ print("Finished!", flush=True)
639
+
640
+ with open(args.save_caption_path, "w") as file:
641
+ json.dump(result_captions, file, indent=4)
642
+
643
+ with open(args.save_valid_obj_ids_path, "w") as file:
644
+ json.dump(result_valid_obj_ids, file, indent=4)
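This variant samples color_mask with weights [60, 40] and picks one of two dense-caption prompts per frame, but the numbered frames themselves are only kept in memory as base64 JPEG strings. Below is a small sketch for decoding one of those strings to spot-check the ID overlays; the function name and output path are assumptions, not part of this commit:

# Hedged sketch (not part of the committed scripts): decode a base64 JPEG frame
# produced by number_objects_and_encode to inspect the numbered overlays.
import base64
from io import BytesIO
from PIL import Image

def decode_frame(b64_jpeg, out_path="frame_check.jpg"):
    image = Image.open(BytesIO(base64.b64decode(b64_jpeg)))
    image.save(out_path)
    return image

# Usage inside getCaption, e.g. right after base64_image is selected:
# decode_frame(base64_image, f"{vid_id}_{cat_name}_{frame_name}.jpg")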
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171300.py ADDED
@@ -0,0 +1,644 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+
48
+ def number_objects_and_encode_old(idx, color_mask=False):
49
+ encoded_frames = {}
50
+ contoured_frames = {} # New dictionary for original images
51
+ vid_cat_cnts = {}
52
+
53
+ vid_meta = metas[idx]
54
+ vid_data = train_dataset[idx]
55
+ vid_id = vid_meta['video']
56
+ frame_indx = vid_meta['sample_indx']
57
+ cat_names = set(vid_meta['obj_id_cat'].values())
58
+ imgs = vid_data[0]
59
+
60
+ for cat in cat_names:
61
+ cat_frames = []
62
+ contour_frames = []
63
+ frame_cat_cnts = {}
64
+
65
+ for i in range(imgs.size(0)):
66
+ frame_name = frame_indx[i]
67
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+
70
+ frame_data = vid_data[2][frame_name]
71
+ obj_ids = list(frame_data.keys())
72
+
73
+ cat_cnt = 0
74
+
75
+ for j in range(len(obj_ids)):
76
+ obj_id = obj_ids[j]
77
+ obj_data = frame_data[obj_id]
78
+ obj_bbox = obj_data['bbox']
79
+ obj_valid = obj_data['valid']
80
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
81
+ obj_cat = obj_data['category_name']
82
+
83
+ if obj_cat == cat and obj_valid:
84
+ cat_cnt += 1
85
+
86
+ if color_mask == False:
87
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
88
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
89
+ for contour in contours:
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0:
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0]
96
+
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
103
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
104
+
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+ if len(contours) > 0:
124
+ largest_contour = max(contours, key=cv2.contourArea)
125
+ M = cv2.moments(largest_contour)
126
+ if M["m00"] != 0:
127
+ center_x = int(M["m10"] / M["m00"])
128
+ center_y = int(M["m01"] / M["m00"])
129
+ else:
130
+ center_x, center_y = 0, 0
131
+
132
+ font = cv2.FONT_HERSHEY_SIMPLEX
133
+ text = obj_id
134
+
135
+ font_scale = 0.9
136
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
137
+ text_x = center_x - text_size[0] // 1
138
+ text_y = center_y
139
+
140
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
141
+ rect_end = (text_x + text_size[0] + 5, text_y)
142
+
143
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
144
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
145
+
146
+ # plt.figure(figsize=(12, 8))
147
+ # plt.imshow(frame)
148
+ # plt.title(f"frame {frame_name}")
149
+ # plt.tight_layout()
150
+ # plt.axis('off')
151
+ # plt.show()
152
+
153
+ buffer = BytesIO()
154
+ frame = Image.fromarray(frame)
155
+ frame.save(buffer, format='jpeg')
156
+ buffer.seek(0)
157
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
158
+ frame_cat_cnts[frame_name] = cat_cnt
159
+
160
+ buffer.seek(0) # Reuse buffer instead of creating a new one
161
+ buffer.truncate()
162
+ frame_for_contour = Image.fromarray(frame_for_contour)
163
+ frame_for_contour.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+
167
+ encoded_frames[cat] = cat_frames
168
+ contoured_frames[cat] = contour_frames
169
+ vid_cat_cnts[cat] = frame_cat_cnts
170
+
171
+ return encoded_frames, contoured_frames, vid_cat_cnts
172
+
173
+
174
+ def number_objects_and_encode(idx, color_mask=False):
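+ # Draws each valid object's mask contour and a black box with its numeric ID on every sampled frame,
+ # then returns base64-encoded JPEGs per category, contour-only copies, and per-frame object counts.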
175
+ encoded_frames = {}
176
+ contoured_frames = {} # New dictionary for original images
177
+ vid_cat_cnts = {}
178
+
179
+ vid_meta = metas[idx]
180
+ vid_data = train_dataset[idx]
181
+ vid_id = vid_meta['video']
182
+ frame_indx = vid_meta['sample_indx']
183
+ cat_names = set(vid_meta['obj_id_cat'].values())
184
+ imgs = vid_data[0]
185
+
186
+ for cat in cat_names:
187
+ cat_frames = []
188
+ contour_frames = []
189
+ frame_cat_cnts = {}
190
+
191
+ for i in range(imgs.size(0)):
192
+ frame_name = frame_indx[i]
193
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
194
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+
196
+ frame_data = vid_data[2][frame_name]
197
+ obj_ids = list(frame_data.keys())
198
+
199
+ cat_cnt = 0
200
+
201
+ for j in range(len(obj_ids)):
202
+ obj_id = obj_ids[j]
203
+ obj_data = frame_data[obj_id]
204
+ obj_bbox = obj_data['bbox']
205
+ obj_valid = obj_data['valid']
206
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
207
+ obj_cat = obj_data['category_name']
208
+
209
+ if obj_cat == cat and obj_valid:
210
+ cat_cnt += 1
211
+
212
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
213
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
214
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
215
+
216
+ if len(contours) > 0:
217
+ largest_contour = max(contours, key=cv2.contourArea)
218
+ M = cv2.moments(largest_contour)
219
+ if M["m00"] != 0:
220
+ center_x = int(M["m10"] / M["m00"])
221
+ center_y = int(M["m01"] / M["m00"])
222
+ else:
223
+ center_x, center_y = 0, 0
224
+
225
+ font = cv2.FONT_HERSHEY_SIMPLEX
226
+ text = obj_id
227
+ font_scale = 1.2
228
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
229
+ text_x = center_x - text_size[0] // 1
230
+ text_y = center_y
231
+
232
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
233
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
234
+
235
+ contour_thickness = 1
236
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
237
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
238
+
239
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
240
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
241
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)  # draw the ID at the same scale used to size its background box
242
+
243
+
244
+ if color_mask:
245
+ alpha = 0.08
246
+ colored_obj_mask = np.zeros_like(frame)
247
+ colored_obj_mask[obj_mask == 1] = colors[j]
248
+ frame[obj_mask == 1] = (
249
+ (1 - alpha) * frame[obj_mask == 1]
250
+ + alpha * colored_obj_mask[obj_mask == 1]
251
+ )
252
+
253
+ # plt.figure(figsize=(12, 8))
254
+ # plt.imshow(frame)
255
+ # plt.title(f"frame {frame_name}")
256
+ # plt.tight_layout()
257
+ # plt.axis('off')
258
+ # plt.show()
259
+
260
+ buffer = BytesIO()
261
+ frame = Image.fromarray(frame)
262
+ frame.save(buffer, format='jpeg')
263
+ buffer.seek(0)
264
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
265
+ frame_cat_cnts[frame_name] = cat_cnt
266
+
267
+ buffer.seek(0) # Reuse buffer instead of creating a new one
268
+ buffer.truncate()
269
+ frame_for_contour = Image.fromarray(frame_for_contour)
270
+ frame_for_contour.save(buffer, format='jpeg')
271
+ buffer.seek(0)
272
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
273
+
274
+ encoded_frames[cat] = cat_frames
275
+ contoured_frames[cat] = contour_frames
276
+ vid_cat_cnts[cat] = frame_cat_cnts
277
+
278
+ return encoded_frames, contoured_frames, vid_cat_cnts
279
+
280
+
281
+
282
+ def getCaption(idx, model='gpt-4o'):
283
+ vid_meta = metas[idx]
284
+ vid_data = train_dataset[idx]
285
+ vid_id = vid_meta['video']
286
+ print(f"vid id: {vid_id}\n")
287
+
288
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
289
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
290
+ all_captions = dict()
291
+
292
+ # color_mask = random.choice([True, False])
293
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
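+ # Roughly 60% of videos are annotated with contours only, 40% also get a faint colored mask overlay.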
294
+
295
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
296
+ #marked = "mask with boundary" if color_mask else "boundary"
297
+
298
+ for cat_name in list(cat_names) :
299
+
300
+ is_movable = False
301
+ if cat_name in ytvos_category_valid_list :
302
+ is_movable = True
303
+
304
+ if not is_movable:
305
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
306
+
307
+
308
+ image_captions = {}
309
+ captioner = OpenAI()
310
+ cat_base64_frames = base64_frames[cat_name]
311
+ # cont_base64_frames = contoured_frames[cat_name]
312
+
313
+ for i in range(len(cat_base64_frames)):
314
+ frame_name = frame_indx[i]
315
+ # cont_base64_image = cont_base64_frames[i]
316
+ base64_image = cat_base64_frames[i]
317
+ should_filter = False
318
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
319
+
320
+ if frame_cat_cnts >= 2:
321
+ should_filter = True
322
+ else:
323
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
324
+
325
+
326
+ if is_movable and should_filter:
327
+ # Step 1: filtering
328
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
329
+ caption_filter_text = f"""
330
+ You are a visual assistant analyzing a single frame from a video.
331
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
332
+
333
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
334
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
335
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
336
+
337
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
338
+
339
+ - Respond with "YES" if:
340
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
341
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
342
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
343
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
344
+
345
+ - Respond with "NONE" if:
346
+ 1) The actions or pose are not clearly differentiable or too similar.
347
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
348
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
349
+
350
+ Answer strictly with either "YES" or "NONE".
351
+ """
352
+
353
+ response1 = captioner.chat.completions.create(
354
+ model=model,
355
+ messages=[
356
+ {
357
+ "role": "user",
358
+ "content": [
359
+ {
360
+ "type": "text",
361
+ "text": caption_filter_text,
362
+ },
363
+ {
364
+ "type": "image_url",
365
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
366
+ }
367
+ ],
368
+ }
369
+ ],
370
+ )
371
+ response_content = response1.choices[0].message.content
372
+ should_caption = True if "yes" in response_content.lower() else False
373
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
374
+
375
+ else:
376
+ should_caption = False
377
+
378
+ # Step 2: generate dense captions
379
+ dense_caption_prompt_1 = f"""
380
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
381
+
382
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
383
+
384
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
385
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
386
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
387
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
388
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
389
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
390
+ - expressions like 'seems to be', 'appears to be' are BANNED!
391
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
392
+ 8. Include interactions with objects or other entities when they are prominent and observable.
393
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
394
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
395
+ 11. Do not mention object IDs.
396
+ 12. Use '{cat_name}' as the noun for the referring expressions.
397
+
398
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
399
+
400
+ - Your answer should contain details, and follow the following format:
401
+ object id. action-oriented description
402
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
403
+ 2. a person bending over and touching his boots to tie the shoelace.)
404
+ - for action-oriented description, use {cat_name} as subject noun
405
+
406
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
407
+ Please pay attention to the categories of these objects and don’t change them.
408
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
409
+ Output referring expressions for each object id. Please start your answer:"""
410
+
411
+
412
+ dense_caption_prompt_2 = f"""
413
+ You are an advanced visual language model analyzing a video frame.
414
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
415
+
416
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
417
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
418
+
419
+ ---
420
+ ## Key Guidelines:
421
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
422
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
423
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
424
+
425
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
426
+ - (X) "A large brown bear standing on the left"
427
+ - (O) "The bear is lifting its front paws and swiping forward."
428
+
429
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
430
+ - (O) "The giraffe is tilting its head and sniffing the ground."
431
+ - (X) "The giraffe is near a tree and looking around."
432
+
433
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
434
+ - (X) "The person seems excited" / "The person might be preparing to jump."
435
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
436
+
437
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
438
+ - expressions like 'seems to be', 'appears to be' are BANNED!
439
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
440
+
441
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
442
+ - **Each object should have a unique, descriptive action.**
443
+ - (X) "Two dogs are running."
444
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
445
+ 2. The other dog is looking back while speeding up."
446
+
447
+ ---
448
+ ## Output Format:
449
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
450
+ - Format: `ID. {cat_name} + action-based description`
451
+ - (O) Example:
452
+ ```
453
+ 1. The person is leaning forward while opening a bag with both hands.
454
+ 2. The person is holding onto a rope and pulling themselves up.
455
+ ```
456
+ - **Ensure that each object is described individually.**
457
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
458
+
459
+ ---
460
+ ## Additional Instructions:
461
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
462
+ - **Do NOT** mention object IDs in the description (only use the provided format).
463
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
464
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
465
+
466
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
467
+ """
468
+
469
+
470
+ dense_caption_prompt = f"""
471
+ You are a visual assistant analyzing a single frame of a video.
472
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
473
+
474
+ I am building an **action-centric referring expression** dataset.
475
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
476
+
477
+ ---
478
+ ## Guidelines:
479
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
480
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
481
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
482
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
483
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
484
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
485
+ 7. Base your descriptions on these principles:
486
+ - **Avoid words like 'minimal' or 'slightly'.**
487
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
488
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
489
+ - **Specify actions with other objects or entities** only when they are clear and observable.
490
+ - (O) "pushing another person"
491
+ - (X) "interacting with another object"
492
+
493
+ ---
494
+ ## Output Format:
495
+ - Each labeled **{cat_name}** must have **exactly one line**.
496
+ - Format: `ID. {cat_name} + action-based description`
497
+ - (O) Example:
498
+ ```
499
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
500
+ 2. The person is pulling a baby carriage while smiling.
501
+ ```
502
+ - **Ensure each object is described individually.**
503
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
504
+
505
+ ---
506
+ ## Example:
507
+ If the frame has two labeled **bears**, your output should be:
508
+ ```
509
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
510
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
511
+ ```
512
+
513
+ ---
514
+ ## Additional Instructions:
515
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
516
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
517
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
518
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
519
+
520
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
521
+
522
+
523
+ MAX_RETRIES = 3
524
+ retry_count = 0
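+ # Retry the captioning call up to MAX_RETRIES times; each attempt re-samples one of the two dense-caption prompts.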
525
+
526
+ if should_caption:
527
+ while retry_count < MAX_RETRIES:
528
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
529
+
530
+ response2 = captioner.chat.completions.create(
531
+ model=model,
532
+ messages=[
533
+ {
534
+ "role": "user",
535
+ "content": [
536
+ {
537
+ "type": "text",
538
+ "text": selected_prompt,
539
+ },
540
+ {
541
+ "type": "image_url",
542
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
543
+ },
544
+ ],
545
+ }
546
+ ],
547
+ )
548
+
549
+ # caption = response2.choices[0].message.content
550
+ #print(f"{image_path} - {frame_name}: {caption}")
551
+
552
+ caption = response2.choices[0].message.content.strip()
553
+ caption_lower = caption.lower().lstrip()
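+ # Sanity check: accept the caption only if it starts with a numbered list ("1.") and is not an apology/refusal.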
554
+
555
+ if caption_lower.startswith("1.") and not any(
556
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
557
+ ):
558
+ break
559
+
560
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
561
+ retry_count += 1
562
+ time.sleep(2)
563
+
564
+ if retry_count == MAX_RETRIES:
565
+ caption = None
566
+ print("Max retries reached. Caption generation failed.")
567
+
568
+ else:
569
+ caption = None
570
+
571
+ image_captions[frame_name] = caption
572
+ all_captions[cat_name] = image_captions
573
+
574
+ # final : also prepare valid object ids
575
+ valid_obj_ids = dict()
576
+
577
+ for cat in cat_names:
578
+ if cat in ytvos_category_valid_list:
579
+ obj_id_cat = vid_meta['obj_id_cat']
580
+ valid_cat_ids = []
581
+ for obj_id in list(obj_id_cat.keys()):
582
+ if obj_id_cat[obj_id] == cat:
583
+ valid_cat_ids.append(obj_id)
584
+ valid_obj_ids[cat] = valid_cat_ids
585
+
586
+ return vid_id, all_captions, valid_obj_ids
587
+
588
+
589
+ if __name__ == '__main__':
590
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
591
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
592
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
593
+
594
+ args = parser.parse_args()
595
+
596
+ #================== Load data ===================
597
+ # Full dataset
598
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
599
+
600
+ # Metadata for the full dataset
601
+ metas = train_dataset.metas
602
+
603
+ # 8 color candidates (RGB format)
604
+ colors = [
605
+ (255, 0, 0), # Red
606
+ (0, 255, 0), # Green
607
+ (0, 0, 255), # Blue
608
+ (255, 255, 0), # Yellow
609
+ (255, 0, 255), # Magenta
610
+ (0, 255, 255), # Cyan
611
+ (128, 0, 128), # Purple
612
+ (255, 165, 0) # Orange
613
+ ]
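+ # Colors are picked by object index (colors[j]); this assumes at most 8 annotated objects per frame.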
614
+
615
+ ytvos_category_valid_list = [
616
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
617
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
618
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
619
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
620
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
621
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
622
+ ]
623
+
624
+ #================== Run GPT ===================
625
+ os.environ['OPENAI_API_KEY'] = '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'
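+ # The OpenAI() client below reads OPENAI_API_KEY from the environment; ideally the key is injected externally instead of being hardcoded here.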
626
+
627
+ result_captions = {}
628
+ result_valid_obj_ids = {}
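+ # Caption every video in the training split; only the first result per video id is kept.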
629
+
630
+ for i in range(len(metas)):
631
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
632
+
633
+ if vid_id not in result_captions:
634
+ result_captions[vid_id] = all_captions
635
+ if vid_id not in result_valid_obj_ids:
636
+ result_valid_obj_ids[vid_id] = valid_obj_ids
637
+
638
+ print("Finished!", flush=True)
639
+
640
+ with open(args.save_caption_path, "w") as file:
641
+ json.dump(result_captions, file, indent=4)
642
+
643
+ with open(args.save_valid_obj_ids_path, "w") as file:
644
+ json.dump(result_valid_obj_ids, file, indent=4)
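+ # Rough usage sketch for the saved outputs (file names assumed from the argparse defaults above):
+ # with open("mbench/numbered_captions_gpt-4o_randcap.json") as f:
+ #     captions = json.load(f)   # {video_id: {category: {frame_name: caption or None}}}
+ # with open("mbench/numbered_valid_obj_ids_gpt-4o_randcap.json") as f:
+ #     valid_ids = json.load(f)  # {video_id: {category: [object ids]}}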
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171416.py ADDED
@@ -0,0 +1,644 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+
48
+ def number_objects_and_encode_old(idx, color_mask=False):
49
+ encoded_frames = {}
50
+ contoured_frames = {} # New dictionary for original images
51
+ vid_cat_cnts = {}
52
+
53
+ vid_meta = metas[idx]
54
+ vid_data = train_dataset[idx]
55
+ vid_id = vid_meta['video']
56
+ frame_indx = vid_meta['sample_indx']
57
+ cat_names = set(vid_meta['obj_id_cat'].values())
58
+ imgs = vid_data[0]
59
+
60
+ for cat in cat_names:
61
+ cat_frames = []
62
+ contour_frames = []
63
+ frame_cat_cnts = {}
64
+
65
+ for i in range(imgs.size(0)):
66
+ frame_name = frame_indx[i]
67
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+
70
+ frame_data = vid_data[2][frame_name]
71
+ obj_ids = list(frame_data.keys())
72
+
73
+ cat_cnt = 0
74
+
75
+ for j in range(len(obj_ids)):
76
+ obj_id = obj_ids[j]
77
+ obj_data = frame_data[obj_id]
78
+ obj_bbox = obj_data['bbox']
79
+ obj_valid = obj_data['valid']
80
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
81
+ obj_cat = obj_data['category_name']
82
+
83
+ if obj_cat == cat and obj_valid:
84
+ cat_cnt += 1
85
+
86
+ if color_mask == False:
87
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
88
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
89
+ for contour in contours:
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0:
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0]
96
+
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
103
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
104
+
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+ if len(contours) > 0:
124
+ largest_contour = max(contours, key=cv2.contourArea)
125
+ M = cv2.moments(largest_contour)
126
+ if M["m00"] != 0:
127
+ center_x = int(M["m10"] / M["m00"])
128
+ center_y = int(M["m01"] / M["m00"])
129
+ else:
130
+ center_x, center_y = 0, 0
131
+
132
+ font = cv2.FONT_HERSHEY_SIMPLEX
133
+ text = obj_id
134
+
135
+ font_scale = 0.9
136
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
137
+ text_x = center_x - text_size[0] // 1
138
+ text_y = center_y
139
+
140
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
141
+ rect_end = (text_x + text_size[0] + 5, text_y)
142
+
143
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
144
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
145
+
146
+ # plt.figure(figsize=(12, 8))
147
+ # plt.imshow(frame)
148
+ # plt.title(f"frame {frame_name}")
149
+ # plt.tight_layout()
150
+ # plt.axis('off')
151
+ # plt.show()
152
+
153
+ buffer = BytesIO()
154
+ frame = Image.fromarray(frame)
155
+ frame.save(buffer, format='jpeg')
156
+ buffer.seek(0)
157
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
158
+ frame_cat_cnts[frame_name] = cat_cnt
159
+
160
+ buffer.seek(0) # Reuse buffer instead of creating a new one
161
+ buffer.truncate()
162
+ frame_for_contour = Image.fromarray(frame_for_contour)
163
+ frame_for_contour.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+
167
+ encoded_frames[cat] = cat_frames
168
+ contoured_frames[cat] = contour_frames
169
+ vid_cat_cnts[cat] = frame_cat_cnts
170
+
171
+ return encoded_frames, contoured_frames, vid_cat_cnts
172
+
173
+
174
+ def number_objects_and_encode(idx, color_mask=False):
175
+ encoded_frames = {}
176
+ contoured_frames = {} # New dictionary for original images
177
+ vid_cat_cnts = {}
178
+
179
+ vid_meta = metas[idx]
180
+ vid_data = train_dataset[idx]
181
+ vid_id = vid_meta['video']
182
+ frame_indx = vid_meta['sample_indx']
183
+ cat_names = set(vid_meta['obj_id_cat'].values())
184
+ imgs = vid_data[0]
185
+
186
+ for cat in cat_names:
187
+ cat_frames = []
188
+ contour_frames = []
189
+ frame_cat_cnts = {}
190
+
191
+ for i in range(imgs.size(0)):
192
+ frame_name = frame_indx[i]
193
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
194
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+
196
+ frame_data = vid_data[2][frame_name]
197
+ obj_ids = list(frame_data.keys())
198
+
199
+ cat_cnt = 0
200
+
201
+ for j in range(len(obj_ids)):
202
+ obj_id = obj_ids[j]
203
+ obj_data = frame_data[obj_id]
204
+ obj_bbox = obj_data['bbox']
205
+ obj_valid = obj_data['valid']
206
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
207
+ obj_cat = obj_data['category_name']
208
+
209
+ if obj_cat == cat and obj_valid:
210
+ cat_cnt += 1
211
+
212
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
213
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
214
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
215
+
216
+ if len(contours) > 0:
217
+ largest_contour = max(contours, key=cv2.contourArea)
218
+ M = cv2.moments(largest_contour)
219
+ if M["m00"] != 0:
220
+ center_x = int(M["m10"] / M["m00"])
221
+ center_y = int(M["m01"] / M["m00"])
222
+ else:
223
+ center_x, center_y = 0, 0
224
+
225
+ font = cv2.FONT_HERSHEY_SIMPLEX
226
+ text = obj_id
227
+ font_scale = 1.2
228
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
229
+ text_x = center_x - text_size[0] // 1
230
+ text_y = center_y
231
+
232
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
233
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
234
+
235
+ contour_thickness = 1
236
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
237
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
238
+
239
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
240
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
241
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)  # draw the ID at the same scale used to size its background box
242
+
243
+
244
+ if color_mask:
245
+ alpha = 0.08
246
+ colored_obj_mask = np.zeros_like(frame)
247
+ colored_obj_mask[obj_mask == 1] = colors[j]
248
+ frame[obj_mask == 1] = (
249
+ (1 - alpha) * frame[obj_mask == 1]
250
+ + alpha * colored_obj_mask[obj_mask == 1]
251
+ )
252
+
253
+ # plt.figure(figsize=(12, 8))
254
+ # plt.imshow(frame)
255
+ # plt.title(f"frame {frame_name}")
256
+ # plt.tight_layout()
257
+ # plt.axis('off')
258
+ # plt.show()
259
+
260
+ buffer = BytesIO()
261
+ frame = Image.fromarray(frame)
262
+ frame.save(buffer, format='jpeg')
263
+ buffer.seek(0)
264
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
265
+ frame_cat_cnts[frame_name] = cat_cnt
266
+
267
+ buffer.seek(0) # Reuse buffer instead of creating a new one
268
+ buffer.truncate()
269
+ frame_for_contour = Image.fromarray(frame_for_contour)
270
+ frame_for_contour.save(buffer, format='jpeg')
271
+ buffer.seek(0)
272
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
273
+
274
+ encoded_frames[cat] = cat_frames
275
+ contoured_frames[cat] = contour_frames
276
+ vid_cat_cnts[cat] = frame_cat_cnts
277
+
278
+ return encoded_frames, contoured_frames, vid_cat_cnts
279
+
280
+
281
+
282
+ def getCaption(idx, model='gpt-4o'):
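+ # For the idx-th video, returns (video id, {category: {frame_name: caption or None}}, {category: [valid object ids]}).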
283
+ vid_meta = metas[idx]
284
+ vid_data = train_dataset[idx]
285
+ vid_id = vid_meta['video']
286
+ print(f"vid id: {vid_id}\n")
287
+
288
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
289
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
290
+ all_captions = dict()
291
+
292
+ # color_mask = random.choice([True, False])
293
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
294
+
295
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
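+ # base64_frames: per-category lists of JPEG frames with ID overlays; vid_cat_cnts: per-category, per-frame object counts.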
296
+ #marked = "mask with boundary" if color_mask else "boundary"
297
+
298
+ for cat_name in list(cat_names) :
299
+
300
+ is_movable = False
301
+ if cat_name in ytvos_category_valid_list :
302
+ is_movable = True
303
+
304
+ if not is_movable:
305
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
306
+
307
+
308
+ image_captions = {}
309
+ captioner = OpenAI()
310
+ cat_base64_frames = base64_frames[cat_name]
311
+ # cont_base64_frames = contoured_frames[cat_name]
312
+
313
+ for i in range(len(cat_base64_frames)):
314
+ frame_name = frame_indx[i]
315
+ # cont_base64_image = cont_base64_frames[i]
316
+ base64_image = cat_base64_frames[i]
317
+ should_filter = False
318
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
319
+
320
+ if frame_cat_cnts >= 2:
321
+ should_filter = True
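+ # Only frames containing at least two objects of this category are sent for captioning.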
322
+ else:
323
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
324
+
325
+
326
+ if is_movable and should_filter:
327
+ # Step 1: filtering
328
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
329
+ caption_filter_text = f"""
330
+ You are a visual assistant analyzing a single frame from a video.
331
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
332
+
333
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
334
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
335
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
336
+
337
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
338
+
339
+ - Respond with "YES" if:
340
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
341
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
342
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
343
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
344
+
345
+ - Respond with "NONE" if:
346
+ 1) The actions or pose are not clearly differentiable or too similar.
347
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
348
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
349
+
350
+ Answer strictly with either "YES" or "NONE".
351
+ """
352
+
353
+ response1 = captioner.chat.completions.create(
354
+ model=model,
355
+ messages=[
356
+ {
357
+ "role": "user",
358
+ "content": [
359
+ {
360
+ "type": "text",
361
+ "text": caption_filter_text,
362
+ },
363
+ {
364
+ "type": "image_url",
365
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
366
+ }
367
+ ],
368
+ }
369
+ ],
370
+ )
371
+ response_content = response1.choices[0].message.content
372
+ should_caption = True if "yes" in response_content.lower() else False
373
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
374
+
375
+ else:
376
+ should_caption = False
377
+
378
+ # Step 2: generate dense captions
379
+ dense_caption_prompt_1 = f"""
380
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
381
+
382
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
383
+
384
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
385
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
386
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
387
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
388
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
389
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
390
+ - expressions like 'seems to be', 'appears to be' are BANNED!
391
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
392
+ 8. Include interactions with objects or other entities when they are prominent and observable.
393
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
394
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
395
+ 11. Do not mention object IDs.
396
+ 12. Use '{cat_name}' as the noun for the referring expressions.
397
+
398
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
399
+
400
+ - Your answer should contain details, and follow the following format:
401
+ object id. action-oriented description
402
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
403
+ 2. a person bending over and touching his boots to tie the shoelace.)
404
+ - for action-oriented description, use {cat_name} as subject noun
405
+
406
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
407
+ Please pay attention to the categories of these objects and don’t change them.
408
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
409
+ Output referring expressions for each object id. Please start your answer:"""
410
+
411
+
412
+ dense_caption_prompt_2 = f"""
413
+ You are an advanced visual language model analyzing a video frame.
414
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
415
+
416
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
417
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
418
+
419
+ ---
420
+ ## Key Guidelines:
421
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
422
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
423
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
424
+
425
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
426
+ - (X) "A large brown bear standing on the left"
427
+ - (O) "The bear is lifting its front paws and swiping forward."
428
+
429
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
430
+ - (O) "The giraffe is tilting its head and sniffing the ground."
431
+ - (X) "The giraffe is near a tree and looking around."
432
+
433
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
434
+ - (X) "The person seems excited" / "The person might be preparing to jump."
435
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
436
+
437
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
438
+ - expressions like 'seems to be', 'appears to be' are BANNED!
439
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
440
+
441
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
442
+ - **Each object should have a unique, descriptive action.**
443
+ - (X) "Two dogs are running."
444
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
445
+ 2. The other dog is looking back while speeding up."
446
+
447
+ ---
448
+ ## Output Format:
449
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
450
+ - Format: `ID. {cat_name} + action-based description`
451
+ - (O) Example:
452
+ ```
453
+ 1. The person is leaning forward while opening a bag with both hands.
454
+ 2. The person is holding onto a rope and pulling themselves up.
455
+ ```
456
+ - **Ensure that each object is described individually.**
457
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
458
+
459
+ ---
460
+ ## Additional Instructions:
461
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
462
+ - **Do NOT** mention object IDs in the description (only use the provided format).
463
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
464
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
465
+
466
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
467
+ """
468
+
469
+
470
+ dense_caption_prompt = f"""
471
+ You are a visual assistant analyzing a single frame of a video.
472
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
473
+
474
+ I am building an **action-centric referring expression** dataset.
475
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
476
+
477
+ ---
478
+ ## Guidelines:
479
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
480
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
481
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
482
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
483
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
484
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
485
+ 7. Base your descriptions on these principles:
486
+ - **Avoid words like 'minimal' or 'slightly'.**
487
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
488
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
489
+ - **Specify actions with other objects or entities** only when they are clear and observable.
490
+ - (O) "pushing another person"
491
+ - (X) "interacting with another object"
492
+
493
+ ---
494
+ ## Output Format:
495
+ - Each labeled **{cat_name}** must have **exactly one line**.
496
+ - Format: `ID. {cat_name} + action-based description`
497
+ - (O) Example:
498
+ ```
499
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
500
+ 2. The person is pulling a baby carriage while smiling.
501
+ ```
502
+ - **Ensure each object is described individually.**
503
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
504
+
505
+ ---
506
+ ## Example:
507
+ If the frame has two labeled **bears**, your output should be:
508
+ ```
509
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
510
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
511
+ ```
512
+
513
+ ---
514
+ ## Additional Instructions:
515
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
516
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
517
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
518
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
519
+
520
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
521
+
522
+
523
+ MAX_RETRIES = 3
524
+ retry_count = 0
525
+
526
+ if should_caption:
527
+ while retry_count < MAX_RETRIES:
528
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
529
+
530
+ response2 = captioner.chat.completions.create(
531
+ model=model,
532
+ messages=[
533
+ {
534
+ "role": "user",
535
+ "content": [
536
+ {
537
+ "type": "text",
538
+ "text": selected_prompt,
539
+ },
540
+ {
541
+ "type": "image_url",
542
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
543
+ },
544
+ ],
545
+ }
546
+ ],
547
+ )
548
+
549
+ # caption = response2.choices[0].message.content
550
+ #print(f"{image_path} - {frame_name}: {caption}")
551
+
552
+ caption = response2.choices[0].message.content.strip()
553
+ caption_lower = caption.lower().lstrip()
554
+
555
+ if caption_lower.startswith("1.") and not any(
556
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
557
+ ):
558
+ break
559
+
560
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
561
+ retry_count += 1
562
+ time.sleep(2)
563
+
564
+ if retry_count == MAX_RETRIES:
565
+ caption = None
566
+ print("Max retries reached. Caption generation failed.")
567
+
568
+ else:
569
+ caption = None
570
+
571
+ image_captions[frame_name] = caption
572
+ all_captions[cat_name] = image_captions
573
+
574
+ # final : also prepare valid object ids
575
+ valid_obj_ids = dict()
576
+
577
+ for cat in cat_names:
578
+ if cat in ytvos_category_valid_list:
579
+ obj_id_cat = vid_meta['obj_id_cat']
580
+ valid_cat_ids = []
581
+ for obj_id in list(obj_id_cat.keys()):
582
+ if obj_id_cat[obj_id] == cat:
583
+ valid_cat_ids.append(obj_id)
584
+ valid_obj_ids[cat] = valid_cat_ids
585
+
586
+ return vid_id, all_captions, valid_obj_ids
587
+
588
+
589
+ if __name__ == '__main__':
590
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
591
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
592
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
593
+
594
+ args = parser.parse_args()
595
+
596
+ #================== Load data ===================
597
+ # Full dataset
598
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
599
+
600
+ # Metadata for the full dataset
601
+ metas = train_dataset.metas
602
+
603
+ # 8 color candidates (RGB format)
604
+ colors = [
605
+ (255, 0, 0), # Red
606
+ (0, 255, 0), # Green
607
+ (0, 0, 255), # Blue
608
+ (255, 255, 0), # Yellow
609
+ (255, 0, 255), # Magenta
610
+ (0, 255, 255), # Cyan
611
+ (128, 0, 128), # Purple
612
+ (255, 165, 0) # Orange
613
+ ]
614
+
615
+ ytvos_category_valid_list = [
616
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
617
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
618
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
619
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
620
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
621
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
622
+ ]
623
+
624
+ #================== Run GPT ===================
625
+ os.environ['OPENAI_API_KEY'] = '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'
626
+
627
+ result_captions = {}
628
+ result_valid_obj_ids = {}
629
+
630
+ for i in range(len(metas)):
631
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
632
+
633
+ if vid_id not in result_captions:
634
+ result_captions[vid_id] = all_captions
635
+ if vid_id not in result_valid_obj_ids:
636
+ result_valid_obj_ids[vid_id] = valid_obj_ids
637
+
638
+ print("Finished!", flush=True)
639
+
640
+ with open(args.save_caption_path, "w") as file:
641
+ json.dump(result_captions, file, indent=4)
642
+
643
+ with open(args.save_valid_obj_ids_path, "w") as file:
644
+ json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173350.py ADDED
@@ -0,0 +1,677 @@
 
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+ import requests
48
+ from openai import APIConnectionError, OpenAIError  # in openai>=1.0 these are exported from the package root (openai.error no longer exists)
49
+
50
+ def number_objects_and_encode_old(idx, color_mask=False):
51
+ encoded_frames = {}
52
+ contoured_frames = {} # New dictionary for original images
53
+ vid_cat_cnts = {}
54
+
55
+ vid_meta = metas[idx]
56
+ vid_data = train_dataset[idx]
57
+ vid_id = vid_meta['video']
58
+ frame_indx = vid_meta['sample_indx']
59
+ cat_names = set(vid_meta['obj_id_cat'].values())
60
+ imgs = vid_data[0]
61
+
62
+ for cat in cat_names:
63
+ cat_frames = []
64
+ contour_frames = []
65
+ frame_cat_cnts = {}
66
+
67
+ for i in range(imgs.size(0)):
68
+ frame_name = frame_indx[i]
69
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
71
+
72
+ frame_data = vid_data[2][frame_name]
73
+ obj_ids = list(frame_data.keys())
74
+
75
+ cat_cnt = 0
76
+
77
+ for j in range(len(obj_ids)):
78
+ obj_id = obj_ids[j]
79
+ obj_data = frame_data[obj_id]
80
+ obj_bbox = obj_data['bbox']
81
+ obj_valid = obj_data['valid']
82
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
83
+ obj_cat = obj_data['category_name']
84
+
85
+ if obj_cat == cat and obj_valid:
86
+ cat_cnt += 1
87
+
88
+ if color_mask == False:
89
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
90
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
91
+ for i, contour in enumerate(contours):
92
+ moments = cv2.moments(contour)
93
+ if moments["m00"] != 0:
94
+ cx = int(moments["m10"] / moments["m00"])
95
+ cy = int(moments["m01"] / moments["m00"])
96
+ else:
97
+ cx, cy = contour[0][0]
98
+
99
+ font = cv2.FONT_HERSHEY_SIMPLEX
100
+ text = obj_id
101
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
102
+ text_w, text_h = text_size
103
+
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1
140
+ text_y = center_y
141
+
142
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
143
+ rect_end = (text_x + text_size[0] + 5, text_y)
144
+
145
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
146
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
147
+
148
+ # plt.figure(figsize=(12, 8))
149
+ # plt.imshow(frame)
150
+ # plt.title(f"frame {frame_name}")
151
+ # plt.tight_layout()
152
+ # plt.axis('off')
153
+ # plt.show()
154
+
155
+ buffer = BytesIO()
156
+ frame = Image.fromarray(frame)
157
+ frame.save(buffer, format='jpeg')
158
+ buffer.seek(0)
159
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
160
+ frame_cat_cnts[frame_name] = cat_cnt
161
+
162
+ buffer.seek(0) # Reuse buffer instead of creating a new one
163
+ buffer.truncate()
164
+ frame_for_contour = Image.fromarray(frame_for_contour)
165
+ frame_for_contour.save(buffer, format='jpeg')
166
+ buffer.seek(0)
167
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
168
+
169
+ encoded_frames[cat] = cat_frames
170
+ contoured_frames[cat] = contour_frames
171
+ vid_cat_cnts[cat] = frame_cat_cnts
172
+
173
+ return encoded_frames, contoured_frames, vid_cat_cnts
174
+
175
+
176
+ def number_objects_and_encode(idx, color_mask=False):
177
+ encoded_frames = {}
178
+ contoured_frames = {} # New dictionary for original images
179
+ vid_cat_cnts = {}
180
+
181
+ vid_meta = metas[idx]
182
+ vid_data = train_dataset[idx]
183
+ vid_id = vid_meta['video']
184
+ frame_indx = vid_meta['sample_indx']
185
+ cat_names = set(vid_meta['obj_id_cat'].values())
186
+ imgs = vid_data[0]
187
+
188
+ for cat in cat_names:
189
+ cat_frames = []
190
+ contour_frames = []
191
+ frame_cat_cnts = {}
192
+
193
+ for i in range(imgs.size(0)):
194
+ frame_name = frame_indx[i]
195
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
197
+
198
+ frame_data = vid_data[2][frame_name]
199
+ obj_ids = list(frame_data.keys())
200
+
201
+ cat_cnt = 0
202
+
203
+ for j in range(len(obj_ids)):
204
+ obj_id = obj_ids[j]
205
+ obj_data = frame_data[obj_id]
206
+ obj_bbox = obj_data['bbox']
207
+ obj_valid = obj_data['valid']
208
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
209
+ obj_cat = obj_data['category_name']
210
+
211
+ if obj_cat == cat and obj_valid:
212
+ cat_cnt += 1
213
+
214
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
215
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
216
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
217
+
218
+ if len(contours) > 0:
219
+ largest_contour = max(contours, key=cv2.contourArea)
220
+ M = cv2.moments(largest_contour)
221
+ if M["m00"] != 0:
222
+ center_x = int(M["m10"] / M["m00"])
223
+ center_y = int(M["m01"] / M["m00"])
224
+ else:
225
+ center_x, center_y = 0, 0
226
+
227
+ font = cv2.FONT_HERSHEY_SIMPLEX
228
+ text = obj_id
229
+ font_scale = 1.2
230
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
231
+ text_x = center_x - text_size[0] // 1
232
+ text_y = center_y
233
+
234
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
235
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
236
+
237
+ contour_thickness = 1
238
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
239
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
240
+
241
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
242
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
243
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
244
+
245
+
246
+ if color_mask:
247
+ alpha = 0.08
248
+ colored_obj_mask = np.zeros_like(frame)
249
+ colored_obj_mask[obj_mask == 1] = colors[j]
250
+ frame[obj_mask == 1] = (
251
+ (1 - alpha) * frame[obj_mask == 1]
252
+ + alpha * colored_obj_mask[obj_mask == 1]
253
+ )
254
+
255
+ # plt.figure(figsize=(12, 8))
256
+ # plt.imshow(frame)
257
+ # plt.title(f"frame {frame_name}")
258
+ # plt.tight_layout()
259
+ # plt.axis('off')
260
+ # plt.show()
261
+
262
+ buffer = BytesIO()
263
+ frame = Image.fromarray(frame)
264
+ frame.save(buffer, format='jpeg')
265
+ buffer.seek(0)
266
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
267
+ frame_cat_cnts[frame_name] = cat_cnt
268
+
269
+ buffer.seek(0) # Reuse buffer instead of creating a new one
270
+ buffer.truncate()
271
+ frame_for_contour = Image.fromarray(frame_for_contour)
272
+ frame_for_contour.save(buffer, format='jpeg')
273
+ buffer.seek(0)
274
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
275
+
276
+ encoded_frames[cat] = cat_frames
277
+ contoured_frames[cat] = contour_frames
278
+ vid_cat_cnts[cat] = frame_cat_cnts
279
+
280
+ return encoded_frames, contoured_frames, vid_cat_cnts
281
+
282
+
283
+
284
+ def getCaption(idx, model='gpt-4o'):
285
+ vid_meta = metas[idx]
286
+ vid_data = train_dataset[idx]
287
+ vid_id = vid_meta['video']
288
+ print(f"vid id: {vid_id}\n")
289
+
290
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
291
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
292
+ all_captions = dict()
293
+
294
+ # color_mask = random.choice([True, False])
295
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
296
+
297
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
298
+ #marked = "mask with boundary" if color_mask else "boundary"
299
+
300
+ for cat_name in list(cat_names) :
301
+
302
+ is_movable = False
303
+ if cat_name in ytvos_category_valid_list :
304
+ is_movable = True
305
+
306
+ if not is_movable:
307
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
308
+
309
+
310
+ image_captions = {}
311
+ captioner = OpenAI()
312
+ cat_base64_frames = base64_frames[cat_name]
313
+ # cont_base64_frames = contoured_frames[cat_name]
314
+
315
+ for i in range(len(cat_base64_frames)):
316
+ frame_name = frame_indx[i]
317
+ # cont_base64_image = cont_base64_frames[i]
318
+ base64_image = cat_base64_frames[i]
319
+ should_filter = False
320
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
321
+
322
+ if frame_cat_cnts >= 2:
323
+ should_filter = True
324
+ else:
325
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
326
+
327
+
328
+ if is_movable and should_filter:
329
+ #Step 1: filtering
330
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
331
+ caption_filter_text = f"""
332
+ You are a visual assistant analyzing a single frame from a video.
333
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
334
+
335
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
336
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
337
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
338
+
339
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
340
+
341
+ - Respond with "YES" if:
342
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
343
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
344
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
345
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
346
+
347
+ - Respond with "NONE" if:
348
+ 1) The actions or pose are not clearly differentiable or too similar.
349
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
350
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
351
+
352
+ Answer strictly with either "YES" or "NONE".
353
+ """
354
+
355
+ response1 = captioner.chat.completions.create(
356
+ model=model,
357
+ messages=[
358
+ {
359
+ "role": "user",
360
+ "content": [
361
+ {
362
+ "type": "text",
363
+ "text": caption_filter_text,
364
+ },
365
+ {
366
+ "type": "image_url",
367
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
368
+ }
369
+ ],
370
+ }
371
+ ],
372
+ )
373
+ response_content = response1.choices[0].message.content
374
+ should_caption = True if "yes" in response_content.lower() else False
375
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
376
+
377
+ else:
378
+ should_caption = False
379
+
380
+ #Step 2: build dense captions
381
+ dense_caption_prompt_1 = f"""
382
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
383
+
384
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
385
+
386
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
387
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
388
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
389
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
390
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
391
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
392
+ - expressions like 'seems to be', 'appears to be' are BANNED!
393
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
394
+ 8. Include interactions with objects or other entities when they are prominent and observable.
395
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
396
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
397
+ 11. Do not mention object IDs.
398
+ 12. Use '{cat_name}' as the noun for the referring expressions.
399
+
400
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
401
+
402
+ - Your answer should contain details, and follow the following format:
403
+ object id. action-oriented description
404
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
405
+ 2. a person bending over and touching his boots to tie the shoelace.)
406
+ - for action-oriented description, use {cat_name} as subject noun
407
+
408
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
409
+ Please pay attention to the categories of these objects and don’t change them.
410
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
411
+ Output referring expressions for each object id. Please start your answer:"""
412
+
413
+
414
+ dense_caption_prompt_2 = f"""
415
+ You are an advanced visual language model analyzing a video frame.
416
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
417
+
418
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
419
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
420
+
421
+ ---
422
+ ## Key Guidelines:
423
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
424
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
425
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
426
+
427
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
428
+ - (X) "A large brown bear standing on the left"
429
+ - (O) "The bear is lifting its front paws and swiping forward."
430
+
431
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
432
+ - (O) "The giraffe is tilting its head and sniffing the ground."
433
+ - (X) "The giraffe is near a tree and looking around."
434
+
435
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
436
+ - (X) "The person seems excited" / "The person might be preparing to jump."
437
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
438
+
439
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
440
+ - expressions like 'seems to be', 'appears to be' are BANNED!
441
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
442
+
443
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
444
+ - **Each object should have a unique, descriptive action.**
445
+ - (X) "Two dogs are running."
446
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
447
+ 2. The other dog is looking back while speeding up."
448
+
449
+ ---
450
+ ## Output Format:
451
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
452
+ - Format: `ID. {cat_name} + action-based description`
453
+ - (O) Example:
454
+ ```
455
+ 1. The person is leaning forward while opening a bag with both hands.
456
+ 2. The person is holding onto a rope and pulling themselves up.
457
+ ```
458
+ - **Ensure that each object is described individually.**
459
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
460
+
461
+ ---
462
+ ## Additional Instructions:
463
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
464
+ - **Do NOT** mention object IDs in the description (only use the provided format).
465
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
466
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
467
+
468
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
469
+ """
470
+
471
+
472
+ dense_caption_prompt = f"""
473
+ You are a visual assistant analyzing a single frame of a video.
474
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
475
+
476
+ I am building an **action-centric referring expression** dataset.
477
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
478
+
479
+ ---
480
+ ## Guidelines:
481
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
482
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
483
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
484
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
485
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
486
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
487
+ 7. Base your descriptions on these principles:
488
+ - **Avoid words like 'minimal' or 'slightly'.**
489
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
490
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
491
+ - **Specify actions with other objects or entities** only when they are clear and observable.
492
+ - (O) "pushing another person"
493
+ - (X) "interacting with another object"
494
+
495
+ ---
496
+ ## Output Format:
497
+ - Each labeled **{cat_name}** must have **exactly one line**.
498
+ - Format: `ID. {cat_name} + action-based description`
499
+ - (O) Example:
500
+ ```
501
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
502
+ 2. The person is pulling a baby carriage while smiling.
503
+ ```
504
+ - **Ensure each object is described individually.**
505
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
506
+
507
+ ---
508
+ ## Example:
509
+ If the frame has two labeled **bears**, your output should be:
510
+ ```
511
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
512
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
513
+ ```
514
+
515
+ ---
516
+ ## Additional Instructions:
517
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
518
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
519
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
520
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
521
+
522
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
523
+
524
+
525
+ MAX_RETRIES = 3
526
+ retry_count = 0
527
+
528
+ if should_caption:
529
+ while retry_count < MAX_RETRIES:
530
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
531
+
532
+ response2 = captioner.chat.completions.create(
533
+ model=model,
534
+ messages=[
535
+ {
536
+ "role": "user",
537
+ "content": [
538
+ {
539
+ "type": "text",
540
+ "text": selected_prompt,
541
+ },
542
+ {
543
+ "type": "image_url",
544
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
545
+ },
546
+ ],
547
+ }
548
+ ],
549
+ )
550
+
551
+ # caption = response2.choices[0].message.content
552
+ #print(f"{image_path} - {frame_name}: {caption}")
553
+
554
+ caption = response2.choices[0].message.content.strip()
555
+ caption_lower = caption.lower().lstrip()
556
+
557
+ if caption_lower.startswith("1.") and not any(
558
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
559
+ ):
560
+ break
561
+
562
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
563
+ retry_count += 1
564
+ time.sleep(2)
565
+
566
+ if retry_count == MAX_RETRIES:
567
+ caption = None
568
+ print("Max retries reached. Caption generation failed.")
569
+
570
+ else:
571
+ caption = None
572
+
573
+ image_captions[frame_name] = caption
574
+ all_captions[cat_name] = image_captions
575
+
576
+ # final : also prepare valid object ids
577
+ valid_obj_ids = dict()
578
+
579
+ for cat in cat_names:
580
+ if cat in ytvos_category_valid_list:
581
+ obj_id_cat = vid_meta['obj_id_cat']
582
+ valid_cat_ids = []
583
+ for obj_id in list(obj_id_cat.keys()):
584
+ if obj_id_cat[obj_id] == cat:
585
+ valid_cat_ids.append(obj_id)
586
+ valid_obj_ids[cat] = valid_cat_ids
587
+
588
+ return vid_id, all_captions, valid_obj_ids
589
+
590
+
591
+ if __name__ == '__main__':
592
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
593
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
594
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
595
+
596
+ args = parser.parse_args()
597
+
598
+ #==================load data===================
599
+ # full dataset
600
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
601
+
602
+ # metadata for the full dataset
603
+ metas = train_dataset.metas
604
+
605
+ # 8 candidate colors (RGB format)
606
+ colors = [
607
+ (255, 0, 0), # Red
608
+ (0, 255, 0), # Green
609
+ (0, 0, 255), # Blue
610
+ (255, 255, 0), # Yellow
611
+ (255, 0, 255), # Magenta
612
+ (0, 255, 255), # Cyan
613
+ (128, 0, 128), # Purple
614
+ (255, 165, 0) # Orange
615
+ ]
616
+
617
+ ytvos_category_valid_list = [
618
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
619
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
620
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
621
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
622
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
623
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
624
+ ]
625
+
626
+ #==================run gpt===================
627
+ os.environ['OPENAI_API_KEY'] = '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'
628
+
629
+ result_captions = {}
630
+ result_valid_obj_ids = {}
631
+
632
+ for i in range(len(metas)):
633
+ try:
634
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
635
+
636
+ if vid_id not in result_captions:
637
+ result_captions[vid_id] = all_captions
638
+ if vid_id not in result_valid_obj_ids:
639
+ result_valid_obj_ids[vid_id] = valid_obj_ids
640
+
641
+ except (requests.exceptions.ConnectionError, APIConnectionError) as e:
642
+ print(f"created caption until {i-1}", flush=True)
643
+ print("인터넷 연결 문제로 요청을 처리할 수 없습니다:", e, flush=True)
644
+
645
+ with open(args.save_caption_path, "w") as file:
646
+ json.dump(result_captions, file, indent=4)
647
+
648
+ with open(args.save_valid_obj_ids_path, "w") as file:
649
+ json.dump(result_valid_obj_ids, file, indent=4)
650
+
651
+ except OpenAIError as e:
652
+ print(f"created caption until {i-1}", flush=True)
653
+ print("OpenAI API 관련 오류가 발생했습니다:", e, flush=True)
654
+
655
+ with open(args.save_caption_path, "w") as file:
656
+ json.dump(result_captions, file, indent=4)
657
+
658
+ with open(args.save_valid_obj_ids_path, "w") as file:
659
+ json.dump(result_valid_obj_ids, file, indent=4)
660
+
661
+ except Exception as e:
662
+ print(f"created caption until {i-1}", flush=True)
663
+ print("알 수 없는 오류 발생:", e, flush=True)
664
+
665
+ with open(args.save_caption_path, "w") as file:
666
+ json.dump(result_captions, file, indent=4)
667
+
668
+ with open(args.save_valid_obj_ids_path, "w") as file:
669
+ json.dump(result_valid_obj_ids, file, indent=4)
670
+
671
+ print("Finished!", flush=True)
672
+
673
+ with open(args.save_caption_path, "w") as file:
674
+ json.dump(result_captions, file, indent=4)
675
+
676
+ with open(args.save_valid_obj_ids_path, "w") as file:
677
+ json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207184812.py ADDED
@@ -0,0 +1,676 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI, APIConnectionError, OpenAIError
45
+ import base64
46
+ import json
47
+ import requests
48
+
49
+ def number_objects_and_encode_old(idx, color_mask=False):
50
+ encoded_frames = {}
51
+ contoured_frames = {} # New dictionary for original images
52
+ vid_cat_cnts = {}
53
+
54
+ vid_meta = metas[idx]
55
+ vid_data = train_dataset[idx]
56
+ vid_id = vid_meta['video']
57
+ frame_indx = vid_meta['sample_indx']
58
+ cat_names = set(vid_meta['obj_id_cat'].values())
59
+ imgs = vid_data[0]
60
+
61
+ for cat in cat_names:
62
+ cat_frames = []
63
+ contour_frames = []
64
+ frame_cat_cnts = {}
65
+
66
+ for i in range(imgs.size(0)):
67
+ frame_name = frame_indx[i]
68
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+
71
+ frame_data = vid_data[2][frame_name]
72
+ obj_ids = list(frame_data.keys())
73
+
74
+ cat_cnt = 0
75
+
76
+ for j in range(len(obj_ids)):
77
+ obj_id = obj_ids[j]
78
+ obj_data = frame_data[obj_id]
79
+ obj_bbox = obj_data['bbox']
80
+ obj_valid = obj_data['valid']
81
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
82
+ obj_cat = obj_data['category_name']
83
+
84
+ if obj_cat == cat and obj_valid:
85
+ cat_cnt += 1
86
+
87
+ if color_mask == False:
88
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
89
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
90
+ for i, contour in enumerate(contours):
91
+ moments = cv2.moments(contour)
92
+ if moments["m00"] != 0:
93
+ cx = int(moments["m10"] / moments["m00"])
94
+ cy = int(moments["m01"] / moments["m00"])
95
+ else:
96
+ cx, cy = contour[0][0]
97
+
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
104
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
105
+
106
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
107
+ font, 1, (255, 255, 255), 2)
108
+
109
+ else:
110
+ alpha = 0.08
111
+
112
+ colored_obj_mask = np.zeros_like(frame)
113
+ colored_obj_mask[obj_mask == 1] = colors[j]
114
+ frame[obj_mask == 1] = (
115
+ (1 - alpha) * frame[obj_mask == 1]
116
+ + alpha * colored_obj_mask[obj_mask == 1]
117
+ )
118
+
119
+
120
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
121
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
122
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
123
+
124
+ if len(contours) > 0:
125
+ largest_contour = max(contours, key=cv2.contourArea)
126
+ M = cv2.moments(largest_contour)
127
+ if M["m00"] != 0:
128
+ center_x = int(M["m10"] / M["m00"])
129
+ center_y = int(M["m01"] / M["m00"])
130
+ else:
131
+ center_x, center_y = 0, 0
132
+
133
+ font = cv2.FONT_HERSHEY_SIMPLEX
134
+ text = obj_id
135
+
136
+ font_scale = 0.9
137
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
138
+ text_x = center_x - text_size[0] // 1
139
+ text_y = center_y
140
+
141
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
142
+ rect_end = (text_x + text_size[0] + 5, text_y)
143
+
144
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
145
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
146
+
147
+ # plt.figure(figsize=(12, 8))
148
+ # plt.imshow(frame)
149
+ # plt.title(f"frame {frame_name}")
150
+ # plt.tight_layout()
151
+ # plt.axis('off')
152
+ # plt.show()
153
+
154
+ buffer = BytesIO()
155
+ frame = Image.fromarray(frame)
156
+ frame.save(buffer, format='jpeg')
157
+ buffer.seek(0)
158
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
159
+ frame_cat_cnts[frame_name] = cat_cnt
160
+
161
+ buffer.seek(0) # Reuse buffer instead of creating a new one
162
+ buffer.truncate()
163
+ frame_for_contour = Image.fromarray(frame_for_contour)
164
+ frame_for_contour.save(buffer, format='jpeg')
165
+ buffer.seek(0)
166
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
167
+
168
+ encoded_frames[cat] = cat_frames
169
+ contoured_frames[cat] = contour_frames
170
+ vid_cat_cnts[cat] = frame_cat_cnts
171
+
172
+ return encoded_frames, contoured_frames, vid_cat_cnts
173
+
174
+
175
+ def number_objects_and_encode(idx, color_mask=False):
176
+ encoded_frames = {}
177
+ contoured_frames = {} # New dictionary for original images
178
+ vid_cat_cnts = {}
179
+
180
+ vid_meta = metas[idx]
181
+ vid_data = train_dataset[idx]
182
+ vid_id = vid_meta['video']
183
+ frame_indx = vid_meta['sample_indx']
184
+ cat_names = set(vid_meta['obj_id_cat'].values())
185
+ imgs = vid_data[0]
186
+
187
+ for cat in cat_names:
188
+ cat_frames = []
189
+ contour_frames = []
190
+ frame_cat_cnts = {}
191
+
192
+ for i in range(imgs.size(0)):
193
+ frame_name = frame_indx[i]
194
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+
197
+ frame_data = vid_data[2][frame_name]
198
+ obj_ids = list(frame_data.keys())
199
+
200
+ cat_cnt = 0
201
+
202
+ for j in range(len(obj_ids)):
203
+ obj_id = obj_ids[j]
204
+ obj_data = frame_data[obj_id]
205
+ obj_bbox = obj_data['bbox']
206
+ obj_valid = obj_data['valid']
207
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
208
+ obj_cat = obj_data['category_name']
209
+
210
+ if obj_cat == cat and obj_valid:
211
+ cat_cnt += 1
212
+
213
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
214
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
215
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
216
+
217
+ if len(contours) > 0:
218
+ largest_contour = max(contours, key=cv2.contourArea)
219
+ M = cv2.moments(largest_contour)
220
+ if M["m00"] != 0:
221
+ center_x = int(M["m10"] / M["m00"])
222
+ center_y = int(M["m01"] / M["m00"])
223
+ else:
224
+ center_x, center_y = 0, 0
225
+
226
+ font = cv2.FONT_HERSHEY_SIMPLEX
227
+ text = obj_id
228
+ font_scale = 1.2
229
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
230
+ text_x = center_x - text_size[0] // 1
231
+ text_y = center_y
232
+
233
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
234
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
235
+
236
+ contour_thickness = 1
237
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
238
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
239
+
240
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
241
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
242
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
243
+
244
+
245
+ if color_mask:
246
+ alpha = 0.08
247
+ colored_obj_mask = np.zeros_like(frame)
248
+ colored_obj_mask[obj_mask == 1] = colors[j]
249
+ frame[obj_mask == 1] = (
250
+ (1 - alpha) * frame[obj_mask == 1]
251
+ + alpha * colored_obj_mask[obj_mask == 1]
252
+ )
253
+
254
+ # plt.figure(figsize=(12, 8))
255
+ # plt.imshow(frame)
256
+ # plt.title(f"frame {frame_name}")
257
+ # plt.tight_layout()
258
+ # plt.axis('off')
259
+ # plt.show()
260
+
261
+ buffer = BytesIO()
262
+ frame = Image.fromarray(frame)
263
+ frame.save(buffer, format='jpeg')
264
+ buffer.seek(0)
265
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
266
+ frame_cat_cnts[frame_name] = cat_cnt
267
+
268
+ buffer.seek(0) # Reuse buffer instead of creating a new one
269
+ buffer.truncate()
270
+ frame_for_contour = Image.fromarray(frame_for_contour)
271
+ frame_for_contour.save(buffer, format='jpeg')
272
+ buffer.seek(0)
273
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
274
+
275
+ encoded_frames[cat] = cat_frames
276
+ contoured_frames[cat] = contour_frames
277
+ vid_cat_cnts[cat] = frame_cat_cnts
278
+
279
+ return encoded_frames, contoured_frames, vid_cat_cnts
280
+
281
+
282
+
283
+ def getCaption(idx, model='gpt-4o'):
284
+ vid_meta = metas[idx]
285
+ vid_data = train_dataset[idx]
286
+ vid_id = vid_meta['video']
287
+ print(f"vid id: {vid_id}\n")
288
+
289
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
290
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
291
+ all_captions = dict()
292
+
293
+ # color_mask = random.choice([True, False])
294
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
295
+
296
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
297
+ #marked = "mask with boundary" if color_mask else "boundary"
298
+
299
+ for cat_name in list(cat_names) :
300
+
301
+ is_movable = False
302
+ if cat_name in ytvos_category_valid_list :
303
+ is_movable = True
304
+
305
+ if not is_movable:
306
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
307
+
308
+
309
+ image_captions = {}
310
+ captioner = OpenAI()
311
+ cat_base64_frames = base64_frames[cat_name]
312
+ # cont_base64_frames = contoured_frames[cat_name]
313
+
314
+ for i in range(len(cat_base64_frames)):
315
+ frame_name = frame_indx[i]
316
+ # cont_base64_image = cont_base64_frames[i]
317
+ base64_image = cat_base64_frames[i]
318
+ should_filter = False
319
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
320
+
321
+ if frame_cat_cnts >= 2:
322
+ should_filter = True
323
+ else:
324
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
325
+
326
+
327
+ if is_movable and should_filter:
328
+ #Step 1: filtering
329
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
330
+ caption_filter_text = f"""
331
+ You are a visual assistant analyzing a single frame from a video.
332
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
333
+
334
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
335
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
336
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
337
+
338
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
339
+
340
+ - Respond with "YES" if:
341
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
342
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
343
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
344
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
345
+
346
+ - Respond with "NONE" if:
347
+ 1) The actions or pose are not clearly differentiable or too similar.
348
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
349
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
350
+
351
+ Answer strictly with either "YES" or "NONE".
352
+ """
353
+
354
+ response1 = captioner.chat.completions.create(
355
+ model=model,
356
+ messages=[
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {
361
+ "type": "text",
362
+ "text": caption_filter_text,
363
+ },
364
+ {
365
+ "type": "image_url",
366
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
367
+ }
368
+ ],
369
+ }
370
+ ],
371
+ )
372
+ response_content = response1.choices[0].message.content
373
+ should_caption = True if "yes" in response_content.lower() else False
374
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
375
+
376
+ else:
377
+ should_caption = False
378
+
379
+ #Step 2: build dense captions
380
+ dense_caption_prompt_1 = f"""
381
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
382
+
383
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
384
+
385
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
386
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
387
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
388
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
389
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
390
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
391
+ - expressions like 'seems to be', 'appears to be' are BANNED!
392
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
393
+ 8. Include interactions with objects or other entities when they are prominent and observable.
394
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
395
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
396
+ 11. Do not mention object IDs.
397
+ 12. Use '{cat_name}' as the noun for the referring expressions.
398
+
399
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
400
+
401
+ - Your answer should contain details, and follow the following format:
402
+ object id. action-oriented description
403
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
404
+ 2. a person bending over and touching his boots to tie the shoelace.)
405
+ - for action-oriented description, use {cat_name} as subject noun
406
+
407
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
408
+ Please pay attention to the categories of these objects and don’t change them.
409
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
410
+ Output referring expressions for each object id. Please start your answer:"""
411
+
412
+
413
+ dense_caption_prompt_2 = f"""
414
+ You are an advanced visual language model analyzing a video frame.
415
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
416
+
417
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
418
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
419
+
420
+ ---
421
+ ## Key Guidelines:
422
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
423
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
424
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
425
+
426
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
427
+ - (X) "A large brown bear standing on the left"
428
+ - (O) "The bear is lifting its front paws and swiping forward."
429
+
430
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
431
+ - (O) "The giraffe is tilting its head and sniffing the ground."
432
+ - (X) "The giraffe is near a tree and looking around."
433
+
434
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
435
+ - (X) "The person seems excited" / "The person might be preparing to jump."
436
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
437
+
438
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
439
+ - expressions like 'seems to be', 'appears to be' are BANNED!
440
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
441
+
442
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
443
+ - **Each object should have a unique, descriptive action.**
444
+ - (X) "Two dogs are running."
445
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
446
+ 2. The other dog is looking back while speeding up."
447
+
448
+ ---
449
+ ## Output Format:
450
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
451
+ - Format: `ID. {cat_name} + action-based description`
452
+ - (O) Example:
453
+ ```
454
+ 1. The person is leaning forward while opening a bag with both hands.
455
+ 2. The person is holding onto a rope and pulling themselves up.
456
+ ```
457
+ - **Ensure that each object is described individually.**
458
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
459
+
460
+ ---
461
+ ## Additional Instructions:
462
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
463
+ - **Do NOT** mention object IDs in the description (only use the provided format).
464
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
465
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
466
+
467
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
468
+ """
469
+
470
+
471
+ dense_caption_prompt = f"""
472
+ You are a visual assistant analyzing a single frame of a video.
473
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
474
+
475
+ I am building an **action-centric referring expression** dataset.
476
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
477
+
478
+ ---
479
+ ## Guidelines:
480
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
481
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
482
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
483
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
484
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
485
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
486
+ 7. Base your descriptions on these principles:
487
+ - **Avoid words like 'minimal' or 'slightly'.**
488
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
489
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
490
+ - **Specify actions with other objects or entities** only when they are clear and observable.
491
+ - (O) "pushing another person"
492
+ - (X) "interacting with another object"
493
+
494
+ ---
495
+ ## Output Format:
496
+ - Each labeled **{cat_name}** must have **exactly one line**.
497
+ - Format: `ID. {cat_name} + action-based description`
498
+ - (O) Example:
499
+ ```
500
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
501
+ 2. The person is pulling a baby carriage while smiling.
502
+ ```
503
+ - **Ensure each object is described individually.**
504
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
505
+
506
+ ---
507
+ ## Example:
508
+ If the frame has two labeled **bears**, your output should be:
509
+ ```
510
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
511
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
512
+ ```
513
+
514
+ ---
515
+ ## Additional Instructions:
516
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
517
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
518
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
519
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
520
+
521
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
522
+
523
+
524
+ MAX_RETRIES = 3
525
+ retry_count = 0
526
+
527
+ if should_caption:
528
+ while retry_count < MAX_RETRIES:
529
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
530
+
531
+ response2 = captioner.chat.completions.create(
532
+ model=model,
533
+ messages=[
534
+ {
535
+ "role": "user",
536
+ "content": [
537
+ {
538
+ "type": "text",
539
+ "text": selected_prompt,
540
+ },
541
+ {
542
+ "type": "image_url",
543
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
544
+ },
545
+ ],
546
+ }
547
+ ],
548
+ )
549
+
550
+ # caption = response2.choices[0].message.content
551
+ #print(f"{image_path} - {frame_name}: {caption}")
552
+
553
+ caption = response2.choices[0].message.content.strip()
554
+ caption_lower = caption.lower().lstrip()
555
+
556
+ if caption_lower.startswith("1.") and not any(
557
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
558
+ ):
559
+ break
560
+
561
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
562
+ retry_count += 1
563
+ time.sleep(2)
564
+
565
+ if retry_count == MAX_RETRIES:
566
+ caption = None
567
+ print("Max retries reached. Caption generation failed.")
568
+
569
+ else:
570
+ caption = None
571
+
572
+ image_captions[frame_name] = caption
573
+ all_captions[cat_name] = image_captions
574
+
575
+ # final : also prepare valid object ids
576
+ valid_obj_ids = dict()
577
+
578
+ for cat in cat_names:
579
+ if cat in ytvos_category_valid_list:
580
+ obj_id_cat = vid_meta['obj_id_cat']
581
+ valid_cat_ids = []
582
+ for obj_id in list(obj_id_cat.keys()):
583
+ if obj_id_cat[obj_id] == cat:
584
+ valid_cat_ids.append(obj_id)
585
+ valid_obj_ids[cat] = valid_cat_ids
586
+
587
+ return vid_id, all_captions, valid_obj_ids
588
+
589
+
590
+ if __name__ == '__main__':
591
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
592
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
593
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
594
+
595
+ args = parser.parse_args()
596
+
597
+ #==================load data===================
598
+ # full dataset
599
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
600
+
601
+ # metadata for the full dataset
602
+ metas = train_dataset.metas
603
+
604
+ # 8 candidate colors (RGB format)
605
+ colors = [
606
+ (255, 0, 0), # Red
607
+ (0, 255, 0), # Green
608
+ (0, 0, 255), # Blue
609
+ (255, 255, 0), # Yellow
610
+ (255, 0, 255), # Magenta
611
+ (0, 255, 255), # Cyan
612
+ (128, 0, 128), # Purple
613
+ (255, 165, 0) # Orange
614
+ ]
615
+
616
+ ytvos_category_valid_list = [
617
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
618
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
619
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
620
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
621
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
622
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
623
+ ]
624
+
625
+ #==================run gpt===================
626
+ os.environ['OPENAI_API_KEY'] = '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'
627
+
628
+ result_captions = {}
629
+ result_valid_obj_ids = {}
630
+
631
+ for i in range(len(metas)):
632
+ try:
633
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
634
+
635
+ if vid_id not in result_captions:
636
+ result_captions[vid_id] = all_captions
637
+ if vid_id not in result_valid_obj_ids:
638
+ result_valid_obj_ids[vid_id] = valid_obj_ids
639
+
640
+ except (requests.exceptions.ConnectionError, APIConnectionError) as e:
641
+ print(f"created caption until {i-1}", flush=True)
642
+ print("인터넷 연결 문제로 요청을 처리할 수 없습니다:", e, flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ except OpenAIError as e:
651
+ print(f"created caption until {i-1}", flush=True)
652
+ print("OpenAI API 관련 오류가 발생했습니다:", e, flush=True)
653
+
654
+ with open(args.save_caption_path, "w") as file:
655
+ json.dump(result_captions, file, indent=4)
656
+
657
+ with open(args.save_valid_obj_ids_path, "w") as file:
658
+ json.dump(result_valid_obj_ids, file, indent=4)
659
+
660
+ except Exception as e:
661
+ print(f"created caption until {i-1}", flush=True)
662
+ print("알 수 없는 오류 발생:", e, flush=True)
663
+
664
+ with open(args.save_caption_path, "w") as file:
665
+ json.dump(result_captions, file, indent=4)
666
+
667
+ with open(args.save_valid_obj_ids_path, "w") as file:
668
+ json.dump(result_valid_obj_ids, file, indent=4)
669
+
670
+ print("Finished!", flush=True)
671
+
672
+ with open(args.save_caption_path, "w") as file:
673
+ json.dump(result_captions, file, indent=4)
674
+
675
+ with open(args.save_valid_obj_ids_path, "w") as file:
676
+ json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/make_ref-ytvos_json_20250113183250.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #==================build JSON===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialization
40
+ data_idx = 0
41
+
42
+ while data_idx < 10:
43
+
44
+ # process one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
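+ # Rough shape of the dictionary returned by this early version (sketch inferred from
+ # the loop above): {video_id: {"bins": [...], "annotations": [{obj_id: {"category_name",
+ # "bbox"}}, ...], "frame_names": [...], "video_path": ...}}.
+ # Here "bbox" is still a torch tensor, which json.dumps cannot serialise; the later
+ # revisions of this script convert it with .tolist().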
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #==================load data===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #==================build JSON===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113183335.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #==================build JSON===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialization
40
+ data_idx = 0
41
+
42
+ while data_idx < 10:
43
+
44
+ # process one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].numpy()
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #==================load data===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #==================build JSON===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113183413.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #==================build JSON===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialization
40
+ data_idx = 0
41
+
42
+ while data_idx < 10:
43
+
44
+ # process one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #==================load data===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #==================build JSON===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113195227.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #==================build JSON===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialization
40
+ data_idx = 0
41
+ print(len(train_dataset), len(metas))
42
+ while data_idx < len(train_dataset):
43
+
44
+ # process one video
45
+ video_data = {}
46
+ video_id = metas[data_idx]['video']
47
+ video_data['bins'] = metas[data_idx]['bins']
48
+ annotation_data = []
49
+ frame_names = []
50
+
51
+ while metas[data_idx]['video'] == video_id:
52
+
53
+ obj_id = metas[data_idx]['obj_id']
54
+ sample_id = metas[data_idx]['sample_id']
55
+ sample_frames_id = metas[data_idx]['sample_frames_id']
56
+ sample_frame_idx = sample_frames_id.index(sample_id)
57
+
58
+ frames = metas[data_idx]['frames']
59
+
60
+ frame_name = frames[sample_id]
61
+ cat_name = metas[data_idx]['category']
62
+
63
+ bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
64
+
65
+ obj_data = {obj_id: {
66
+ "category_name" : cat_name,
67
+ "bbox": bbox
68
+ }}
69
+
70
+
71
+ annotation_data.append(obj_data)
72
+
73
+ frame_names.append(frame_name)
74
+
75
+ data_idx += 1
76
+
77
+ video_data['annotations'] = annotation_data
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+
81
+ entire_json[video_id] = video_data
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #==================load data===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #==================build JSON===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250116140938.py ADDED
@@ -0,0 +1,103 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+
10
+ from pathlib import Path
11
+ import io
12
+
13
+ import numpy as np
14
+ import pandas as pd
15
+ import regex as re
16
+ import json
17
+
18
+ import cv2
19
+ from PIL import Image, ImageDraw
20
+ import torch
21
+ from torchvision.transforms import functional as F
22
+
23
+ from skimage import measure # (pip install scikit-image)
24
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
25
+
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as patches
28
+ from matplotlib.collections import PatchCollection
29
+ from matplotlib.patches import Rectangle
30
+
31
+
32
+ import ipywidgets as widgets
33
+ from IPython.display import display, clear_output
34
+
35
+ #==================build JSON===================
36
+ def createJson(train_dataset, metas):
37
+ entire_json = {}
38
+
39
+ # initialization
40
+ vid_idx = 0
41
+
42
+ while vid_idx < len(train_dataset):
43
+
44
+ # process one video
45
+ video_data = {}
46
+ video_train_frames, video_train_info = train_dataset[vid_idx]
47
+ video_meta = metas[vid_idx]
48
+
49
+ video_id = video_meta['video']
50
+ video_data['bins'] = video_meta['bins']
51
+ bin_nums = len(video_meta['bins'])
52
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
53
+
54
+ annotation_data = []
55
+ frame_names = []
56
+
57
+ for i in range(bin_nums):
58
+ bin_data = {}
59
+ for j in range(obj_nums):
60
+ obj_id = str(j+1)
61
+ obj_data = {
62
+ "category_name":video_meta['obj_id_cat'][obj_id],
63
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
64
+ }
65
+ bin_data[obj_id] = obj_data
66
+ annotation_data.append(bin_data)
67
+
68
+ video_data['annotations'] = annotation_data
69
+
70
+
71
+ sample_indx = metas[vid_idx]['sample_indx']
72
+ frames = metas[vid_idx]['frames']
73
+ for i in sample_indx:
74
+ frame_name = frames[i]
75
+ frame_names.append(frame_name)
76
+
77
+ video_data['frame_names'] = frame_names
78
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
79
+ entire_json[video_id] = video_data
80
+
81
+ vid_idx += 1
82
+
83
+ return entire_json
84
+
85
+
86
+ if __name__ == '__main__':
87
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
88
+ args = parser.parse_args()
89
+
90
+ #==================load data===================
91
+ # full dataset
92
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
93
+
94
+ # full dataset metadata
95
+ metas = train_dataset.metas
96
+
97
+ #==================build JSON===================
98
+ entire_json_dict = createJson(train_dataset, metas)
99
+ print(type(entire_json_dict))
100
+ entire_json = json.dumps(entire_json_dict, indent=4)
101
+
102
+ with open('mbench/sampled_frame.json', mode='w') as file:
103
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250116141629.py ADDED
@@ -0,0 +1,104 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================json 만들기===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ #초기화
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ #하나의 비디오에 대해
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ obj_data = {
63
+ "category_name":video_meta['obj_id_cat'][obj_id],
64
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
65
+ }
66
+ bin_data[obj_id] = obj_data
67
+ annotation_data.append(bin_data)
68
+
69
+ video_data['annotations'] = annotation_data
70
+
71
+
72
+ sample_indx = metas[vid_idx]['sample_indx']
73
+ frames = metas[vid_idx]['frames']
74
+ for i in sample_indx:
75
+ frame_name = frames[i]
76
+ frame_names.append(frame_name)
77
+
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+ entire_json[video_id] = video_data
81
+
82
+ vid_idx += 1
83
+
84
+ return entire_json
85
+
86
+
87
+ if __name__ == '__main__':
88
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
89
+ args = parser.parse_args()
90
+
91
+ #==================load data===================
92
+ # full dataset
93
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
94
+
95
+ # full dataset metadata
96
+ metas = train_dataset.metas
97
+
98
+ #==================build JSON===================
99
+ entire_json_dict = createJson(train_dataset, metas)
100
+ print(type(entire_json_dict))
101
+ entire_json = json.dumps(entire_json_dict, indent=4)
102
+
103
+ with open('mbench/sampled_frame.json', mode='w') as file:
104
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117072647.py ADDED
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================build JSON===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialization
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # process one video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
66
+ }
67
+ except:
68
+ obj_data = {}
69
+ bin_data[obj_id] = obj_data
70
+ annotation_data.append(bin_data)
71
+
72
+ video_data['annotations'] = annotation_data
73
+
74
+
75
+ sample_indx = metas[vid_idx]['sample_indx']
76
+ frames = metas[vid_idx]['frames']
77
+ for i in sample_indx:
78
+ frame_name = frames[i]
79
+ frame_names.append(frame_name)
80
+
81
+ video_data['frame_names'] = frame_names
82
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
83
+ entire_json[video_id] = video_data
84
+
85
+ vid_idx += 1
86
+
87
+ return entire_json
88
+
89
+
90
+ if __name__ == '__main__':
91
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
92
+ args = parser.parse_args()
93
+
94
+ #==================load data===================
95
+ # full dataset
96
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
97
+
98
+ # full dataset metadata
99
+ metas = train_dataset.metas
100
+
101
+ #==================build JSON===================
102
+ entire_json_dict = createJson(train_dataset, metas)
103
+ print(type(entire_json_dict))
104
+ entire_json = json.dumps(entire_json_dict, indent=4)
105
+
106
+ with open('mbench/sampled_frame2.json', mode='w') as file:
107
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117074149.py ADDED
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================build JSON===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialization
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # process one video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist()
66
+ }
67
+ except:
68
+ obj_data = {}
69
+ bin_data[obj_id] = obj_data
70
+ annotation_data.append(bin_data)
71
+
72
+ video_data['annotations'] = annotation_data
73
+
74
+
75
+ sample_indx = metas[vid_idx]['sample_indx']
76
+ frames = metas[vid_idx]['frames']
77
+ for i in sample_indx:
78
+ frame_name = frames[i]
79
+ frame_names.append(frame_name)
80
+
81
+ video_data['frame_names'] = frame_names
82
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
83
+ entire_json[video_id] = video_data
84
+
85
+ vid_idx += 1
86
+
87
+ return entire_json
88
+
89
+
90
+ if __name__ == '__main__':
91
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
92
+ args = parser.parse_args()
93
+
94
+ #==================load data===================
95
+ # full dataset
96
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
97
+
98
+ # full dataset metadata
99
+ metas = train_dataset.metas
100
+
101
+ #==================build JSON===================
102
+ entire_json_dict = createJson(train_dataset, metas)
103
+ print(type(entire_json_dict))
104
+ entire_json = json.dumps(entire_json_dict, indent=4)
105
+
106
+ with open('mbench/sampled_frame2.json', mode='w') as file:
107
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250118024354.py ADDED
@@ -0,0 +1,108 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================build JSON===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialization
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # process one video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist(),
66
+ "valid":video_train_info['valid'][i*obj_nums+j].item()
67
+ }
68
+ except:
69
+ obj_data = {}
70
+ bin_data[obj_id] = obj_data
71
+ annotation_data.append(bin_data)
72
+
73
+ video_data['annotations'] = annotation_data
74
+
75
+
76
+ sample_indx = metas[vid_idx]['sample_indx']
77
+ frames = metas[vid_idx]['frames']
78
+ for i in sample_indx:
79
+ frame_name = frames[i]
80
+ frame_names.append(frame_name)
81
+
82
+ video_data['frame_names'] = frame_names
83
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
84
+ entire_json[video_id] = video_data
85
+
86
+ vid_idx += 1
87
+
88
+ return entire_json
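+ # Rough shape of the returned dictionary (sketch inferred from the loop above):
+ # {video_id: {"bins": [(start, end), ...],
+ # "annotations": [one dict per sampled bin, mapping obj_id -> {"category_name", "bbox", "valid"} or {} if missing],
+ # "frame_names": [...], "video_path": ...}}.
+ # The index i*obj_nums + j assumes the dataset's boxes tensor is frame-major, i.e.
+ # row i*obj_nums + j holds the j-th object of the i-th sampled frame, matching the
+ # ordering produced by YTVOSDataset.__getitem__.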
89
+
90
+
91
+ if __name__ == '__main__':
92
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
93
+ args = parser.parse_args()
94
+
95
+ #==================데이터 불러오기===================
96
+ # 전체 데이터셋
97
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
98
+
99
+ # 전체 데이터셋 메타데이터
100
+ metas = train_dataset.metas
101
+
102
+ #==================json 만들기===================
103
+ entire_json_dict = createJson(train_dataset, metas)
104
+ print(type(entire_json_dict))
105
+ entire_json = json.dumps(entire_json_dict, indent=4)
106
+
107
+ with open('mbench/sampled_frame3.json', mode='w') as file:
108
+ file.write(entire_json)
.history/mbench/ytvos_ref_20250121140600.py ADDED
@@ -0,0 +1,265 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.utils.data import Dataset
8
+ import transforms_video as T
9
+
10
+ import os
11
+ from PIL import Image
12
+ import json
13
+ import numpy as np
14
+ import random
15
+
16
+ # from datasets.categories import ytvos_category_dict as category_dict
17
+
18
+
19
+ category_dict = {
20
+ 'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9,
21
+ 'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17,
22
+ 'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25,
23
+ 'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33,
24
+ 'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41,
25
+ 'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49,
26
+ 'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56,
27
+ 'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64
28
+ }
29
+
30
+
31
+
32
+ class YTVOSDataset(Dataset):
33
+ """
34
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
35
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
36
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
37
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
38
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
39
+ through the Youtube-VOS referring video object segmentation competition page at:
40
+ https://competitions.codalab.org/competitions/29139
41
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
42
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
43
+ currently only be done on the competition 'validation' subset using the competition's server, as
44
+ annotations were publicly released only for the 'train' subset of the competition.
45
+
46
+ """
47
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
48
+ num_frames: int, max_skip: int):
49
+ self.img_folder = img_folder
50
+ self.ann_file = ann_file
51
+ self._transforms = transforms
52
+ self.return_masks = return_masks # not used
53
+ self.num_frames = num_frames
54
+ self.max_skip = max_skip
55
+ # create video meta data
56
+ self.prepare_metas()
57
+
58
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
59
+ print('\n')
60
+
61
+ def prepare_metas(self):
62
+ # read object information
63
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
64
+ subset_metas_by_video = json.load(f)['videos']
65
+
66
+ # read expression data
67
+ with open(str(self.ann_file), 'r') as f:
68
+ subset_expressions_by_video = json.load(f)['videos']
69
+ self.videos = list(subset_expressions_by_video.keys())
70
+
71
+ self.metas = []
72
+ skip_vid_count = 0
73
+
74
+ for vid in self.videos:
75
+ vid_meta = subset_metas_by_video[vid]
76
+ vid_data = subset_expressions_by_video[vid]
77
+ vid_frames = sorted(vid_data['frames'])
78
+ vid_len = len(vid_frames)
79
+
80
+ if vid_len < 11:
81
+ #print(f"Too short video: {vid} with frame length {vid_len}")
82
+ skip_vid_count += 1
83
+ continue
84
+
85
+
86
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
87
+ start_idx , end_idx = 2, vid_len-2
88
+ bin_size = (end_idx - start_idx) // 4
89
+
90
+ bins = []
91
+ for i in range(4):
92
+ bin_start = start_idx + i * bin_size
93
+ bin_end = bin_start + bin_size if i < 3 else end_idx
94
+
95
+ bins.append((bin_start, bin_end))
96
+
97
+ # Random sample one frame from each bin
98
+ sample_indx = []
99
+ for start_idx, end_idx in bins:
100
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
101
+ sample_indx.sort() # Ensure indices are in order
102
+
103
+
104
+ meta = {
105
+ 'video':vid,
106
+ 'sample_indx':sample_indx,
107
+ 'bins':bins,
108
+ 'frames':vid_frames
109
+ }
110
+ obj_id_cat = {}
111
+ for exp_id, exp_dict in vid_data['expressions'].items():
112
+ obj_id = exp_dict['obj_id']
113
+ if obj_id not in obj_id_cat:
114
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
115
+ meta['obj_id_cat'] = obj_id_cat
116
+ self.metas.append(meta)
117
+
118
+ print(f"skipped {skip_vid_count} short videos")
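+ # Each meta entry therefore carries the video id, the 4 sampled frame indices,
+ # the bin boundaries, the full sorted frame-name list, and an obj_id -> category map.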
119
+
120
+
121
+ @staticmethod
122
+ def bounding_box(img):
123
+ rows = np.any(img, axis=1)
124
+ cols = np.any(img, axis=0)
125
+ rmin, rmax = np.where(rows)[0][[0, -1]]
126
+ cmin, cmax = np.where(cols)[0][[0, -1]]
127
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
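+ # e.g. for a binary mask [[0, 1], [0, 1]] this returns (0, 1, 1, 1):
+ # rows 0-1 and only column 1 contain foreground pixels.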
128
+
129
+ def __len__(self):
130
+ return len(self.metas)
131
+
132
+ def __getitem__(self, idx):
133
+ meta = self.metas[idx] # dict
134
+
135
+ video, sample_indx, bins, frames, obj_id_cat = \
136
+ meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
137
+
138
+ # read frames and masks
139
+ annos = {}
140
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
141
+ for frame_indx in sample_indx:
142
+ frame_name = frames[frame_indx]
143
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
144
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
145
+ img = Image.open(img_path).convert('RGB')
146
+ imgs.append(img)
147
+
148
+ mask = Image.open(mask_path).convert('P')
149
+ mask = np.array(mask)
150
+
151
+ frame_annotations = {}
152
+
153
+ # create the target
154
+ for obj_id in list(obj_id_cat.keys()):
155
+ obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
156
+ if (obj_mask > 0).any():
157
+ y1, y2, x1, x2 = self.bounding_box(obj_mask)
158
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
159
+ valid.append(1)
160
+ val = 1
161
+ else: # some frame didn't contain the instance
162
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
163
+ valid.append(0)
164
+ val = 0
165
+ obj_mask = torch.from_numpy(obj_mask)
166
+
167
+ # append
168
+ masks.append(obj_mask)
169
+ boxes.append(box)
170
+
171
+ frame_annotations[obj_id] = {
172
+ 'category_name': obj_id_cat[obj_id],
173
+ 'bbox': box,
174
+ 'valid' : val,
175
+ 'mask': obj_mask
176
+ }
177
+
178
+ annos[frame_indx] = frame_annotations
179
+
180
+
181
+ # transform
182
+ w, h = img.size
183
+ boxes = torch.stack(boxes, dim=0)
184
+ boxes[:, 0::2].clamp_(min=0, max=w)
185
+ boxes[:, 1::2].clamp_(min=0, max=h)
186
+ masks = torch.stack(masks, dim=0)
187
+ target = {
188
+ 'frames_idx': sample_indx, # [T,]
189
+ 'boxes': boxes, # [T, 4], xyxy
190
+ 'masks': masks, # [T, H, W]
191
+ 'valid': torch.tensor(valid), # [T,]
192
+ 'obj_ids' : list(obj_id_cat.keys()),
193
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
194
+ 'size': torch.as_tensor([int(h), int(w)])
195
+ }
196
+
197
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
198
+ # if self._transforms:
199
+ # imgs, target = self._transforms(imgs, target)
200
+ # imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
201
+ # else:
202
+ imgs = np.array(imgs)
203
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
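+ # Since the transform pipeline above is commented out, imgs is returned as an
+ # unnormalised uint8 tensor of shape [T, 3, H, W] built directly from the PIL frames.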
204
+
205
+
206
+ # # FIXME: handle "valid", since some box may be removed due to random crop
207
+ # if torch.any(target['valid'] == 1): # at leatst one instance
208
+ # instance_check = True
209
+ # else:
210
+ # idx = random.randint(0, self.__len__() - 1)
211
+
212
+ return imgs, target, annos
213
+
214
+
215
+ def make_coco_transforms(image_set, max_size=640):
216
+ normalize = T.Compose([
217
+ T.ToTensor(),
218
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
219
+ ])
220
+
221
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
222
+
223
+ if image_set == 'train':
224
+ return T.Compose([
225
+ T.RandomHorizontalFlip(),
226
+ T.PhotometricDistort(),
227
+ T.RandomSelect(
228
+ T.Compose([
229
+ T.RandomResize(scales, max_size=max_size),
230
+ T.Check(),
231
+ ]),
232
+ T.Compose([
233
+ T.RandomResize([400, 500, 600]),
234
+ T.RandomSizeCrop(384, 600),
235
+ T.RandomResize(scales, max_size=max_size),
236
+ T.Check(),
237
+ ])
238
+ ),
239
+ normalize,
240
+ ])
241
+
242
+ # we do not use the 'val' set since the annotations are inaccessible
243
+ if image_set == 'val':
244
+ return T.Compose([
245
+ T.RandomResize([360], max_size=640),
246
+ normalize,
247
+ ])
248
+
249
+ raise ValueError(f'unknown {image_set}')
250
+
251
+
252
+ def build(image_set, args):
253
+ root = Path(args.ytvos_path)
254
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
255
+ PATHS = {
256
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
257
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
258
+ }
259
+ img_folder, ann_file = PATHS[image_set]
260
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
261
+ # num_frames=args.num_frames, max_skip=args.max_skip)
262
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
263
+ num_frames=args.num_frames, max_skip=args.max_skip)
264
+ return dataset
265
+
.history/mbench_a2d/gpt_a2d_numbered_20250205111521.py ADDED
File without changes
.history/mbench_a2d/gpt_a2d_numbered_20250205151640.py ADDED
@@ -0,0 +1,197 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id label
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the text background rectangle coordinates
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
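+ # The returned string is a base64-encoded JPEG of the annotated frame; getCaption
+ # below embeds it in a data:image/jpeg;base64 URL for the vision request.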
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ #필터링하지 않고 바로 ref exp 만들기
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ else:
161
+ caption = None
162
+
163
+ return caption
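+ # On success the caption should be a single line starting with "1." followed by the
+ # action-centric description (enforced by the startswith check above); if MAX_RETRIES
+ # is exhausted, None is returned instead.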
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ os.environ['OPENAI_API_KEY'] = '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'
176
+
177
+ for idx in range(100):
178
+ imgs, target = train_dataset[idx]
179
+ frames_idx = target['frames_idx'].tolist()
180
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
181
+
182
+ frame_id = frame_id - 1
183
+ frame_order = frames_idx.index(frame_id)
184
+
185
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
186
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
187
+
188
+ caption = getCaption(frame, mask, instance_id, text_query)
189
+ if vid_id not in all_captions:
190
+ all_captions[vid_id] = {frame_id : caption}
191
+ else:
192
+ all_captions[vid_id][frame_id] = caption
193
+
194
+
195
+ with open(args.save_caption_path, 'w') as file:
196
+ json.dump(all_captions, file, indent=4)
197
+
.history/mbench_a2d/gpt_a2d_numbered_20250205151759.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id label
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the text background rectangle coordinates
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ #필터링하지 않고 바로 ref exp 만들기
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ else:
161
+ caption = None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ os.environ['OPENAI_API_KEY'] = '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'
176
+
177
+ for idx in range(100):
178
+ imgs, target = train_dataset[idx]
179
+ frames_idx = target['frames_idx'].tolist()
180
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
181
+ print(f"vid id: {vid_id}", flush=True)
182
+
183
+ frame_id = frame_id - 1
184
+ frame_order = frames_idx.index(frame_id)
185
+
186
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
187
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
188
+
189
+ caption = getCaption(frame, mask, instance_id, text_query)
190
+ if vid_id not in all_captions:
191
+ all_captions[vid_id] = {frame_id : caption}
192
+ else:
193
+ all_captions[vid_id][frame_id] = caption
194
+
195
+ print("Finished!", flush=True)
196
+
197
+ with open(args.save_caption_path, 'w') as file:
198
+ json.dump(all_captions, file, indent=4)
199
+
.history/mbench_a2d/gpt_a2d_numbered_20250205151827.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id on the frame
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
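+ # the contour's image moments give its centroid (m10/m00, m01/m00), which is where the numeric ID is placed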
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the coordinates of the text background rectangle
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
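+ # the JPEG bytes are base64-encoded so they can be embedded as a data URL in the image_url payload below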
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without a filtering step
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ # no `else` branch is needed here: when the loop breaks early, `caption`
+ # already holds the accepted response and must not be reset to None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ os.environ['OPENAI_API_KEY'] = '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'
176
+
177
+ for idx in range(100):
178
+ imgs, target = train_dataset[idx]
179
+ frames_idx = target['frames_idx'].tolist()
180
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
181
+ print(f"vid id: {vid_id}, frame id: {frame_id}", flush=True)
182
+
183
+ frame_id = frame_id - 1
184
+ frame_order = frames_idx.index(frame_id)
185
+
186
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
187
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
188
+
189
+ caption = getCaption(frame, mask, instance_id, text_query)
190
+ if vid_id not in all_captions:
191
+ all_captions[vid_id] = {frame_id : caption}
192
+ else:
193
+ all_captions[vid_id][frame_id] = caption
194
+
195
+ print("Finished!", flush=True)
196
+
197
+ with open(args.save_caption_path, 'w') as file:
198
+ json.dump(all_captions, file, indent=4)
199
+
.history/mbench_a2d/gpt_a2d_numbered_20250205151833.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id on the frame
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the coordinates of the text background rectangle
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
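+ # the ID is rendered as white text on a filled black rectangle so it stays readable on any background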
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without a filtering step
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
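+ # accept the response only if it starts with "1." and is not a refusal; otherwise retry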
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ # no `else` branch is needed here: when the loop breaks early, `caption`
+ # already holds the accepted response and must not be reset to None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ os.environ['OPENAI_API_KEY'] = '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'
176
+
177
+ for idx in range(100):
178
+ imgs, target = train_dataset[idx]
179
+ frames_idx = target['frames_idx'].tolist()
180
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
181
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
182
+
183
+ frame_id = frame_id - 1
184
+ frame_order = frames_idx.index(frame_id)
185
+
186
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
187
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
188
+
189
+ caption = getCaption(frame, mask, instance_id, text_query)
190
+ if vid_id not in all_captions:
191
+ all_captions[vid_id] = {frame_id : caption}
192
+ else:
193
+ all_captions[vid_id][frame_id] = caption
194
+
195
+ print("Finished!", flush=True)
196
+
197
+ with open(args.save_caption_path, 'w') as file:
198
+ json.dump(all_captions, file, indent=4)
199
+
.history/mbench_a2d/gpt_a2d_numbered_20250205152714.py ADDED
@@ -0,0 +1,200 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id on the frame
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the coordinates of the text background rectangle
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
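+ # the OpenAI() client reads the API key from the OPENAI_API_KEY environment variable set in the __main__ block below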
83
+
84
+ # build the referring expression directly, without a filtering step
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ # no `else` branch is needed here: when the loop breaks early, `caption`
+ # already holds the accepted response and must not be reset to None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ #os.environ['OPENAI_API_KEY'] = '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'
176
+ os.environ['OPENAI_API_KEY'] = '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'
177
+
178
+ for idx in range(100):
179
+ imgs, target = train_dataset[idx]
180
+ frames_idx = target['frames_idx'].tolist()
181
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
182
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
183
+
184
+ frame_id = frame_id - 1
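+ # the frame ids in the annotations appear to be 1-based, hence the shift to a 0-based index before the lookup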
185
+ frame_order = frames_idx.index(frame_id)
186
+
187
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
188
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
189
+
190
+ caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
191
+ if vid_id not in all_captions:
192
+ all_captions[vid_id] = {frame_id : caption}
193
+ else:
194
+ all_captions[vid_id][frame_id] = caption
195
+
196
+ print("Finished!", flush=True)
197
+
198
+ with open(args.save_caption_path, 'w') as file:
199
+ json.dump(all_captions, file, indent=4)
200
+
.history/mbench_a2d/gpt_a2d_numbered_20250206114221.py ADDED
@@ -0,0 +1,205 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id on the frame
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the coordinates of the text background rectangle
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without a filtering step
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ # no `else` branch is needed here: when the loop breaks early, `caption`
+ # already holds the accepted response and must not be reset to None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ #os.environ['OPENAI_API_KEY'] = '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'
176
+ os.environ['OPENAI_API_KEY'] = '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'
177
+
178
+ first_text_query = ""
179
+ for idx in range(300):
180
+ imgs, target = train_dataset[idx]
181
+ frames_idx = target['frames_idx'].tolist()
182
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
183
+
184
+ if text_query == first_text_query:
185
+ continue
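+ # note: first_text_query is never updated inside the loop, so this check only skips empty queries;
+ # skipping consecutive duplicate queries would also require first_text_query = text_query after processing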
186
+
187
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
188
+
189
+ frame_id = frame_id - 1
190
+ frame_order = frames_idx.index(frame_id)
191
+
192
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
193
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
194
+
195
+ caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
196
+ if vid_id not in all_captions:
197
+ all_captions[vid_id] = {frame_id : caption}
198
+ else:
199
+ all_captions[vid_id][frame_id] = caption
200
+
201
+ print("Finished!", flush=True)
202
+
203
+ with open(args.save_caption_path, 'w') as file:
204
+ json.dump(all_captions, file, indent=4)
205
+
.history/mbench_a2d/gpt_a2d_numbered_20250206114540.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id on the frame
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the coordinates of the text background rectangle
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without a filtering step
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ # no `else` branch is needed here: when the loop breaks early, `caption`
+ # already holds the accepted response and must not be reset to None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ #os.environ['OPENAI_API_KEY'] = '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'
176
+ os.environ['OPENAI_API_KEY'] = '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'
177
+
178
+ first_text_query = ""
179
+ for idx in range(300):
180
+ imgs, target = train_dataset[idx]
181
+ frames_idx = target['frames_idx'].tolist()
182
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
183
+
184
+ if text_query == first_text_query:
185
+ continue
186
+
187
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
188
+
189
+ frame_id = frame_id - 1
190
+ frame_order = frames_idx.index(frame_id)
191
+
192
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
193
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
194
+
195
+ caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
196
+ if vid_id not in all_captions:
197
+ all_captions[vid_id] = {frame_id : caption}
198
+ else:
199
+ all_captions[vid_id][frame_id] = caption
200
+
201
+ if idx % 50 == 0:
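+ # checkpoint: flush the captions collected so far to disk every 50 samples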
202
+ with open(args.save_caption_path, 'w') as file:
203
+ json.dump(all_captions, file, indent=4)
204
+
205
+ print("Finished!", flush=True)
206
+
207
+ with open(args.save_caption_path, 'w') as file:
208
+ json.dump(all_captions, file, indent=4)
209
+
.history/mbench_a2d/gpt_a2d_numbered_20250206145656.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to color-fill the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to write the instance_id on the frame
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the coordinates of the text background rectangle
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without a filtering step
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ # no `else` branch is needed here: when the loop breaks early, `caption`
+ # already holds the accepted response and must not be reset to None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ #os.environ['OPENAI_API_KEY'] = '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'
176
+ os.environ['OPENAI_API_KEY'] = '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'
177
+
178
+ first_text_query = ""
179
+ for idx in range(300):
180
+ imgs, target = train_dataset[idx]
181
+ frames_idx = target['frames_idx'].tolist()
182
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
183
+
184
+ if text_query == first_text_query:
185
+ continue
186
+
187
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
188
+
189
+ frame_id = frame_id - 1
190
+ frame_order = frames_idx.index(frame_id)
191
+
192
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
193
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
194
+
195
+ caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
196
+ if vid_id not in all_captions:
197
+ all_captions[vid_id] = {idx : caption}
198
+ else:
199
+ all_captions[vid_id][idx] = caption
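+ # unlike the earlier versions, captions here are keyed by the annotation index idx rather than by frame id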
200
+
201
+ if idx % 50 == 0:
202
+ with open(args.save_caption_path, 'w') as file:
203
+ json.dump(all_captions, file, indent=4)
204
+
205
+ print("Finished!", flush=True)
206
+
207
+ with open(args.save_caption_path, 'w') as file:
208
+ json.dump(all_captions, file, indent=4)
209
+
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185215.sh ADDED
@@ -0,0 +1,18 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
4
+ #SBATCH --partition=a4000
5
+ #SBATCH --nodelist=node05
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=14-00:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
11
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
12
+
13
+ ml purge
14
+ ml load cuda/12.1
15
+ eval "$(conda shell.bash hook)"
16
+ conda activate referformer
17
+
18
+ python3 mbench/gpt_ref-ytvos_numbered_cy.py
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207173418.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final
4
+ #SBATCH --partition=a4000
5
+ #SBATCH --nodelist=node05
6
+ #SBATCH --gres=gpu:1
7
+ #SBATCH --time=14-00:00:00
8
+ #SBATCH --mem=5G
9
+ #SBATCH --cpus-per-task=4
10
+ #SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out
11
+ cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
12
+
13
+ ml purge
14
+ ml load cuda/12.1
15
+ eval "$(conda shell.bash hook)"
16
+ conda activate referformer
17
+
18
+ python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \
19
+ --save_caption_path mbench/numbered_captions_gpt-4o_final.json \
20
+ --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b.lock ADDED
File without changes
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b.lock ADDED
File without changes