Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
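As a rough illustration only (not part of this commit), an upload like the one named in the commit message can be reproduced with the `upload-large-folder` tool from `huggingface_hub`; the repo id, repo type, and local path below are placeholders.

# Sketch of the upload path named in the commit message, assuming huggingface_hub >= 0.24.
from huggingface_hub import HfApi

api = HfApi()
api.upload_large_folder(
    repo_id="your-username/your-repo",   # placeholder repo id
    repo_type="model",                   # use "dataset" if the target is a dataset repo
    folder_path="./local_folder",        # directory holding the files listed below
)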
- .history/datasets/ytvos_ref_20250113163402.py +249 -0
- .history/datasets/ytvos_ref_20250116073826.py +240 -0
- .history/mbench/gpt_ref-ytvos-cy_20250121151408.py +431 -0
- .history/mbench/gpt_ref-ytvos-cy_20250121155710.py +428 -0
- .history/mbench/gpt_ref-ytvos-revised_20250121155717.py +428 -0
- .history/mbench/gpt_ref-ytvos-revised_20250121155956.py +428 -0
- .history/mbench/gpt_ref-ytvos-revised_20250121160813.py +428 -0
- .history/mbench/gpt_ref-ytvos_20250119070213.py +277 -0
- .history/mbench/gpt_ref-ytvos_20250119070707.py +282 -0
- .history/mbench/gpt_ref-ytvos_20250119070824.py +286 -0
- .history/mbench/gpt_ref-ytvos_20250119071214.py +290 -0
- .history/mbench/gpt_ref-ytvos_20250119073250.py +292 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250130183735.py +0 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250130183916.py +199 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250130185048.py +422 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250130190055.py +428 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250130190447.py +430 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250130190713.py +430 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250131124156.py +427 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250201140343.py +460 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250201140413.py +460 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_20250201141847.py +460 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250206153011.py +644 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171300.py +644 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171416.py +644 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173350.py +677 -0
- .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207184812.py +676 -0
- .history/mbench/make_ref-ytvos_json_20250113183250.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250113183335.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250113183413.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250113195227.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250116140938.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250116141629.py +104 -0
- .history/mbench/make_ref-ytvos_json_20250117072647.py +107 -0
- .history/mbench/make_ref-ytvos_json_20250117074149.py +107 -0
- .history/mbench/make_ref-ytvos_json_20250118024354.py +108 -0
- .history/mbench/ytvos_ref_20250121140600.py +265 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250205111521.py +0 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250205151640.py +197 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250205151759.py +199 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250205151827.py +199 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250205151833.py +199 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250205152714.py +200 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250206114221.py +205 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250206114540.py +209 -0
- .history/mbench_a2d/gpt_a2d_numbered_20250206145656.py +209 -0
- .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185215.sh +18 -0
- .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207173418.sh +20 -0
- hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b.lock +0 -0
- hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b.lock +0 -0
.history/datasets/ytvos_ref_20250113163402.py
ADDED
@@ -0,0 +1,249 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        skip_vid_count = 0

        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            if vid_len < 11:
                # print(f"Too short video: {vid} with frame length {vid_len}")
                skip_vid_count += 1
                continue

            for exp_id, exp_dict in vid_data['expressions'].items():
                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
                start_idx, end_idx = 2, vid_len - 2
                bin_size = (end_idx - start_idx) // 4

                bins = []
                for i in range(4):
                    bin_start = start_idx + i * bin_size
                    bin_end = bin_start + bin_size if i < 3 else end_idx

                    bins.append((bin_start, bin_end))

                # Random sample one frame from each bin
                sample_indx = []
                for start_idx, end_idx in bins:
                    sample_indx.append(random.randint(start_idx, end_idx - 1))
                sample_indx.sort()  # Ensure indices are in order

                for frame_id in sample_indx:
                    meta = {
                        'video': vid,
                        'exp': exp_dict['exp'],
                        'obj_id': int(exp_dict['obj_id']),
                        'frames': vid_frames,
                        'frame_id': frame_id,
                        'sample_frames_id': sample_indx,
                        'bins': bins,
                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
                    }
                    self.metas.append(meta)
        print(f"skipped {skip_vid_count} short videos")

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], \
                meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']

            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            # num_frames = self.num_frames

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for frame_indx in sample_frames_id:
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_frames_id),   # [T,]
                'labels': labels,                                # [T,]
                'boxes': boxes,                                  # [T, 4], xyxy
                'masks': masks,                                  # [T, H, W]
                'valid': torch.tensor(valid),                    # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
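For orientation, a minimal usage sketch for the loader above, not part of the committed files. It assumes the Ref-YouTube-VOS layout that prepare_metas() and __getitem__ expect, and imports the non-.history copy of this module; the dataset root path and argument values are illustrative.

# Hypothetical driver for the loader above.
# Assumed layout: <ytvos_path>/train/{JPEGImages,Annotations,meta.json}
#                 <ytvos_path>/meta_expressions/train/meta_expressions.json
from argparse import Namespace

from datasets.ytvos_ref import build  # assumed module path outside .history

args = Namespace(ytvos_path='data/ref-youtube-vos',  # placeholder dataset root
                 masks=True, num_frames=4, max_skip=3)
dataset = build('train', args)   # prints video/clip counts during prepare_metas()
imgs, target = dataset[0]        # imgs: [T, 3, H, W]; target holds boxes, masks, valid, caption, ...
print(imgs.shape, target['caption'])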
.history/datasets/ytvos_ref_20250116073826.py
ADDED
@@ -0,0 +1,240 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        skip_vid_count = 0

        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            if vid_len < 11:
                # print(f"Too short video: {vid} with frame length {vid_len}")
                skip_vid_count += 1
                continue

            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
            start_idx, end_idx = 2, vid_len - 2
            bin_size = (end_idx - start_idx) // 4

            bins = []
            for i in range(4):
                bin_start = start_idx + i * bin_size
                bin_end = bin_start + bin_size if i < 3 else end_idx

                bins.append((bin_start, bin_end))

            # Random sample one frame from each bin
            sample_indx = []
            for start_idx, end_idx in bins:
                sample_indx.append(random.randint(start_idx, end_idx - 1))
            sample_indx.sort()  # Ensure indices are in order

            meta = {
                'video': vid,
                'sample_indx': sample_indx,
                'bins': bins,
                'frames': vid_frames
            }
            obj_id_cat = {}
            for exp_id, exp_dict in vid_data['expressions'].items():
                obj_id = exp_dict['obj_id']
                if obj_id not in obj_id_cat:
                    obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
            meta['obj_id_cat'] = obj_id_cat
            self.metas.append(meta)

        print(f"skipped {skip_vid_count} short videos")

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        meta = self.metas[idx]  # dict

        video, sample_indx, bins, frames, obj_id_cat = \
            meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']

        # read frames and masks
        imgs, labels, boxes, masks, valid = [], [], [], [], []
        for frame_indx in sample_indx:
            frame_name = frames[frame_indx]
            img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
            mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
            img = Image.open(img_path).convert('RGB')
            imgs.append(img)

            mask = Image.open(mask_path).convert('P')
            mask = np.array(mask)
            print(mask.dtype)

            # create the target
            for obj_id in list(obj_id_cat.keys()):
                obj_mask = (mask == int(obj_id)).astype(np.float32)  # 0,1 binary
                if (obj_mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(obj_mask)  # box of this object's mask
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                obj_mask = torch.from_numpy(obj_mask)

                # append
                masks.append(obj_mask)
                boxes.append(box)

        # transform
        w, h = img.size
        boxes = torch.stack(boxes, dim=0)
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        masks = torch.stack(masks, dim=0)
        target = {
            'frames_idx': sample_indx,                  # [T,]
            'boxes': boxes,                             # [T, 4], xyxy
            'masks': masks,                             # [T, H, W]
            'valid': torch.tensor(valid),               # [T,]
            'obj_ids': list(obj_id_cat.keys()),
            'orig_size': torch.as_tensor([int(h), int(w)]),
            'size': torch.as_tensor([int(h), int(w)])
        }

        # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
        if self._transforms:
            imgs, target = self._transforms(imgs, target)
            imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
        else:
            imgs = np.array(imgs)
            imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

        # # FIXME: handle "valid", since some box may be removed due to random crop
        # if torch.any(target['valid'] == 1):  # at least one instance
        #     instance_check = True
        # else:
        #     idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/mbench/gpt_ref-ytvos-cy_20250121151408.py
ADDED
@@ -0,0 +1,431 @@
import sys
from os import path as osp
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle


import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


# Captioner
ytvos_category_valid_list = [
    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
]
def getCaption(video_id, json_data):
    # fetch the data for this video
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    cat_names = set()
    all_captions = dict()
    for obj_id in list(video_data['annotations'][0].keys()):
        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])

    # cat_names : person, snowboard
    # 1. ask GPT directly whether the category can be the subject of an action
    # 2. keep only the category names we want to handle from the ref-youtube-vos category list

    for cat_name in list(cat_names):
        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
        image_captions = {}

        captioner = OpenAI()

        # Step 0: can this category be the subject of an action?
        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True

        # response_check = captioner.chat.completions.create(
        #     model="gpt-4o",
        #     messages=[
        #         {
        #             "role": "user",
        #             "content": f"""
        #             Can a {cat_name} be a subject of distinct actions or movements?
        #             For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
        #             However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
        #             Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
        #             Answer only YES or NONE.
        #             """
        #         }
        #     ],
        # )
        # response_check_content = response_check.choices[0].message.content.strip().lower()
        # print(f"Movable Check for {cat_name}: {response_check_content}")

        # if response_check_content == "yes": is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.")
            continue

        for i in range(len(image_paths)):
            image_path = image_paths[i]
            frame_name = frame_names[i]
            base64_image = encode_image(image_path)

            # Step 1: filtering
            print(cat_name, frame_name)
            response1 = captioner.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",

                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
                                Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
                                Each action should be unique and clearly associated with a specific object.

                                Respond with YES if:
                                - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
                                - The {cat_name}s involve clear, distinguishable actions performed independently.

                                Respond with NONE if:
                                - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
                                - Actions are ambiguous, minor, or not clearly visible.

                                If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
                                If the {cat_name} is 'person' and their actions are distinct and clear, output YES.

                                Answer only YES or NONE."""

                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
            )
            response_content = response1.choices[0].message.content
            should_caption = True if "yes" in response_content.lower() else False
            print(f"are {cat_name}s distinguished by action: {response_content}")

            # Step 2: build the dense caption
            if should_caption:
                response2 = captioner.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",

                                    "text": f"""
                                    Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
                                    1. Focus only on clear, unique, and prominent actions that distinguish each object.
                                    2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
                                    3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
                                    4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
                                    5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
                                    6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
                                    7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
                                    8. Include interactions with objects or other entities when they are prominent and observable.
                                    9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
                                    Output only the caption.""",
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )

                caption = response2.choices[0].message.content
                print(f"{image_path} - {frame_name}: {caption}")
            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = []
    valid_cat_names = list(all_captions.keys())
    for obj_id in list(video_data['annotations'][0].keys()):
        cat = video_data['annotations'][0][obj_id]['category_name']
        if cat in valid_cat_names: valid_obj_ids.append(obj_id)

    return all_captions, valid_obj_ids


# Referring expression generator and QA filter
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
    # draw the bounding box of the target object on the image
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']
    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
    frame_indx = frame_names.index(frame_name)
    obj_data = video_data['annotations'][frame_indx][obj_id]

    bbox = obj_data['bbox']
    cat_name = obj_data['category_name']
    valid = obj_data['valid']

    if valid == 0:
        print("Object not in this frame!")
        return {}

    x_min, y_min, x_max, y_max = bbox
    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
    plt.figure()
    plt.imshow(I)
    plt.axis('off')
    plt.show()

    # cropped object for visibility check
    cropped_I = I[y_min:y_max, x_min:x_max]
    pil_cropped_I = Image.fromarray(cropped_I)
    buff_crop = BytesIO()
    pil_cropped_I.save(buff_crop, format='JPEG')
    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")

    # entire image for referring expression generation
    pil_I = Image.fromarray(I)
    buff = BytesIO()
    pil_I.save(buff, format='JPEG')
    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")

    # check whether the object is identifiable
    generator = OpenAI()
    response_check = generator.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {

                        "type": "text",
                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
                        Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.

                        Guidelines:
                        - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
                        - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
                        - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.

                        Output only either YES or NONE.
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
                    }
                ]
            },
        ]
    )

    response_check_content = response_check.choices[0].message.content.strip().lower()
    print(f"is object {obj_id} visible: {response_check_content}")

    if "yes" not in response_check_content:
        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": False}

    # build the referring expression
    # generator = OpenAI()
    response = generator.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",

                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
                        Guidelines for creating the referring expression:
                        1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
                        2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
                        3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
                        4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
                        5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
                        6. Use '{cat_name}' as the noun for the referring expressions.
                        Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).

                        {caption}
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                    # {
                    #     "type": "image_url",
                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
                    # }
                ],
            }
        ],
    )

    ref_exp = response.choices[0].message.content.strip()

    # QA filtering
    # QA1: does the expression describe the intended object?
    filter = OpenAI()
    response1 = filter.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response1_content = response1.choices[0].message.content
    describesHighlighted = True if "yes" in response1_content.lower() else False

    # QA2: does the expression avoid describing an unintended object?
    response2 = filter.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response2_content = response2.choices[0].message.content
    describesNotHighlighted = True if "yes" in response2_content.lower() else False

    isValid = True if describesHighlighted and not describesNotHighlighted else False

    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")

    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}



if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    # ================== load the data ===================
    # # full dataset
    # train_dataset = build_ytvos_ref(image_set='train', args=args)

    # # metadata for the full dataset
    # metas = train_dataset.metas

    with open('mbench/sampled_frame3.json', 'r') as file:
        data = json.load(file)

    vid_ids = list(data.keys())

    all_ref_exps = {}

    # ================== run GPT ===================
    os.environ['OPENAI_API_KEY'] = 'sk-proj-...'  # API key redacted

    # for every vid_id in the full dataset
    for i in range(1):
        vid_id = vid_ids[i]

        # ==== build captions ====
        caption, valid_obj_ids = getCaption(vid_id, data)
        cats_in_vid = list(caption.keys())

        # ==== build referring expressions and run QA filtering ====
        ref_expressions = {}
        # for each category
        for cat_name in cats_in_vid:
            if cat_name not in ref_expressions:
                ref_expressions[cat_name] = {}

            # for each video frame
            for frame_name in data[vid_id]['frame_names']:

                if frame_name not in ref_expressions[cat_name]:
                    ref_expressions[cat_name][frame_name] = {}  # Create frame-level dictionary

                # use a separate name so the per-category caption dict is not overwritten
                frame_caption = caption[cat_name][frame_name]

                if not frame_caption: continue
                else:
                    # for each obj id
                    for obj_id in valid_obj_ids:
                        ref_exp = getRefExp(vid_id, frame_name, frame_caption, obj_id, data)
                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp  # Store ref_exp


        all_ref_exps[vid_id] = ref_expressions
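A small follow-on sketch, not part of the commit: once the loop above has filled all_ref_exps, the collected expressions can be persisted with the json module already imported in this script; the output path is a placeholder.

# Hypothetical save step for the dictionary built above.
with open('mbench/ref_exps.json', 'w') as f:
    json.dump(all_ref_exps, f, indent=2)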
.history/mbench/gpt_ref-ytvos-cy_20250121155710.py
ADDED
@@ -0,0 +1,428 @@
import sys
from os import path as osp
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle


import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

# Captioner
ytvos_category_valid_list = [
    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
]
def getCaption(video_id, json_data):
    # fetch the data for this video
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    cat_names = set()
    all_captions = dict()
    for obj_id in list(video_data['annotations'][0].keys()):
        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])

    # cat_names : person, snowboard
    # 1. ask GPT directly whether the category can be the subject of an action
    # 2. keep only the category names we want to handle from the ref-youtube-vos category list

    for cat_name in list(cat_names):
        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
        image_captions = {}

        captioner = OpenAI()

        # Step 0: can this category be the subject of an action?
        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True

        # response_check = captioner.chat.completions.create(
        #     model="gpt-4o",
        #     messages=[
        #         {
        #             "role": "user",
        #             "content": f"""
        #             Can a {cat_name} be a subject of distinct actions or movements?
        #             For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
        #             However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
        #             Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
        #             Answer only YES or NONE.
        #             """
        #         }
        #     ],
        # )
        # response_check_content = response_check.choices[0].message.content.strip().lower()
        # print(f"Movable Check for {cat_name}: {response_check_content}")

        # if response_check_content == "yes": is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.")
            continue

        for i in range(len(image_paths)):
            image_path = image_paths[i]
            frame_name = frame_names[i]
            base64_image = encode_image(image_path)

            # Step 1: filtering
            #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
            response1 = captioner.chat.completions.create(
                model="chatgpt-4o-latest",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",

                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
                                Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
                                Each action should be unique and clearly associated with a specific object.

                                Respond with YES if:
                                - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
                                - The {cat_name}s involve clear, distinguishable actions performed independently.

                                Respond with NONE if:
                                - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
                                - Actions are ambiguous, minor, or not clearly visible.

                                If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
                                If the {cat_name} is 'person' and their actions are distinct and clear, output YES.

                                Answer only YES or NONE."""

                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
            )
            response_content = response1.choices[0].message.content
            should_caption = True if "yes" in response_content.lower() else False
            #print(f"are {cat_name}s distinguished by action: {response_content}")

            # Step 2: build the dense caption
            if should_caption:
                response2 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",

                                    "text": f"""
                                    Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
                                    1. Focus only on clear, unique, and prominent actions that distinguish each object.
                                    2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
                                    3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
                                    4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
                                    5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
                                    6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
                                    7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
                                    8. Include interactions with objects or other entities when they are prominent and observable.
                                    9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
                                    Output only the caption.""",
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )

                caption = response2.choices[0].message.content
                #print(f"{image_path} - {frame_name}: {caption}")
            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = []
    valid_cat_names = list(all_captions.keys())
    for obj_id in list(video_data['annotations'][0].keys()):
        cat = video_data['annotations'][0][obj_id]['category_name']
        if cat in valid_cat_names: valid_obj_ids.append(obj_id)

    return all_captions, valid_obj_ids

# Referring expression generator and QA filter
def getRefExp(video_id, frame_name, caption, obj_id, json_data):

    # draw the bounding box of the target object on the image
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']
    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
    frame_indx = frame_names.index(frame_name)
    obj_data = video_data['annotations'][frame_indx][obj_id]

    bbox = obj_data['bbox']
    cat_name = obj_data['category_name']
    valid = obj_data['valid']

    if valid == 0:
        print("Object not in this frame!")
        return {}

    x_min, y_min, x_max, y_max = bbox
    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
    plt.figure()
    plt.imshow(I)
    plt.axis('off')
    plt.show()

    # cropped object for visibility check
    cropped_I = I[y_min:y_max, x_min:x_max]
    pil_cropped_I = Image.fromarray(cropped_I)
    buff_crop = BytesIO()
    pil_cropped_I.save(buff_crop, format='JPEG')
    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")

    # entire image for referring expression generation
    pil_I = Image.fromarray(I)
    buff = BytesIO()
    pil_I.save(buff, format='JPEG')
    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")

    # check whether the object is identifiable
    generator = OpenAI()
    response_check = generator.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {

                        "type": "text",
                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
                        Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.

                        Guidelines:
                        - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
                        - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
                        - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.

                        Output only either YES or NONE.
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
                    }
                ]
            },
        ]
    )

    response_check_content = response_check.choices[0].message.content.strip().lower()
    #print(f"is object {obj_id} visible: {response_check_content}")

    if "yes" not in response_check_content:
        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": False}

    # build the referring expression
    # generator = OpenAI()
    response = generator.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",

                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
                        Guidelines for creating the referring expression:
                        1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
                        2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
                        3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
                        4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
                        5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
                        6. Use '{cat_name}' as the noun for the referring expressions.
                        Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).

                        {caption}
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                    # {
                    #     "type": "image_url",
                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
                    # }
                ],
            }
        ],
    )

    ref_exp = response.choices[0].message.content.strip()

    # QA filtering
    # QA1: does the expression describe the intended object?
    filter = OpenAI()
    response1 = filter.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
336 |
+
},
|
337 |
+
],
|
338 |
+
}
|
339 |
+
],
|
340 |
+
)
|
341 |
+
|
342 |
+
response1_content = response1.choices[0].message.content
|
343 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
344 |
+
|
345 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
346 |
+
response2 = filter.chat.completions.create(
|
347 |
+
model="chatgpt-4o-latest",
|
348 |
+
messages=[
|
349 |
+
{
|
350 |
+
"role": "user",
|
351 |
+
"content": [
|
352 |
+
{
|
353 |
+
"type": "text",
|
354 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
355 |
+
{ref_exp}""",
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"type": "image_url",
|
359 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
360 |
+
},
|
361 |
+
],
|
362 |
+
}
|
363 |
+
],
|
364 |
+
)
|
365 |
+
|
366 |
+
response2_content = response2.choices[0].message.content
|
367 |
+
notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
|
368 |
+
|
369 |
+
isValid = True if describesHighlighted and notDescribesNotHighlighted else False
|
370 |
+
|
371 |
+
#print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
|
372 |
+
#print(f"ref exp: {ref_exp}")
|
373 |
+
#print("")
|
374 |
+
|
375 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
376 |
+
|
377 |
+
|
378 |
+
if __name__ == '__main__':
|
379 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
380 |
+
data = json.load(file)
|
381 |
+
|
382 |
+
vid_ids = list(data.keys())
|
383 |
+
all_ref_exps = {}
|
384 |
+
|
385 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
386 |
+
|
387 |
+
# 전체 데이터셋의 vid_id에 대해
|
388 |
+
for i in range(50):
|
389 |
+
vid_id = vid_ids[i]
|
390 |
+
|
391 |
+
#====캡션 만들기====
|
392 |
+
# print("=====================captioner========================")
|
393 |
+
captions, valid_obj_ids = getCaption(vid_id, data)
|
394 |
+
cats_in_vid = list(captions.keys())
|
395 |
+
# print()
|
396 |
+
|
397 |
+
#====referring expression 만들고 QA filtering====
|
398 |
+
# print("=====================referring expression generator & QA filter========================")
|
399 |
+
ref_expressions = {}
|
400 |
+
|
401 |
+
# 각 카테고리별로
|
402 |
+
for cat_name in cats_in_vid:
|
403 |
+
if cat_name not in ref_expressions:
|
404 |
+
ref_expressions[cat_name] = {}
|
405 |
+
# 각 비디오 프레임 별로
|
406 |
+
for frame_name in data[vid_id]['frame_names']:
|
407 |
+
# print(f'--------category: {cat_name}, frame_name: {frame_name}')
|
408 |
+
|
409 |
+
if frame_name not in ref_expressions[cat_name]:
|
410 |
+
ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
|
411 |
+
caption = captions[cat_name][frame_name]
|
412 |
+
if not caption : continue
|
413 |
+
else :
|
414 |
+
# 각 obj id별로
|
415 |
+
for obj_id in valid_obj_ids:
|
416 |
+
ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
|
417 |
+
ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
|
418 |
+
|
419 |
+
all_ref_exps[vid_id] = ref_expressions
|
420 |
+
|
421 |
+
|
422 |
+
with open('mbench/result_revised.json', 'w') as file:
|
423 |
+
json.dump(all_ref_exps, file, indent=4)
|
424 |
+
|
425 |
+
|
426 |
+
|
427 |
+
|
428 |
+
|
.history/mbench/gpt_ref-ytvos-revised_20250121155717.py
ADDED
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
import sys
|
10 |
+
from pathlib import Path
|
11 |
+
import os
|
12 |
+
from os import path as osp
|
13 |
+
import skimage
|
14 |
+
from io import BytesIO
|
15 |
+
|
16 |
+
import numpy as np
|
17 |
+
import pandas as pd
|
18 |
+
import regex as re
|
19 |
+
import json
|
20 |
+
|
21 |
+
import cv2
|
22 |
+
from PIL import Image, ImageDraw
|
23 |
+
import torch
|
24 |
+
from torchvision.transforms import functional as F
|
25 |
+
|
26 |
+
from skimage import measure # (pip install scikit-image)
|
27 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
28 |
+
|
29 |
+
import matplotlib.pyplot as plt
|
30 |
+
import matplotlib.patches as patches
|
31 |
+
from matplotlib.collections import PatchCollection
|
32 |
+
from matplotlib.patches import Rectangle
|
33 |
+
|
34 |
+
|
35 |
+
import ipywidgets as widgets
|
36 |
+
from IPython.display import display, clear_output
|
37 |
+
|
38 |
+
from openai import OpenAI
|
39 |
+
import base64
|
40 |
+
|
41 |
+
# Function to encode the image
|
42 |
+
def encode_image(image_path):
|
43 |
+
with open(image_path, "rb") as image_file:
|
44 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
45 |
+
|
46 |
+
# Captioner
|
47 |
+
ytvos_category_valid_list = [
|
48 |
+
'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
49 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
50 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
51 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
52 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
53 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
54 |
+
]
|
55 |
+
def getCaption(video_id, json_data):
|
56 |
+
#데이터 가져오기
|
57 |
+
video_data = json_data[video_id]
|
58 |
+
frame_names = video_data['frame_names']
|
59 |
+
video_path = video_data['video_path']
|
60 |
+
|
61 |
+
cat_names = set()
|
62 |
+
all_captions = dict()
|
63 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
64 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
65 |
+
|
66 |
+
# cat_names : person, snowboard
|
67 |
+
# 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
|
68 |
+
# 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
|
69 |
+
|
70 |
+
for cat_name in list(cat_names) :
|
71 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
72 |
+
image_captions = {}
|
73 |
+
|
74 |
+
captioner = OpenAI()
|
75 |
+
|
76 |
+
#0단계: action의 대상이 될 수 있는가?
|
77 |
+
is_movable = False
|
78 |
+
if cat_name in ytvos_category_valid_list :
|
79 |
+
is_movable = True
|
80 |
+
|
81 |
+
# response_check = captioner.chat.completions.create(
|
82 |
+
# model="gpt-4o",
|
83 |
+
# messages=[
|
84 |
+
# {
|
85 |
+
# "role": "user",
|
86 |
+
# "content": f"""
|
87 |
+
# Can a {cat_name} be a subject of distinct actions or movements?
|
88 |
+
# For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
|
89 |
+
# However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
|
90 |
+
# Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
|
91 |
+
# Answer only YES or NONE.
|
92 |
+
# """
|
93 |
+
# }
|
94 |
+
# ],
|
95 |
+
# )
|
96 |
+
# response_check_content = response_check.choices[0].message.content.strip().lower()
|
97 |
+
# print(f"Movable Check for {cat_name}: {response_check_content}")
|
98 |
+
|
99 |
+
# if response_check_content == "yes": is_movable = True
|
100 |
+
|
101 |
+
if not is_movable:
|
102 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.")
|
103 |
+
continue
|
104 |
+
|
105 |
+
for i in range(len(image_paths)):
|
106 |
+
image_path = image_paths[i]
|
107 |
+
frame_name = frame_names[i]
|
108 |
+
base64_image = encode_image(image_path)
|
109 |
+
|
110 |
+
#1단계: 필터링
|
111 |
+
#print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
112 |
+
response1 = captioner.chat.completions.create(
|
113 |
+
model="chatgpt-4o-latest",
|
114 |
+
messages=[
|
115 |
+
{
|
116 |
+
"role": "user",
|
117 |
+
"content": [
|
118 |
+
{
|
119 |
+
"type": "text",
|
120 |
+
|
121 |
+
"text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
|
122 |
+
Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
|
123 |
+
Each action should be unique and clearly associated with a specific object.
|
124 |
+
|
125 |
+
Respond with YES if:
|
126 |
+
- The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
|
127 |
+
- The {cat_name}s involve clear, distinguishable actions performed independently.
|
128 |
+
|
129 |
+
Respond with NONE if:
|
130 |
+
- The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
|
131 |
+
- Actions are ambiguous, minor, or not clearly visible.
|
132 |
+
|
133 |
+
If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
|
134 |
+
If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
|
135 |
+
|
136 |
+
Answer only YES or NONE."""
|
137 |
+
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"type": "image_url",
|
141 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
142 |
+
},
|
143 |
+
],
|
144 |
+
}
|
145 |
+
],
|
146 |
+
)
|
147 |
+
response_content = response1.choices[0].message.content
|
148 |
+
should_caption = True if "yes" in response_content.lower() else False
|
149 |
+
#print(f"are {cat_name}s distinguished by action: {response_content}")
|
150 |
+
|
151 |
+
#2단계: dense caption 만들기
|
152 |
+
if should_caption:
|
153 |
+
response2 = captioner.chat.completions.create(
|
154 |
+
model="chatgpt-4o-latest",
|
155 |
+
messages=[
|
156 |
+
{
|
157 |
+
"role": "user",
|
158 |
+
"content": [
|
159 |
+
{
|
160 |
+
"type": "text",
|
161 |
+
|
162 |
+
"text": f"""
|
163 |
+
Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
|
164 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
165 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
166 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
167 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
168 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
169 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
170 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
171 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
172 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
173 |
+
Output only the caption.""",
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"type": "image_url",
|
177 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
178 |
+
},
|
179 |
+
],
|
180 |
+
}
|
181 |
+
],
|
182 |
+
)
|
183 |
+
|
184 |
+
caption = response2.choices[0].message.content
|
185 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
186 |
+
else:
|
187 |
+
caption = None
|
188 |
+
|
189 |
+
image_captions[frame_name] = caption
|
190 |
+
all_captions[cat_name] = image_captions
|
191 |
+
|
192 |
+
# final : also prepare valid object ids
|
193 |
+
valid_obj_ids = []
|
194 |
+
valid_cat_names = list(all_captions.keys())
|
195 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
196 |
+
cat = video_data['annotations'][0][obj_id]['category_name']
|
197 |
+
if cat in valid_cat_names : valid_obj_ids.append(obj_id)
|
198 |
+
|
199 |
+
return all_captions, valid_obj_ids
|
200 |
+
|
201 |
+
# Referring expression generator and QA filter
|
202 |
+
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
|
203 |
+
|
204 |
+
# 이미지에 해당 물체 바운딩 박스 그리기
|
205 |
+
video_data = json_data[video_id]
|
206 |
+
frame_names = video_data['frame_names']
|
207 |
+
video_path = video_data['video_path']
|
208 |
+
I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
|
209 |
+
frame_indx = frame_names.index(frame_name)
|
210 |
+
obj_data = video_data['annotations'][frame_indx][obj_id]
|
211 |
+
|
212 |
+
bbox = obj_data['bbox']
|
213 |
+
cat_name = obj_data['category_name']
|
214 |
+
valid = obj_data['valid']
|
215 |
+
|
216 |
+
if valid == 0:
|
217 |
+
print("Object not in this frame!")
|
218 |
+
return {}
|
219 |
+
|
220 |
+
|
221 |
+
x_min, y_min, x_max, y_max = bbox
|
222 |
+
x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
|
223 |
+
cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
|
224 |
+
plt.figure()
|
225 |
+
plt.imshow(I)
|
226 |
+
plt.axis('off')
|
227 |
+
plt.show()
|
228 |
+
|
229 |
+
#cropped object for visibility check
|
230 |
+
cropped_I = I[y_min:y_max, x_min:x_max]
|
231 |
+
pil_cropped_I = Image.fromarray(cropped_I)
|
232 |
+
buff_crop = BytesIO()
|
233 |
+
pil_cropped_I.save(buff_crop, format='JPEG')
|
234 |
+
base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
|
235 |
+
|
236 |
+
#entire image for referring expression generation
|
237 |
+
pil_I = Image.fromarray(I)
|
238 |
+
buff = BytesIO()
|
239 |
+
pil_I.save(buff, format='JPEG')
|
240 |
+
base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
|
241 |
+
|
242 |
+
# 구분 가능 여부 확인
|
243 |
+
generator = OpenAI()
|
244 |
+
response_check = generator.chat.completions.create(
|
245 |
+
model="chatgpt-4o-latest",
|
246 |
+
messages=[
|
247 |
+
{
|
248 |
+
"role": "user",
|
249 |
+
"content": [
|
250 |
+
{
|
251 |
+
|
252 |
+
"type": "text",
|
253 |
+
"text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
|
254 |
+
Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
|
255 |
+
|
256 |
+
Guidelines:
|
257 |
+
- If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
|
258 |
+
- If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
|
259 |
+
- If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
|
260 |
+
|
261 |
+
Output only either YES or NONE.
|
262 |
+
"""
|
263 |
+
},
|
264 |
+
{
|
265 |
+
"type": "image_url",
|
266 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
|
267 |
+
}
|
268 |
+
]
|
269 |
+
},
|
270 |
+
]
|
271 |
+
)
|
272 |
+
|
273 |
+
response_check_content = response_check.choices[0].message.content.strip().lower()
|
274 |
+
#print(f"is object {obj_id} visible: {response_check_content}")
|
275 |
+
|
276 |
+
if "yes" not in response_check_content:
|
277 |
+
print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
|
278 |
+
return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
|
279 |
+
|
280 |
+
# Referring expression 만들기
|
281 |
+
# generator = OpenAI()
|
282 |
+
response = generator.chat.completions.create(
|
283 |
+
model="chatgpt-4o-latest",
|
284 |
+
messages=[
|
285 |
+
{
|
286 |
+
"role": "user",
|
287 |
+
"content": [
|
288 |
+
{
|
289 |
+
"type": "text",
|
290 |
+
|
291 |
+
"text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
|
292 |
+
Guidelines for creating the referring expression:
|
293 |
+
1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
|
294 |
+
2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
|
295 |
+
3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
|
296 |
+
4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
|
297 |
+
5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
|
298 |
+
6. Use '{cat_name}' as the noun for the referring expressions.
|
299 |
+
Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
|
300 |
+
|
301 |
+
{caption}
|
302 |
+
"""
|
303 |
+
},
|
304 |
+
{
|
305 |
+
"type": "image_url",
|
306 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
307 |
+
},
|
308 |
+
# {
|
309 |
+
# "type": "image_url",
|
310 |
+
# "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
|
311 |
+
# }
|
312 |
+
],
|
313 |
+
}
|
314 |
+
],
|
315 |
+
)
|
316 |
+
|
317 |
+
ref_exp = response.choices[0].message.content.strip()
|
318 |
+
|
319 |
+
#QA filtering
|
320 |
+
#QA1: 원하는 물체를 설명하는지
|
321 |
+
filter = OpenAI()
|
322 |
+
response1 = filter.chat.completions.create(
|
323 |
+
model="chatgpt-4o-latest",
|
324 |
+
messages=[
|
325 |
+
{
|
326 |
+
"role": "user",
|
327 |
+
"content": [
|
328 |
+
{
|
329 |
+
"type": "text",
|
330 |
+
"text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
|
331 |
+
{ref_exp}""",
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"type": "image_url",
|
335 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
336 |
+
},
|
337 |
+
],
|
338 |
+
}
|
339 |
+
],
|
340 |
+
)
|
341 |
+
|
342 |
+
response1_content = response1.choices[0].message.content
|
343 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
344 |
+
|
345 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
346 |
+
response2 = filter.chat.completions.create(
|
347 |
+
model="chatgpt-4o-latest",
|
348 |
+
messages=[
|
349 |
+
{
|
350 |
+
"role": "user",
|
351 |
+
"content": [
|
352 |
+
{
|
353 |
+
"type": "text",
|
354 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
355 |
+
{ref_exp}""",
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"type": "image_url",
|
359 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
360 |
+
},
|
361 |
+
],
|
362 |
+
}
|
363 |
+
],
|
364 |
+
)
|
365 |
+
|
366 |
+
response2_content = response2.choices[0].message.content
|
367 |
+
notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
|
368 |
+
|
369 |
+
isValid = True if describesHighlighted and notDescribesNotHighlighted else False
|
370 |
+
|
371 |
+
#print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
|
372 |
+
#print(f"ref exp: {ref_exp}")
|
373 |
+
#print("")
|
374 |
+
|
375 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
376 |
+
|
377 |
+
|
378 |
+
if __name__ == '__main__':
|
379 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
380 |
+
data = json.load(file)
|
381 |
+
|
382 |
+
vid_ids = list(data.keys())
|
383 |
+
all_ref_exps = {}
|
384 |
+
|
385 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
386 |
+
|
387 |
+
# 전체 데이터셋의 vid_id에 대해
|
388 |
+
for i in range(1):
|
389 |
+
vid_id = vid_ids[i]
|
390 |
+
|
391 |
+
#====캡션 만들기====
|
392 |
+
# print("=====================captioner========================")
|
393 |
+
captions, valid_obj_ids = getCaption(vid_id, data)
|
394 |
+
cats_in_vid = list(captions.keys())
|
395 |
+
# print()
|
396 |
+
|
397 |
+
#====referring expression 만들고 QA filtering====
|
398 |
+
# print("=====================referring expression generator & QA filter========================")
|
399 |
+
ref_expressions = {}
|
400 |
+
|
401 |
+
# 각 카테고리별로
|
402 |
+
for cat_name in cats_in_vid:
|
403 |
+
if cat_name not in ref_expressions:
|
404 |
+
ref_expressions[cat_name] = {}
|
405 |
+
# 각 비디오 프레임 별로
|
406 |
+
for frame_name in data[vid_id]['frame_names']:
|
407 |
+
# print(f'--------category: {cat_name}, frame_name: {frame_name}')
|
408 |
+
|
409 |
+
if frame_name not in ref_expressions[cat_name]:
|
410 |
+
ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
|
411 |
+
caption = captions[cat_name][frame_name]
|
412 |
+
if not caption : continue
|
413 |
+
else :
|
414 |
+
# 각 obj id별로
|
415 |
+
for obj_id in valid_obj_ids:
|
416 |
+
ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
|
417 |
+
ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
|
418 |
+
|
419 |
+
all_ref_exps[vid_id] = ref_expressions
|
420 |
+
|
421 |
+
|
422 |
+
with open('mbench/result_revised.json', 'w') as file:
|
423 |
+
json.dump(all_ref_exps, file, indent=4)
|
424 |
+
|
425 |
+
|
426 |
+
|
427 |
+
|
428 |
+
|
.history/mbench/gpt_ref-ytvos-revised_20250121155956.py
ADDED
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
import sys
|
10 |
+
from pathlib import Path
|
11 |
+
import os
|
12 |
+
from os import path as osp
|
13 |
+
import skimage
|
14 |
+
from io import BytesIO
|
15 |
+
|
16 |
+
import numpy as np
|
17 |
+
import pandas as pd
|
18 |
+
import regex as re
|
19 |
+
import json
|
20 |
+
|
21 |
+
import cv2
|
22 |
+
from PIL import Image, ImageDraw
|
23 |
+
import torch
|
24 |
+
from torchvision.transforms import functional as F
|
25 |
+
|
26 |
+
from skimage import measure # (pip install scikit-image)
|
27 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
28 |
+
|
29 |
+
import matplotlib.pyplot as plt
|
30 |
+
import matplotlib.patches as patches
|
31 |
+
from matplotlib.collections import PatchCollection
|
32 |
+
from matplotlib.patches import Rectangle
|
33 |
+
|
34 |
+
|
35 |
+
import ipywidgets as widgets
|
36 |
+
from IPython.display import display, clear_output
|
37 |
+
|
38 |
+
from openai import OpenAI
|
39 |
+
import base64
|
40 |
+
|
41 |
+
# Function to encode the image
|
42 |
+
def encode_image(image_path):
|
43 |
+
with open(image_path, "rb") as image_file:
|
44 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
45 |
+
|
46 |
+
# Captioner
|
47 |
+
ytvos_category_valid_list = [
|
48 |
+
'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
49 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
50 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
51 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
52 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
53 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
54 |
+
]
|
55 |
+
def getCaption(video_id, json_data):
|
56 |
+
#데이터 가져오기
|
57 |
+
video_data = json_data[video_id]
|
58 |
+
frame_names = video_data['frame_names']
|
59 |
+
video_path = video_data['video_path']
|
60 |
+
|
61 |
+
cat_names = set()
|
62 |
+
all_captions = dict()
|
63 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
64 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
65 |
+
|
66 |
+
# cat_names : person, snowboard
|
67 |
+
# 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
|
68 |
+
# 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
|
69 |
+
|
70 |
+
for cat_name in list(cat_names) :
|
71 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
72 |
+
image_captions = {}
|
73 |
+
|
74 |
+
captioner = OpenAI()
|
75 |
+
|
76 |
+
#0단계: action의 대상이 될 수 있는가?
|
77 |
+
is_movable = False
|
78 |
+
if cat_name in ytvos_category_valid_list :
|
79 |
+
is_movable = True
|
80 |
+
|
81 |
+
# response_check = captioner.chat.completions.create(
|
82 |
+
# model="gpt-4o",
|
83 |
+
# messages=[
|
84 |
+
# {
|
85 |
+
# "role": "user",
|
86 |
+
# "content": f"""
|
87 |
+
# Can a {cat_name} be a subject of distinct actions or movements?
|
88 |
+
# For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
|
89 |
+
# However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
|
90 |
+
# Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
|
91 |
+
# Answer only YES or NONE.
|
92 |
+
# """
|
93 |
+
# }
|
94 |
+
# ],
|
95 |
+
# )
|
96 |
+
# response_check_content = response_check.choices[0].message.content.strip().lower()
|
97 |
+
# print(f"Movable Check for {cat_name}: {response_check_content}")
|
98 |
+
|
99 |
+
# if response_check_content == "yes": is_movable = True
|
100 |
+
|
101 |
+
if not is_movable:
|
102 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.")
|
103 |
+
continue
|
104 |
+
|
105 |
+
for i in range(len(image_paths)):
|
106 |
+
image_path = image_paths[i]
|
107 |
+
frame_name = frame_names[i]
|
108 |
+
base64_image = encode_image(image_path)
|
109 |
+
|
110 |
+
#1단계: 필터링
|
111 |
+
#print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
112 |
+
response1 = captioner.chat.completions.create(
|
113 |
+
model="chatgpt-4o-latest",
|
114 |
+
messages=[
|
115 |
+
{
|
116 |
+
"role": "user",
|
117 |
+
"content": [
|
118 |
+
{
|
119 |
+
"type": "text",
|
120 |
+
|
121 |
+
"text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
|
122 |
+
Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
|
123 |
+
Each action should be unique and clearly associated with a specific object.
|
124 |
+
|
125 |
+
Respond with YES if:
|
126 |
+
- The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
|
127 |
+
- The {cat_name}s involve clear, distinguishable actions performed independently.
|
128 |
+
|
129 |
+
Respond with NONE if:
|
130 |
+
- The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
|
131 |
+
- Actions are ambiguous, minor, or not clearly visible.
|
132 |
+
|
133 |
+
If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
|
134 |
+
If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
|
135 |
+
|
136 |
+
Answer only YES or NONE."""
|
137 |
+
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"type": "image_url",
|
141 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
142 |
+
},
|
143 |
+
],
|
144 |
+
}
|
145 |
+
],
|
146 |
+
)
|
147 |
+
response_content = response1.choices[0].message.content
|
148 |
+
should_caption = True if "yes" in response_content.lower() else False
|
149 |
+
#print(f"are {cat_name}s distinguished by action: {response_content}")
|
150 |
+
|
151 |
+
#2단계: dense caption 만들기
|
152 |
+
if should_caption:
|
153 |
+
response2 = captioner.chat.completions.create(
|
154 |
+
model="chatgpt-4o-latest",
|
155 |
+
messages=[
|
156 |
+
{
|
157 |
+
"role": "user",
|
158 |
+
"content": [
|
159 |
+
{
|
160 |
+
"type": "text",
|
161 |
+
|
162 |
+
"text": f"""
|
163 |
+
Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
|
164 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
165 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
166 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
167 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
168 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
169 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
170 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
171 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
172 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
173 |
+
Output only the caption.""",
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"type": "image_url",
|
177 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
178 |
+
},
|
179 |
+
],
|
180 |
+
}
|
181 |
+
],
|
182 |
+
)
|
183 |
+
|
184 |
+
caption = response2.choices[0].message.content
|
185 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
186 |
+
else:
|
187 |
+
caption = None
|
188 |
+
|
189 |
+
image_captions[frame_name] = caption
|
190 |
+
all_captions[cat_name] = image_captions
|
191 |
+
|
192 |
+
# final : also prepare valid object ids
|
193 |
+
valid_obj_ids = []
|
194 |
+
valid_cat_names = list(all_captions.keys())
|
195 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
196 |
+
cat = video_data['annotations'][0][obj_id]['category_name']
|
197 |
+
if cat in valid_cat_names : valid_obj_ids.append(obj_id)
|
198 |
+
|
199 |
+
return all_captions, valid_obj_ids
|
200 |
+
|
201 |
+
# Referring expression generator and QA filter
|
202 |
+
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
|
203 |
+
|
204 |
+
# 이미지에 해당 물체 바운딩 박스 그리기
|
205 |
+
video_data = json_data[video_id]
|
206 |
+
frame_names = video_data['frame_names']
|
207 |
+
video_path = video_data['video_path']
|
208 |
+
I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
|
209 |
+
frame_indx = frame_names.index(frame_name)
|
210 |
+
obj_data = video_data['annotations'][frame_indx][obj_id]
|
211 |
+
|
212 |
+
bbox = obj_data['bbox']
|
213 |
+
cat_name = obj_data['category_name']
|
214 |
+
valid = obj_data['valid']
|
215 |
+
|
216 |
+
if valid == 0:
|
217 |
+
print("Object not in this frame!")
|
218 |
+
return {}
|
219 |
+
|
220 |
+
|
221 |
+
x_min, y_min, x_max, y_max = bbox
|
222 |
+
x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
|
223 |
+
cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
|
224 |
+
plt.figure()
|
225 |
+
plt.imshow(I)
|
226 |
+
plt.axis('off')
|
227 |
+
plt.show()
|
228 |
+
|
229 |
+
#cropped object for visibility check
|
230 |
+
cropped_I = I[y_min:y_max, x_min:x_max]
|
231 |
+
pil_cropped_I = Image.fromarray(cropped_I)
|
232 |
+
buff_crop = BytesIO()
|
233 |
+
pil_cropped_I.save(buff_crop, format='JPEG')
|
234 |
+
base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
|
235 |
+
|
236 |
+
#entire image for referring expression generation
|
237 |
+
pil_I = Image.fromarray(I)
|
238 |
+
buff = BytesIO()
|
239 |
+
pil_I.save(buff, format='JPEG')
|
240 |
+
base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
|
241 |
+
|
242 |
+
# 구분 가능 여부 확인
|
243 |
+
generator = OpenAI()
|
244 |
+
response_check = generator.chat.completions.create(
|
245 |
+
model="chatgpt-4o-latest",
|
246 |
+
messages=[
|
247 |
+
{
|
248 |
+
"role": "user",
|
249 |
+
"content": [
|
250 |
+
{
|
251 |
+
|
252 |
+
"type": "text",
|
253 |
+
"text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
|
254 |
+
Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
|
255 |
+
|
256 |
+
Guidelines:
|
257 |
+
- If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
|
258 |
+
- If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
|
259 |
+
- If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
|
260 |
+
|
261 |
+
Output only either YES or NONE.
|
262 |
+
"""
|
263 |
+
},
|
264 |
+
{
|
265 |
+
"type": "image_url",
|
266 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
|
267 |
+
}
|
268 |
+
]
|
269 |
+
},
|
270 |
+
]
|
271 |
+
)
|
272 |
+
|
273 |
+
response_check_content = response_check.choices[0].message.content.strip().lower()
|
274 |
+
#print(f"is object {obj_id} visible: {response_check_content}")
|
275 |
+
|
276 |
+
if "yes" not in response_check_content:
|
277 |
+
print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
|
278 |
+
return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
|
279 |
+
|
280 |
+
# Referring expression 만들기
|
281 |
+
# generator = OpenAI()
|
282 |
+
response = generator.chat.completions.create(
|
283 |
+
model="chatgpt-4o-latest",
|
284 |
+
messages=[
|
285 |
+
{
|
286 |
+
"role": "user",
|
287 |
+
"content": [
|
288 |
+
{
|
289 |
+
"type": "text",
|
290 |
+
|
291 |
+
"text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
|
292 |
+
Guidelines for creating the referring expression:
|
293 |
+
1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
|
294 |
+
2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
|
295 |
+
3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
|
296 |
+
4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
|
297 |
+
5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
|
298 |
+
6. Use '{cat_name}' as the noun for the referring expressions.
|
299 |
+
Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
|
300 |
+
|
301 |
+
{caption}
|
302 |
+
"""
|
303 |
+
},
|
304 |
+
{
|
305 |
+
"type": "image_url",
|
306 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
307 |
+
},
|
308 |
+
# {
|
309 |
+
# "type": "image_url",
|
310 |
+
# "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
|
311 |
+
# }
|
312 |
+
],
|
313 |
+
}
|
314 |
+
],
|
315 |
+
)
|
316 |
+
|
317 |
+
ref_exp = response.choices[0].message.content.strip()
|
318 |
+
|
319 |
+
#QA filtering
|
320 |
+
#QA1: 원하는 물체를 설명하는지
|
321 |
+
filter = OpenAI()
|
322 |
+
response1 = filter.chat.completions.create(
|
323 |
+
model="chatgpt-4o-latest",
|
324 |
+
messages=[
|
325 |
+
{
|
326 |
+
"role": "user",
|
327 |
+
"content": [
|
328 |
+
{
|
329 |
+
"type": "text",
|
330 |
+
"text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
|
331 |
+
{ref_exp}""",
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"type": "image_url",
|
335 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
336 |
+
},
|
337 |
+
],
|
338 |
+
}
|
339 |
+
],
|
340 |
+
)
|
341 |
+
|
342 |
+
response1_content = response1.choices[0].message.content
|
343 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
344 |
+
|
345 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
346 |
+
response2 = filter.chat.completions.create(
|
347 |
+
model="chatgpt-4o-latest",
|
348 |
+
messages=[
|
349 |
+
{
|
350 |
+
"role": "user",
|
351 |
+
"content": [
|
352 |
+
{
|
353 |
+
"type": "text",
|
354 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
355 |
+
{ref_exp}""",
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"type": "image_url",
|
359 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
360 |
+
},
|
361 |
+
],
|
362 |
+
}
|
363 |
+
],
|
364 |
+
)
|
365 |
+
|
366 |
+
response2_content = response2.choices[0].message.content
|
367 |
+
notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
|
368 |
+
|
369 |
+
isValid = True if describesHighlighted and notDescribesNotHighlighted else False
|
370 |
+
|
371 |
+
#print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
|
372 |
+
#print(f"ref exp: {ref_exp}")
|
373 |
+
#print("")
|
374 |
+
|
375 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
376 |
+
|
377 |
+
|
378 |
+
if __name__ == '__main__':
|
379 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
380 |
+
data = json.load(file)
|
381 |
+
|
382 |
+
vid_ids = list(data.keys())
|
383 |
+
all_ref_exps = {}
|
384 |
+
|
385 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
386 |
+
|
387 |
+
# 전체 데이터셋의 vid_id에 대해
|
388 |
+
for i in range(1):
|
389 |
+
vid_id = vid_ids[i]
|
390 |
+
|
391 |
+
#====캡션 만들기====
|
392 |
+
# print("=====================captioner========================")
|
393 |
+
captions, valid_obj_ids = getCaption(vid_id, data)
|
394 |
+
cats_in_vid = list(captions.keys())
|
395 |
+
# print()
|
396 |
+
|
397 |
+
#====referring expression 만들고 QA filtering====
|
398 |
+
# print("=====================referring expression generator & QA filter========================")
|
399 |
+
ref_expressions = {}
|
400 |
+
|
401 |
+
# 각 카테고리별로
|
402 |
+
for cat_name in cats_in_vid:
|
403 |
+
if cat_name not in ref_expressions:
|
404 |
+
ref_expressions[cat_name] = {}
|
405 |
+
# 각 비디오 프레임 별로
|
406 |
+
for frame_name in data[vid_id]['frame_names']:
|
407 |
+
# print(f'--------category: {cat_name}, frame_name: {frame_name}')
|
408 |
+
|
409 |
+
if frame_name not in ref_expressions[cat_name]:
|
410 |
+
ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
|
411 |
+
caption = captions[cat_name][frame_name]
|
412 |
+
if not caption : continue
|
413 |
+
else :
|
414 |
+
# 각 obj id별로
|
415 |
+
for obj_id in valid_obj_ids:
|
416 |
+
ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
|
417 |
+
ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
|
418 |
+
|
419 |
+
all_ref_exps[vid_id] = ref_expressions
|
420 |
+
|
421 |
+
|
422 |
+
with open('mbench/result_revised.json', 'w') as file:
|
423 |
+
json.dump(all_ref_exps, file, indent=4)
|
424 |
+
|
425 |
+
|
426 |
+
|
427 |
+
|
428 |
+
|
.history/mbench/gpt_ref-ytvos-revised_20250121160813.py
ADDED
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
import sys
|
10 |
+
from pathlib import Path
|
11 |
+
import os
|
12 |
+
from os import path as osp
|
13 |
+
import skimage
|
14 |
+
from io import BytesIO
|
15 |
+
|
16 |
+
import numpy as np
|
17 |
+
import pandas as pd
|
18 |
+
import regex as re
|
19 |
+
import json
|
20 |
+
|
21 |
+
import cv2
|
22 |
+
from PIL import Image, ImageDraw
|
23 |
+
import torch
|
24 |
+
from torchvision.transforms import functional as F
|
25 |
+
|
26 |
+
from skimage import measure # (pip install scikit-image)
|
27 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
28 |
+
|
29 |
+
import matplotlib.pyplot as plt
|
30 |
+
import matplotlib.patches as patches
|
31 |
+
from matplotlib.collections import PatchCollection
|
32 |
+
from matplotlib.patches import Rectangle
|
33 |
+
|
34 |
+
|
35 |
+
import ipywidgets as widgets
|
36 |
+
from IPython.display import display, clear_output
|
37 |
+
|
38 |
+
from openai import OpenAI
|
39 |
+
import base64
|
40 |
+
|
41 |
+
# Function to encode the image
|
42 |
+
def encode_image(image_path):
|
43 |
+
with open(image_path, "rb") as image_file:
|
44 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
45 |
+
|
46 |
+
# Captioner
|
47 |
+
ytvos_category_valid_list = [
|
48 |
+
'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
49 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
50 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
51 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
52 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
53 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
54 |
+
]
|
55 |
+
def getCaption(video_id, json_data):
|
56 |
+
#데이터 가져오기
|
57 |
+
video_data = json_data[video_id]
|
58 |
+
frame_names = video_data['frame_names']
|
59 |
+
video_path = video_data['video_path']
|
60 |
+
|
61 |
+
cat_names = set()
|
62 |
+
all_captions = dict()
|
63 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
64 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
65 |
+
|
66 |
+
# cat_names : person, snowboard
|
67 |
+
# 1. gpt에서 직접 action의 대상이 될 수 있는가 물어보기
|
68 |
+
# 2. ref-youtube-vos 에서 제공하는 카테고리 정보에서 우리가 처리하고 싶은 카테고리 이름만 남긴다
|
69 |
+
|
70 |
+
for cat_name in list(cat_names) :
|
71 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
72 |
+
image_captions = {}
|
73 |
+
|
74 |
+
captioner = OpenAI()
|
75 |
+
|
76 |
+
#0단계: action의 대상이 될 수 있는가?
|
77 |
+
is_movable = False
|
78 |
+
if cat_name in ytvos_category_valid_list :
|
79 |
+
is_movable = True
|
80 |
+
|
81 |
+
# response_check = captioner.chat.completions.create(
|
82 |
+
# model="gpt-4o",
|
83 |
+
# messages=[
|
84 |
+
# {
|
85 |
+
# "role": "user",
|
86 |
+
# "content": f"""
|
87 |
+
# Can a {cat_name} be a subject of distinct actions or movements?
|
88 |
+
# For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
|
89 |
+
# However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
|
90 |
+
# Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
|
91 |
+
# Answer only YES or NONE.
|
92 |
+
# """
|
93 |
+
# }
|
94 |
+
# ],
|
95 |
+
# )
|
96 |
+
# response_check_content = response_check.choices[0].message.content.strip().lower()
|
97 |
+
# print(f"Movable Check for {cat_name}: {response_check_content}")
|
98 |
+
|
99 |
+
# if response_check_content == "yes": is_movable = True
|
100 |
+
|
101 |
+
if not is_movable:
|
102 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.")
|
103 |
+
continue
|
104 |
+
|
105 |
+
for i in range(len(image_paths)):
|
106 |
+
image_path = image_paths[i]
|
107 |
+
frame_name = frame_names[i]
|
108 |
+
base64_image = encode_image(image_path)
|
109 |
+
|
110 |
+
#1단계: 필터링
|
111 |
+
#print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
112 |
+
response1 = captioner.chat.completions.create(
|
113 |
+
model="chatgpt-4o-latest",
|
114 |
+
messages=[
|
115 |
+
{
|
116 |
+
"role": "user",
|
117 |
+
"content": [
|
118 |
+
{
|
119 |
+
"type": "text",
|
120 |
+
|
121 |
+
"text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
|
122 |
+
Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
|
123 |
+
Each action should be unique and clearly associated with a specific object.
|
124 |
+
|
125 |
+
Respond with YES if:
|
126 |
+
- The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
|
127 |
+
- The {cat_name}s involve clear, distinguishable actions performed independently.
|
128 |
+
|
129 |
+
Respond with NONE if:
|
130 |
+
- The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
|
131 |
+
- Actions are ambiguous, minor, or not clearly visible.
|
132 |
+
|
133 |
+
If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
|
134 |
+
If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
|
135 |
+
|
136 |
+
Answer only YES or NONE."""
|
137 |
+
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"type": "image_url",
|
141 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
142 |
+
},
|
143 |
+
],
|
144 |
+
}
|
145 |
+
],
|
146 |
+
)
|
147 |
+
response_content = response1.choices[0].message.content
|
148 |
+
should_caption = True if "yes" in response_content.lower() else False
|
149 |
+
#print(f"are {cat_name}s distinguished by action: {response_content}")
|
150 |
+
|
151 |
+
#2단계: dense caption 만들기
|
152 |
+
if should_caption:
|
153 |
+
response2 = captioner.chat.completions.create(
|
154 |
+
model="chatgpt-4o-latest",
|
155 |
+
messages=[
|
156 |
+
{
|
157 |
+
"role": "user",
|
158 |
+
"content": [
|
159 |
+
{
|
160 |
+
"type": "text",
|
161 |
+
|
162 |
+
"text": f"""
|
163 |
+
Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
|
164 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
165 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
166 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
167 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
168 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
169 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
170 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
171 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
172 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
173 |
+
Output only the caption.""",
|
174 |
+
},
|
175 |
+
{
|
176 |
+
"type": "image_url",
|
177 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
178 |
+
},
|
179 |
+
],
|
180 |
+
}
|
181 |
+
],
|
182 |
+
)
|
183 |
+
|
184 |
+
caption = response2.choices[0].message.content
|
185 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
186 |
+
else:
|
187 |
+
caption = None
|
188 |
+
|
189 |
+
image_captions[frame_name] = caption
|
190 |
+
all_captions[cat_name] = image_captions
|
191 |
+
|
192 |
+
# final : also prepare valid object ids
|
193 |
+
valid_obj_ids = []
|
194 |
+
valid_cat_names = list(all_captions.keys())
|
195 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
196 |
+
cat = video_data['annotations'][0][obj_id]['category_name']
|
197 |
+
if cat in valid_cat_names : valid_obj_ids.append(obj_id)
|
198 |
+
|
199 |
+
return all_captions, valid_obj_ids
|
200 |
+
|
+# Referring expression generator and QA filter
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+
+    # Draw the bounding box of the target object on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+
+    # Cropped object for the visibility check
+    cropped_I = I[y_min:y_max, x_min:x_max]
+    pil_cropped_I = Image.fromarray(cropped_I)
+    buff_crop = BytesIO()
+    pil_cropped_I.save(buff_crop, format='JPEG')
+    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
+
+    # Entire image for referring expression generation
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # Check whether the cropped object can be clearly identified
+    generator = OpenAI()
+    response_check = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+
+                        "type": "text",
+                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
+Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
+
+Guidelines:
+- If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
+- If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
+- If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
+
+Output only either YES or NONE.
+"""
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    }
+                ]
+            },
+        ]
+    )
+
+    response_check_content = response_check.choices[0].message.content.strip().lower()
+    #print(f"is object {obj_id} visible: {response_check_content}")
+
+    if "yes" not in response_check_content:
+        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
+        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
+
+    # Generate the referring expression
+    # generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
+Guidelines for creating the referring expression:
+1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
+2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
+3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
+4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
+5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
+6. Use '{cat_name}' as the noun for the referring expressions.
+Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
+
+{caption}
+"""
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                    # {
+                    #     "type": "image_url",
+                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    # }
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content.strip()
+
+    # QA filtering
+    # QA1: does the expression describe the intended (highlighted) object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing the non-highlighted objects?
+    response2 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
+
+    isValid = True if describesHighlighted and notDescribesNotHighlighted else False
+
+    #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
+    #print(f"ref exp: {ref_exp}")
+    #print("")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
+
+
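getRefExp can return three shapes: an empty dict when the object is absent from the frame (valid == 0), a placeholder with "ref_exp": "NONE" and "isValid": False when the crop check fails, and a full result whose "isValid" flag reflects the two QA checks. A short sketch of how a caller might keep only the usable expressions; the collect_valid helper is illustrative and not part of the script:

def collect_valid(ref_expressions):
    """Flatten {cat_name: {frame_name: {obj_id: result}}} into a list of QA-passing results."""
    kept = []
    for cat_name, frames in ref_expressions.items():
        for frame_name, objs in frames.items():
            for obj_id, result in objs.items():
                # Skip objects missing from the frame ({}) and results that failed the QA filters.
                if result and result.get("isValid"):
                    kept.append({"obj_id": obj_id, **result})
    return kept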
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    vid_ids = list(data.keys())
+    all_ref_exps = {}
+
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    # For every vid_id in the dataset
+    for i in range(50):
+        vid_id = vid_ids[i]
+
+        # ==== Generate captions ====
+        # print("=====================captioner========================")
+        captions, valid_obj_ids = getCaption(vid_id, data)
+        cats_in_vid = list(captions.keys())
+        # print()
+
+        # ==== Generate referring expressions and QA filtering ====
+        # print("=====================referring expression generator & QA filter========================")
+        ref_expressions = {}
+
+        # For each category
+        for cat_name in cats_in_vid:
+            if cat_name not in ref_expressions:
+                ref_expressions[cat_name] = {}
+            # For each video frame
+            for frame_name in data[vid_id]['frame_names']:
+                # print(f'--------category: {cat_name}, frame_name: {frame_name}')
+
+                if frame_name not in ref_expressions[cat_name]:
+                    ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
+                caption = captions[cat_name][frame_name]
+                if not caption : continue
+                else :
+                    # For each object id
+                    for obj_id in valid_obj_ids:
+                        ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
+                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
+
+        all_ref_exps[vid_id] = ref_expressions
+
+
+    with open('mbench/result_revised.json', 'w') as file:
+        json.dump(all_ref_exps, file, indent=4)
+
+
+
+
+
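The script persists all_ref_exps to mbench/result_revised.json, nested as video id, then category, then frame, then object id. A minimal sketch of reading that file back and counting how many expressions survived the QA filters; the path is taken from the script above and everything else is illustrative:

import json

with open('mbench/result_revised.json', 'r') as f:
    results = json.load(f)

n_total, n_valid = 0, 0
for vid_id, cats in results.items():
    for cat_name, frames in cats.items():
        for frame_name, objs in frames.items():
            for obj_id, result in objs.items():
                if not result:  # object absent from this frame
                    continue
                n_total += 1
                n_valid += bool(result.get("isValid"))
print(f"{n_valid}/{n_total} referring expressions passed QA filtering")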
.history/mbench/gpt_ref-ytvos_20250119070213.py
ADDED
@@ -0,0 +1,277 @@
1 |
+
from datasets import build_dataset
|
2 |
+
import argparse
|
3 |
+
import opts
|
4 |
+
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
import os
|
8 |
+
from os import path as osp
|
9 |
+
import skimage
|
10 |
+
from io import BytesIO
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
import pandas as pd
|
14 |
+
import regex as re
|
15 |
+
import json
|
16 |
+
|
17 |
+
import cv2
|
18 |
+
from PIL import Image, ImageDraw
|
19 |
+
import torch
|
20 |
+
from torchvision.transforms import functional as F
|
21 |
+
|
22 |
+
from skimage import measure # (pip install scikit-image)
|
23 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
24 |
+
|
25 |
+
import matplotlib.pyplot as plt
|
26 |
+
import matplotlib.patches as patches
|
27 |
+
from matplotlib.collections import PatchCollection
|
28 |
+
from matplotlib.patches import Rectangle
|
29 |
+
|
30 |
+
|
31 |
+
import ipywidgets as widgets
|
32 |
+
from IPython.display import display, clear_output
|
33 |
+
|
34 |
+
from openai import OpenAI
|
35 |
+
import base64
|
36 |
+
|
37 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
38 |
+
|
39 |
+
# Function to encode the image
|
40 |
+
def encode_image(image_path):
|
41 |
+
with open(image_path, "rb") as image_file:
|
42 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
43 |
+
|
44 |
+
def getCaption(video_id, json_data):
|
45 |
+
#데이터 가져오기
|
46 |
+
video_data = json_data[video_id]
|
47 |
+
frame_names = video_data['frame_names']
|
48 |
+
video_path = video_data['video_path']
|
49 |
+
|
50 |
+
cat_names = set()
|
51 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
52 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
53 |
+
|
54 |
+
if len(cat_names) == 1:
|
55 |
+
cat_name = next(iter(cat_names))
|
56 |
+
else:
|
57 |
+
print("more than 2 categories")
|
58 |
+
return -1
|
59 |
+
|
60 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
61 |
+
image_captions = {}
|
62 |
+
|
63 |
+
captioner = OpenAI()
|
64 |
+
for i in range(len(image_paths)):
|
65 |
+
image_path = image_paths[i]
|
66 |
+
frame_name = frame_names[i]
|
67 |
+
base64_image = encode_image(image_path)
|
68 |
+
|
69 |
+
#1단계: 필터링
|
70 |
+
response1 = captioner.chat.completions.create(
|
71 |
+
model="gpt-4o-mini",
|
72 |
+
messages=[
|
73 |
+
{
|
74 |
+
"role": "user",
|
75 |
+
"content": [
|
76 |
+
{
|
77 |
+
"type": "text",
|
78 |
+
"text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"type": "image_url",
|
82 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
83 |
+
},
|
84 |
+
],
|
85 |
+
}
|
86 |
+
],
|
87 |
+
)
|
88 |
+
response_content = response1.choices[0].message.content
|
89 |
+
should_caption = True if "yes" in response_content.lower() else False
|
90 |
+
|
91 |
+
#2단계: dense caption 만들기
|
92 |
+
if should_caption:
|
93 |
+
response2 = captioner.chat.completions.create(
|
94 |
+
model="gpt-4o-mini",
|
95 |
+
messages=[
|
96 |
+
{
|
97 |
+
"role": "user",
|
98 |
+
"content": [
|
99 |
+
{
|
100 |
+
"type": "text",
|
101 |
+
"text": f"""
|
102 |
+
Describe the image in detail focusing on the {cat_name}s' actions.
|
103 |
+
1. Each action should be prominent, clear and unique, describing the corresponding object only.
|
104 |
+
2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
|
105 |
+
3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
|
106 |
+
4. Do not include actions that need to be guessed or suggested.""",
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"type": "image_url",
|
110 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
111 |
+
},
|
112 |
+
],
|
113 |
+
}
|
114 |
+
],
|
115 |
+
)
|
116 |
+
|
117 |
+
caption = response2.choices[0].message.content
|
118 |
+
else:
|
119 |
+
caption = None
|
120 |
+
|
121 |
+
image_captions[frame_name] = caption
|
122 |
+
return image_captions
|
123 |
+
|
124 |
+
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
|
125 |
+
# 이미지에 해당 물체 바운딩 박스 그리기
|
126 |
+
video_data = json_data[video_id]
|
127 |
+
frame_names = video_data['frame_names']
|
128 |
+
video_path = video_data['video_path']
|
129 |
+
I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
|
130 |
+
frame_indx = frame_names.index(frame_name)
|
131 |
+
obj_data = video_data['annotations'][frame_indx][obj_id]
|
132 |
+
|
133 |
+
bbox = obj_data['bbox']
|
134 |
+
cat_name = obj_data['category_name']
|
135 |
+
valid = obj_data['valid']
|
136 |
+
|
137 |
+
if valid == 0:
|
138 |
+
print("Object not in this frame!")
|
139 |
+
return {}
|
140 |
+
|
141 |
+
|
142 |
+
x_min, y_min, x_max, y_max = bbox
|
143 |
+
x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
|
144 |
+
cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
|
145 |
+
plt.figure()
|
146 |
+
plt.imshow(I)
|
147 |
+
plt.axis('off')
|
148 |
+
plt.show()
|
149 |
+
pil_I = Image.fromarray(I)
|
150 |
+
buff = BytesIO()
|
151 |
+
pil_I.save(buff, format='JPEG')
|
152 |
+
base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
|
153 |
+
|
154 |
+
#ref expression 만들기
|
155 |
+
generator = OpenAI()
|
156 |
+
response = generator.chat.completions.create(
|
157 |
+
model="gpt-4o-mini",
|
158 |
+
messages=[
|
159 |
+
{
|
160 |
+
"role": "user",
|
161 |
+
"content": [
|
162 |
+
{
|
163 |
+
"type": "text",
|
164 |
+
"text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
|
165 |
+
1. The referring expression describes the action and does not contain information about appearance or location in the picture.
|
166 |
+
2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
|
167 |
+
3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
|
168 |
+
4. The referring expression should only describe the highlighted {cat_name} and not any other.
|
169 |
+
5. Use '{cat_name}' as the noun for the referring expressions.
|
170 |
+
Output only the referring expression.
|
171 |
+
{caption}""",
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"type": "image_url",
|
175 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
176 |
+
},
|
177 |
+
],
|
178 |
+
}
|
179 |
+
],
|
180 |
+
)
|
181 |
+
|
182 |
+
ref_exp = response.choices[0].message.content
|
183 |
+
|
184 |
+
#QA filtering
|
185 |
+
#QA1: 원하는 물체를 설명하는지
|
186 |
+
filter = OpenAI()
|
187 |
+
response1 = filter.chat.completions.create(
|
188 |
+
model="gpt-4o-mini",
|
189 |
+
messages=[
|
190 |
+
{
|
191 |
+
"role": "user",
|
192 |
+
"content": [
|
193 |
+
{
|
194 |
+
"type": "text",
|
195 |
+
"text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
|
196 |
+
{ref_exp}""",
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"type": "image_url",
|
200 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
201 |
+
},
|
202 |
+
],
|
203 |
+
}
|
204 |
+
],
|
205 |
+
)
|
206 |
+
|
207 |
+
response1_content = response1.choices[0].message.content
|
208 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
209 |
+
|
210 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
211 |
+
response2 = filter.chat.completions.create(
|
212 |
+
model="gpt-4o-mini",
|
213 |
+
messages=[
|
214 |
+
{
|
215 |
+
"role": "user",
|
216 |
+
"content": [
|
217 |
+
{
|
218 |
+
"type": "text",
|
219 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
220 |
+
{ref_exp}""",
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"type": "image_url",
|
224 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
225 |
+
},
|
226 |
+
],
|
227 |
+
}
|
228 |
+
],
|
229 |
+
)
|
230 |
+
|
231 |
+
response2_content = response2.choices[0].message.content
|
232 |
+
describesNotHighlighted = True if "yes" in response2_content.lower() else False
|
233 |
+
|
234 |
+
isValid = True if describesHighlighted and not describesNotHighlighted else False
|
235 |
+
|
236 |
+
print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
|
237 |
+
|
238 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
239 |
+
|
240 |
+
def createRefExp(video_id, json_data):
|
241 |
+
video_data = json_data[video_id]
|
242 |
+
obj_ids = list(video_data['annotations'][0].keys())
|
243 |
+
frame_names = video_data['frame_names']
|
244 |
+
|
245 |
+
captions_per_frame = getCaption(video_id, json_data)
|
246 |
+
|
247 |
+
if captions_per_frame == -1:
|
248 |
+
print("There are more than 2 cateories")
|
249 |
+
return
|
250 |
+
|
251 |
+
|
252 |
+
video_ref_exps = {}
|
253 |
+
|
254 |
+
for frame_name in frame_names:
|
255 |
+
frame_caption = captions_per_frame[frame_name]
|
256 |
+
|
257 |
+
if frame_caption == None:
|
258 |
+
video_ref_exps[frame_name] = None
|
259 |
+
|
260 |
+
else:
|
261 |
+
frame_ref_exps = {}
|
262 |
+
for obj_id in obj_ids:
|
263 |
+
exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
|
264 |
+
frame_ref_exps[obj_id] = exp_per_obj
|
265 |
+
video_ref_exps[frame_name] = frame_ref_exps
|
266 |
+
|
267 |
+
return video_ref_exps
|
268 |
+
|
269 |
+
if __name__ == '__main__':
|
270 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
271 |
+
data = json.load(file)
|
272 |
+
|
273 |
+
all_video_refs = {}
|
274 |
+
for i in range(10):
|
275 |
+
video_id = list(data.keys())[i]
|
276 |
+
video_ref = createRefExp(video_id, data)
|
277 |
+
all_video_refs[video_id] = video_ref
|
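This snapshot accumulates all_video_refs in memory but never writes it out; the 20250119071214 snapshot below adds a save step. For completeness, a minimal sketch of persisting the result at the end of the __main__ block, with the output path mirroring that later snapshot and otherwise an assumption:

import json

# Assumed output location; later revisions in this history write a similar file.
with open('mbench/result.json', 'w') as f:
    json.dump(all_video_refs, f, indent=4)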
.history/mbench/gpt_ref-ytvos_20250119070707.py
ADDED
@@ -0,0 +1,282 @@
1 |
+
from datasets import build_dataset
|
2 |
+
import argparse
|
3 |
+
import opts
|
4 |
+
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
import os
|
8 |
+
from os import path as osp
|
9 |
+
import skimage
|
10 |
+
from io import BytesIO
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
import pandas as pd
|
14 |
+
import regex as re
|
15 |
+
import json
|
16 |
+
|
17 |
+
import cv2
|
18 |
+
from PIL import Image, ImageDraw
|
19 |
+
import torch
|
20 |
+
from torchvision.transforms import functional as F
|
21 |
+
|
22 |
+
from skimage import measure # (pip install scikit-image)
|
23 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
24 |
+
|
25 |
+
import matplotlib.pyplot as plt
|
26 |
+
import matplotlib.patches as patches
|
27 |
+
from matplotlib.collections import PatchCollection
|
28 |
+
from matplotlib.patches import Rectangle
|
29 |
+
|
30 |
+
|
31 |
+
import ipywidgets as widgets
|
32 |
+
from IPython.display import display, clear_output
|
33 |
+
|
34 |
+
from openai import OpenAI
|
35 |
+
import base64
|
36 |
+
|
37 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
38 |
+
|
39 |
+
# Function to encode the image
|
40 |
+
def encode_image(image_path):
|
41 |
+
with open(image_path, "rb") as image_file:
|
42 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
43 |
+
|
44 |
+
def getCaption(video_id, json_data):
|
45 |
+
#데이터 가져오기
|
46 |
+
video_data = json_data[video_id]
|
47 |
+
frame_names = video_data['frame_names']
|
48 |
+
video_path = video_data['video_path']
|
49 |
+
|
50 |
+
cat_names = set()
|
51 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
52 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
53 |
+
|
54 |
+
if len(cat_names) == 1:
|
55 |
+
cat_name = next(iter(cat_names))
|
56 |
+
else:
|
57 |
+
print("more than 2 categories")
|
58 |
+
return -1
|
59 |
+
|
60 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
61 |
+
image_captions = {}
|
62 |
+
|
63 |
+
captioner = OpenAI()
|
64 |
+
for i in range(len(image_paths)):
|
65 |
+
image_path = image_paths[i]
|
66 |
+
frame_name = frame_names[i]
|
67 |
+
base64_image = encode_image(image_path)
|
68 |
+
|
69 |
+
#1단계: 필터링
|
70 |
+
response1 = captioner.chat.completions.create(
|
71 |
+
model="gpt-4o-mini",
|
72 |
+
messages=[
|
73 |
+
{
|
74 |
+
"role": "user",
|
75 |
+
"content": [
|
76 |
+
{
|
77 |
+
"type": "text",
|
78 |
+
"text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"type": "image_url",
|
82 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
83 |
+
},
|
84 |
+
],
|
85 |
+
}
|
86 |
+
],
|
87 |
+
)
|
88 |
+
response_content = response1.choices[0].message.content
|
89 |
+
should_caption = True if "yes" in response_content.lower() else False
|
90 |
+
|
91 |
+
#2단계: dense caption 만들기
|
92 |
+
if should_caption:
|
93 |
+
response2 = captioner.chat.completions.create(
|
94 |
+
model="gpt-4o-mini",
|
95 |
+
messages=[
|
96 |
+
{
|
97 |
+
"role": "user",
|
98 |
+
"content": [
|
99 |
+
{
|
100 |
+
"type": "text",
|
101 |
+
"text": f"""
|
102 |
+
Describe the image in detail focusing on the {cat_name}s' actions.
|
103 |
+
1. Each action should be prominent, clear and unique, describing the corresponding object only.
|
104 |
+
2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
|
105 |
+
3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
|
106 |
+
4. Do not include actions that need to be guessed or suggested.""",
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"type": "image_url",
|
110 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
111 |
+
},
|
112 |
+
],
|
113 |
+
}
|
114 |
+
],
|
115 |
+
)
|
116 |
+
|
117 |
+
caption = response2.choices[0].message.content
|
118 |
+
else:
|
119 |
+
caption = None
|
120 |
+
|
121 |
+
image_captions[frame_name] = caption
|
122 |
+
return image_captions
|
123 |
+
|
124 |
+
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
|
125 |
+
# 이미지에 해당 물체 바운딩 박스 그리기
|
126 |
+
video_data = json_data[video_id]
|
127 |
+
frame_names = video_data['frame_names']
|
128 |
+
video_path = video_data['video_path']
|
129 |
+
I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
|
130 |
+
frame_indx = frame_names.index(frame_name)
|
131 |
+
obj_data = video_data['annotations'][frame_indx][obj_id]
|
132 |
+
|
133 |
+
bbox = obj_data['bbox']
|
134 |
+
cat_name = obj_data['category_name']
|
135 |
+
valid = obj_data['valid']
|
136 |
+
|
137 |
+
if valid == 0:
|
138 |
+
print("Object not in this frame!")
|
139 |
+
return {}
|
140 |
+
|
141 |
+
|
142 |
+
x_min, y_min, x_max, y_max = bbox
|
143 |
+
x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
|
144 |
+
cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
|
145 |
+
plt.figure()
|
146 |
+
plt.imshow(I)
|
147 |
+
plt.axis('off')
|
148 |
+
plt.show()
|
149 |
+
pil_I = Image.fromarray(I)
|
150 |
+
buff = BytesIO()
|
151 |
+
pil_I.save(buff, format='JPEG')
|
152 |
+
base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
|
153 |
+
|
154 |
+
#ref expression 만들기
|
155 |
+
generator = OpenAI()
|
156 |
+
response = generator.chat.completions.create(
|
157 |
+
model="gpt-4o-mini",
|
158 |
+
messages=[
|
159 |
+
{
|
160 |
+
"role": "user",
|
161 |
+
"content": [
|
162 |
+
{
|
163 |
+
"type": "text",
|
164 |
+
"text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
|
165 |
+
1. The referring expression describes the action and does not contain information about appearance or location in the picture.
|
166 |
+
2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
|
167 |
+
3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
|
168 |
+
4. The referring expression should only describe the highlighted {cat_name} and not any other.
|
169 |
+
5. Use '{cat_name}' as the noun for the referring expressions.
|
170 |
+
Output only the referring expression.
|
171 |
+
{caption}""",
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"type": "image_url",
|
175 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
176 |
+
},
|
177 |
+
],
|
178 |
+
}
|
179 |
+
],
|
180 |
+
)
|
181 |
+
|
182 |
+
ref_exp = response.choices[0].message.content
|
183 |
+
|
184 |
+
#QA filtering
|
185 |
+
#QA1: 원하는 물체를 설명하는지
|
186 |
+
filter = OpenAI()
|
187 |
+
response1 = filter.chat.completions.create(
|
188 |
+
model="gpt-4o-mini",
|
189 |
+
messages=[
|
190 |
+
{
|
191 |
+
"role": "user",
|
192 |
+
"content": [
|
193 |
+
{
|
194 |
+
"type": "text",
|
195 |
+
"text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
|
196 |
+
{ref_exp}""",
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"type": "image_url",
|
200 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
201 |
+
},
|
202 |
+
],
|
203 |
+
}
|
204 |
+
],
|
205 |
+
)
|
206 |
+
|
207 |
+
response1_content = response1.choices[0].message.content
|
208 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
209 |
+
|
210 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
211 |
+
response2 = filter.chat.completions.create(
|
212 |
+
model="gpt-4o-mini",
|
213 |
+
messages=[
|
214 |
+
{
|
215 |
+
"role": "user",
|
216 |
+
"content": [
|
217 |
+
{
|
218 |
+
"type": "text",
|
219 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
220 |
+
{ref_exp}""",
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"type": "image_url",
|
224 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
225 |
+
},
|
226 |
+
],
|
227 |
+
}
|
228 |
+
],
|
229 |
+
)
|
230 |
+
|
231 |
+
response2_content = response2.choices[0].message.content
|
232 |
+
describesNotHighlighted = True if "yes" in response2_content.lower() else False
|
233 |
+
|
234 |
+
isValid = True if describesHighlighted and not describesNotHighlighted else False
|
235 |
+
|
236 |
+
print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
|
237 |
+
|
238 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
239 |
+
|
240 |
+
def createRefExp(video_id, json_data):
|
241 |
+
video_data = json_data[video_id]
|
242 |
+
obj_ids = list(video_data['annotations'][0].keys())
|
243 |
+
frame_names = video_data['frame_names']
|
244 |
+
|
245 |
+
captions_per_frame = getCaption(video_id, json_data)
|
246 |
+
|
247 |
+
if captions_per_frame == -1:
|
248 |
+
print("There are more than 2 cateories")
|
249 |
+
return
|
250 |
+
|
251 |
+
|
252 |
+
video_ref_exps = {}
|
253 |
+
|
254 |
+
for frame_name in frame_names:
|
255 |
+
frame_caption = captions_per_frame[frame_name]
|
256 |
+
|
257 |
+
if frame_caption == None:
|
258 |
+
video_ref_exps[frame_name] = None
|
259 |
+
|
260 |
+
else:
|
261 |
+
frame_ref_exps = {}
|
262 |
+
for obj_id in obj_ids:
|
263 |
+
exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
|
264 |
+
frame_ref_exps[obj_id] = exp_per_obj
|
265 |
+
video_ref_exps[frame_name] = frame_ref_exps
|
266 |
+
|
267 |
+
return video_ref_exps
|
268 |
+
|
269 |
+
if __name__ == '__main__':
|
270 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
271 |
+
data = json.load(file)
|
272 |
+
|
273 |
+
with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
|
274 |
+
manual_select = list(file)
|
275 |
+
for frame in manual_select:
|
276 |
+
result = json.loads(frame)
|
277 |
+
|
278 |
+
all_video_refs = {}
|
279 |
+
for i in range(10):
|
280 |
+
video_id = list(data.keys())[i]
|
281 |
+
video_ref = createRefExp(video_id, data)
|
282 |
+
all_video_refs[video_id] = video_ref
|
.history/mbench/gpt_ref-ytvos_20250119070824.py
ADDED
@@ -0,0 +1,286 @@
1 |
+
from datasets import build_dataset
|
2 |
+
import argparse
|
3 |
+
import opts
|
4 |
+
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
import os
|
8 |
+
from os import path as osp
|
9 |
+
import skimage
|
10 |
+
from io import BytesIO
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
import pandas as pd
|
14 |
+
import regex as re
|
15 |
+
import json
|
16 |
+
|
17 |
+
import cv2
|
18 |
+
from PIL import Image, ImageDraw
|
19 |
+
import torch
|
20 |
+
from torchvision.transforms import functional as F
|
21 |
+
|
22 |
+
from skimage import measure # (pip install scikit-image)
|
23 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
24 |
+
|
25 |
+
import matplotlib.pyplot as plt
|
26 |
+
import matplotlib.patches as patches
|
27 |
+
from matplotlib.collections import PatchCollection
|
28 |
+
from matplotlib.patches import Rectangle
|
29 |
+
|
30 |
+
|
31 |
+
import ipywidgets as widgets
|
32 |
+
from IPython.display import display, clear_output
|
33 |
+
|
34 |
+
from openai import OpenAI
|
35 |
+
import base64
|
36 |
+
|
37 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
38 |
+
|
39 |
+
# Function to encode the image
|
40 |
+
def encode_image(image_path):
|
41 |
+
with open(image_path, "rb") as image_file:
|
42 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
43 |
+
|
44 |
+
def getCaption(video_id, json_data):
|
45 |
+
#데이터 가져오기
|
46 |
+
video_data = json_data[video_id]
|
47 |
+
frame_names = video_data['frame_names']
|
48 |
+
video_path = video_data['video_path']
|
49 |
+
|
50 |
+
cat_names = set()
|
51 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
52 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
53 |
+
|
54 |
+
if len(cat_names) == 1:
|
55 |
+
cat_name = next(iter(cat_names))
|
56 |
+
else:
|
57 |
+
print("more than 2 categories")
|
58 |
+
return -1
|
59 |
+
|
60 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
61 |
+
image_captions = {}
|
62 |
+
|
63 |
+
captioner = OpenAI()
|
64 |
+
for i in range(len(image_paths)):
|
65 |
+
image_path = image_paths[i]
|
66 |
+
frame_name = frame_names[i]
|
67 |
+
base64_image = encode_image(image_path)
|
68 |
+
|
69 |
+
#1단계: 필터링
|
70 |
+
response1 = captioner.chat.completions.create(
|
71 |
+
model="gpt-4o-mini",
|
72 |
+
messages=[
|
73 |
+
{
|
74 |
+
"role": "user",
|
75 |
+
"content": [
|
76 |
+
{
|
77 |
+
"type": "text",
|
78 |
+
"text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"type": "image_url",
|
82 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
83 |
+
},
|
84 |
+
],
|
85 |
+
}
|
86 |
+
],
|
87 |
+
)
|
88 |
+
response_content = response1.choices[0].message.content
|
89 |
+
should_caption = True if "yes" in response_content.lower() else False
|
90 |
+
|
91 |
+
#2단계: dense caption 만들기
|
92 |
+
if should_caption:
|
93 |
+
response2 = captioner.chat.completions.create(
|
94 |
+
model="gpt-4o-mini",
|
95 |
+
messages=[
|
96 |
+
{
|
97 |
+
"role": "user",
|
98 |
+
"content": [
|
99 |
+
{
|
100 |
+
"type": "text",
|
101 |
+
"text": f"""
|
102 |
+
Describe the image in detail focusing on the {cat_name}s' actions.
|
103 |
+
1. Each action should be prominent, clear and unique, describing the corresponding object only.
|
104 |
+
2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
|
105 |
+
3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
|
106 |
+
4. Do not include actions that need to be guessed or suggested.""",
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"type": "image_url",
|
110 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
111 |
+
},
|
112 |
+
],
|
113 |
+
}
|
114 |
+
],
|
115 |
+
)
|
116 |
+
|
117 |
+
caption = response2.choices[0].message.content
|
118 |
+
else:
|
119 |
+
caption = None
|
120 |
+
|
121 |
+
image_captions[frame_name] = caption
|
122 |
+
return image_captions
|
123 |
+
|
124 |
+
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
|
125 |
+
# 이미지에 해당 물체 바운딩 박스 그리기
|
126 |
+
video_data = json_data[video_id]
|
127 |
+
frame_names = video_data['frame_names']
|
128 |
+
video_path = video_data['video_path']
|
129 |
+
I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
|
130 |
+
frame_indx = frame_names.index(frame_name)
|
131 |
+
obj_data = video_data['annotations'][frame_indx][obj_id]
|
132 |
+
|
133 |
+
bbox = obj_data['bbox']
|
134 |
+
cat_name = obj_data['category_name']
|
135 |
+
valid = obj_data['valid']
|
136 |
+
|
137 |
+
if valid == 0:
|
138 |
+
print("Object not in this frame!")
|
139 |
+
return {}
|
140 |
+
|
141 |
+
|
142 |
+
x_min, y_min, x_max, y_max = bbox
|
143 |
+
x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
|
144 |
+
cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
|
145 |
+
plt.figure()
|
146 |
+
plt.imshow(I)
|
147 |
+
plt.axis('off')
|
148 |
+
plt.show()
|
149 |
+
pil_I = Image.fromarray(I)
|
150 |
+
buff = BytesIO()
|
151 |
+
pil_I.save(buff, format='JPEG')
|
152 |
+
base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
|
153 |
+
|
154 |
+
#ref expression 만들기
|
155 |
+
generator = OpenAI()
|
156 |
+
response = generator.chat.completions.create(
|
157 |
+
model="gpt-4o-mini",
|
158 |
+
messages=[
|
159 |
+
{
|
160 |
+
"role": "user",
|
161 |
+
"content": [
|
162 |
+
{
|
163 |
+
"type": "text",
|
164 |
+
"text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
|
165 |
+
1. The referring expression describes the action and does not contain information about appearance or location in the picture.
|
166 |
+
2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
|
167 |
+
3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
|
168 |
+
4. The referring expression should only describe the highlighted {cat_name} and not any other.
|
169 |
+
5. Use '{cat_name}' as the noun for the referring expressions.
|
170 |
+
Output only the referring expression.
|
171 |
+
{caption}""",
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"type": "image_url",
|
175 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
176 |
+
},
|
177 |
+
],
|
178 |
+
}
|
179 |
+
],
|
180 |
+
)
|
181 |
+
|
182 |
+
ref_exp = response.choices[0].message.content
|
183 |
+
|
184 |
+
#QA filtering
|
185 |
+
#QA1: 원하는 물체를 설명하는지
|
186 |
+
filter = OpenAI()
|
187 |
+
response1 = filter.chat.completions.create(
|
188 |
+
model="gpt-4o-mini",
|
189 |
+
messages=[
|
190 |
+
{
|
191 |
+
"role": "user",
|
192 |
+
"content": [
|
193 |
+
{
|
194 |
+
"type": "text",
|
195 |
+
"text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
|
196 |
+
{ref_exp}""",
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"type": "image_url",
|
200 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
201 |
+
},
|
202 |
+
],
|
203 |
+
}
|
204 |
+
],
|
205 |
+
)
|
206 |
+
|
207 |
+
response1_content = response1.choices[0].message.content
|
208 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
209 |
+
|
210 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
211 |
+
response2 = filter.chat.completions.create(
|
212 |
+
model="gpt-4o-mini",
|
213 |
+
messages=[
|
214 |
+
{
|
215 |
+
"role": "user",
|
216 |
+
"content": [
|
217 |
+
{
|
218 |
+
"type": "text",
|
219 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
220 |
+
{ref_exp}""",
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"type": "image_url",
|
224 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
225 |
+
},
|
226 |
+
],
|
227 |
+
}
|
228 |
+
],
|
229 |
+
)
|
230 |
+
|
231 |
+
response2_content = response2.choices[0].message.content
|
232 |
+
describesNotHighlighted = True if "yes" in response2_content.lower() else False
|
233 |
+
|
234 |
+
isValid = True if describesHighlighted and not describesNotHighlighted else False
|
235 |
+
|
236 |
+
print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
|
237 |
+
|
238 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
239 |
+
|
240 |
+
def createRefExp(video_id, json_data):
|
241 |
+
video_data = json_data[video_id]
|
242 |
+
obj_ids = list(video_data['annotations'][0].keys())
|
243 |
+
frame_names = video_data['frame_names']
|
244 |
+
|
245 |
+
captions_per_frame = getCaption(video_id, json_data)
|
246 |
+
|
247 |
+
if captions_per_frame == -1:
|
248 |
+
print("There are more than 2 cateories")
|
249 |
+
return
|
250 |
+
|
251 |
+
|
252 |
+
video_ref_exps = {}
|
253 |
+
|
254 |
+
for frame_name in frame_names:
|
255 |
+
frame_caption = captions_per_frame[frame_name]
|
256 |
+
|
257 |
+
if frame_caption == None:
|
258 |
+
video_ref_exps[frame_name] = None
|
259 |
+
|
260 |
+
else:
|
261 |
+
frame_ref_exps = {}
|
262 |
+
for obj_id in obj_ids:
|
263 |
+
exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
|
264 |
+
frame_ref_exps[obj_id] = exp_per_obj
|
265 |
+
video_ref_exps[frame_name] = frame_ref_exps
|
266 |
+
|
267 |
+
return video_ref_exps
|
268 |
+
|
269 |
+
if __name__ == '__main__':
|
270 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
271 |
+
data = json.load(file)
|
272 |
+
|
273 |
+
videos = set()
|
274 |
+
with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
|
275 |
+
manual_select = list(file)
|
276 |
+
for frame in manual_select:
|
277 |
+
result = json.loads(frame)
|
278 |
+
videos.add(result['video'])
|
279 |
+
videos = list(videos)
|
280 |
+
|
281 |
+
|
282 |
+
all_video_refs = {}
|
283 |
+
for i in range(1):
|
284 |
+
video_id = videos[i]
|
285 |
+
video_ref = createRefExp(video_id, data)
|
286 |
+
all_video_refs[video_id] = video_ref
|
.history/mbench/gpt_ref-ytvos_20250119071214.py
ADDED
@@ -0,0 +1,290 @@
1 |
+
from datasets import build_dataset
|
2 |
+
import argparse
|
3 |
+
import opts
|
4 |
+
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
import os
|
8 |
+
from os import path as osp
|
9 |
+
import skimage
|
10 |
+
from io import BytesIO
|
11 |
+
|
12 |
+
import numpy as np
|
13 |
+
import pandas as pd
|
14 |
+
import regex as re
|
15 |
+
import json
|
16 |
+
|
17 |
+
import cv2
|
18 |
+
from PIL import Image, ImageDraw
|
19 |
+
import torch
|
20 |
+
from torchvision.transforms import functional as F
|
21 |
+
|
22 |
+
from skimage import measure # (pip install scikit-image)
|
23 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
24 |
+
|
25 |
+
import matplotlib.pyplot as plt
|
26 |
+
import matplotlib.patches as patches
|
27 |
+
from matplotlib.collections import PatchCollection
|
28 |
+
from matplotlib.patches import Rectangle
|
29 |
+
|
30 |
+
|
31 |
+
import ipywidgets as widgets
|
32 |
+
from IPython.display import display, clear_output
|
33 |
+
|
34 |
+
from openai import OpenAI
|
35 |
+
import base64
|
36 |
+
|
37 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
38 |
+
|
39 |
+
# Function to encode the image
|
40 |
+
def encode_image(image_path):
|
41 |
+
with open(image_path, "rb") as image_file:
|
42 |
+
return base64.b64encode(image_file.read()).decode("utf-8")
|
43 |
+
|
44 |
+
def getCaption(video_id, json_data):
|
45 |
+
#데이터 가져오기
|
46 |
+
video_data = json_data[video_id]
|
47 |
+
frame_names = video_data['frame_names']
|
48 |
+
video_path = video_data['video_path']
|
49 |
+
|
50 |
+
cat_names = set()
|
51 |
+
for obj_id in list(video_data['annotations'][0].keys()):
|
52 |
+
cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
|
53 |
+
|
54 |
+
if len(cat_names) == 1:
|
55 |
+
cat_name = next(iter(cat_names))
|
56 |
+
else:
|
57 |
+
print("more than 2 categories")
|
58 |
+
return -1
|
59 |
+
|
60 |
+
image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
|
61 |
+
image_captions = {}
|
62 |
+
|
63 |
+
captioner = OpenAI()
|
64 |
+
for i in range(len(image_paths)):
|
65 |
+
image_path = image_paths[i]
|
66 |
+
frame_name = frame_names[i]
|
67 |
+
base64_image = encode_image(image_path)
|
68 |
+
|
69 |
+
#1단계: 필터링
|
70 |
+
response1 = captioner.chat.completions.create(
|
71 |
+
model="gpt-4o-mini",
|
72 |
+
messages=[
|
73 |
+
{
|
74 |
+
"role": "user",
|
75 |
+
"content": [
|
76 |
+
{
|
77 |
+
"type": "text",
|
78 |
+
"text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"type": "image_url",
|
82 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
83 |
+
},
|
84 |
+
],
|
85 |
+
}
|
86 |
+
],
|
87 |
+
)
|
88 |
+
response_content = response1.choices[0].message.content
|
89 |
+
should_caption = True if "yes" in response_content.lower() else False
|
90 |
+
|
91 |
+
#2단계: dense caption 만들기
|
92 |
+
if should_caption:
|
93 |
+
response2 = captioner.chat.completions.create(
|
94 |
+
model="gpt-4o-mini",
|
95 |
+
messages=[
|
96 |
+
{
|
97 |
+
"role": "user",
|
98 |
+
"content": [
|
99 |
+
{
|
100 |
+
"type": "text",
|
101 |
+
"text": f"""
|
102 |
+
Describe the image in detail focusing on the {cat_name}s' actions.
|
103 |
+
1. Each action should be prominent, clear and unique, describing the corresponding object only.
|
104 |
+
2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
|
105 |
+
3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
|
106 |
+
4. Do not include actions that need to be guessed or suggested.""",
|
107 |
+
},
|
108 |
+
{
|
109 |
+
"type": "image_url",
|
110 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
111 |
+
},
|
112 |
+
],
|
113 |
+
}
|
114 |
+
],
|
115 |
+
)
|
116 |
+
|
117 |
+
caption = response2.choices[0].message.content
|
118 |
+
else:
|
119 |
+
caption = None
|
120 |
+
|
121 |
+
image_captions[frame_name] = caption
|
122 |
+
return image_captions
|
123 |
+
|
124 |
+
def getRefExp(video_id, frame_name, caption, obj_id, json_data):
|
125 |
+
# 이미지에 해당 물체 바운딩 박스 그리기
|
126 |
+
video_data = json_data[video_id]
|
127 |
+
frame_names = video_data['frame_names']
|
128 |
+
video_path = video_data['video_path']
|
129 |
+
I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
|
130 |
+
frame_indx = frame_names.index(frame_name)
|
131 |
+
obj_data = video_data['annotations'][frame_indx][obj_id]
|
132 |
+
|
133 |
+
bbox = obj_data['bbox']
|
134 |
+
cat_name = obj_data['category_name']
|
135 |
+
valid = obj_data['valid']
|
136 |
+
|
137 |
+
if valid == 0:
|
138 |
+
print("Object not in this frame!")
|
139 |
+
return {}
|
140 |
+
|
141 |
+
|
142 |
+
x_min, y_min, x_max, y_max = bbox
|
143 |
+
x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
|
144 |
+
cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
|
145 |
+
plt.figure()
|
146 |
+
plt.imshow(I)
|
147 |
+
plt.axis('off')
|
148 |
+
plt.show()
|
149 |
+
pil_I = Image.fromarray(I)
|
150 |
+
buff = BytesIO()
|
151 |
+
pil_I.save(buff, format='JPEG')
|
152 |
+
base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
|
153 |
+
|
154 |
+
#ref expression 만들기
|
155 |
+
generator = OpenAI()
|
156 |
+
response = generator.chat.completions.create(
|
157 |
+
model="gpt-4o-mini",
|
158 |
+
messages=[
|
159 |
+
{
|
160 |
+
"role": "user",
|
161 |
+
"content": [
|
162 |
+
{
|
163 |
+
"type": "text",
|
164 |
+
"text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
|
165 |
+
1. The referring expression describes the action and does not contain information about appearance or location in the picture.
|
166 |
+
2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
|
167 |
+
3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
|
168 |
+
4. The referring expression should only describe the highlighted {cat_name} and not any other.
|
169 |
+
5. Use '{cat_name}' as the noun for the referring expressions.
|
170 |
+
Output only the referring expression.
|
171 |
+
{caption}""",
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"type": "image_url",
|
175 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
176 |
+
},
|
177 |
+
],
|
178 |
+
}
|
179 |
+
],
|
180 |
+
)
|
181 |
+
|
182 |
+
ref_exp = response.choices[0].message.content
|
183 |
+
|
184 |
+
#QA filtering
|
185 |
+
#QA1: 원하는 물체를 설명하는지
|
186 |
+
filter = OpenAI()
|
187 |
+
response1 = filter.chat.completions.create(
|
188 |
+
model="gpt-4o-mini",
|
189 |
+
messages=[
|
190 |
+
{
|
191 |
+
"role": "user",
|
192 |
+
"content": [
|
193 |
+
{
|
194 |
+
"type": "text",
|
195 |
+
"text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
|
196 |
+
{ref_exp}""",
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"type": "image_url",
|
200 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
201 |
+
},
|
202 |
+
],
|
203 |
+
}
|
204 |
+
],
|
205 |
+
)
|
206 |
+
|
207 |
+
response1_content = response1.choices[0].message.content
|
208 |
+
describesHighlighted = True if "yes" in response1_content.lower() else False
|
209 |
+
|
210 |
+
#QA2: 원하지 않는 물체를 설명하지 않는지
|
211 |
+
response2 = filter.chat.completions.create(
|
212 |
+
model="gpt-4o-mini",
|
213 |
+
messages=[
|
214 |
+
{
|
215 |
+
"role": "user",
|
216 |
+
"content": [
|
217 |
+
{
|
218 |
+
"type": "text",
|
219 |
+
"text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
|
220 |
+
{ref_exp}""",
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"type": "image_url",
|
224 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
|
225 |
+
},
|
226 |
+
],
|
227 |
+
}
|
228 |
+
],
|
229 |
+
)
|
230 |
+
|
231 |
+
response2_content = response2.choices[0].message.content
|
232 |
+
describesNotHighlighted = True if "yes" in response2_content.lower() else False
|
233 |
+
|
234 |
+
isValid = True if describesHighlighted and not describesNotHighlighted else False
|
235 |
+
|
236 |
+
print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
|
237 |
+
|
238 |
+
return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
|
239 |
+
|
240 |
+
def createRefExp(video_id, json_data):
|
241 |
+
video_data = json_data[video_id]
|
242 |
+
obj_ids = list(video_data['annotations'][0].keys())
|
243 |
+
frame_names = video_data['frame_names']
|
244 |
+
|
245 |
+
captions_per_frame = getCaption(video_id, json_data)
|
246 |
+
|
247 |
+
if captions_per_frame == -1:
|
248 |
+
print("There are more than 2 cateories")
|
249 |
+
return
|
250 |
+
|
251 |
+
|
252 |
+
video_ref_exps = {}
|
253 |
+
|
254 |
+
for frame_name in frame_names:
|
255 |
+
frame_caption = captions_per_frame[frame_name]
|
256 |
+
|
257 |
+
if frame_caption == None:
|
258 |
+
video_ref_exps[frame_name] = None
|
259 |
+
|
260 |
+
else:
|
261 |
+
frame_ref_exps = {}
|
262 |
+
for obj_id in obj_ids:
|
263 |
+
exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
|
264 |
+
frame_ref_exps[obj_id] = exp_per_obj
|
265 |
+
video_ref_exps[frame_name] = frame_ref_exps
|
266 |
+
|
267 |
+
return video_ref_exps
|
268 |
+
|
269 |
+
if __name__ == '__main__':
|
270 |
+
with open('mbench/sampled_frame3.json', 'r') as file:
|
271 |
+
data = json.load(file)
|
272 |
+
|
273 |
+
videos = set()
|
274 |
+
with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
|
275 |
+
manual_select = list(file)
|
276 |
+
for frame in manual_select:
|
277 |
+
result = json.loads(frame)
|
278 |
+
videos.add(result['video'])
|
279 |
+
videos = list(videos)
|
280 |
+
|
281 |
+
|
282 |
+
all_video_refs = {}
|
283 |
+
for i in range(1):
|
284 |
+
video_id = videos[i]
|
285 |
+
video_ref = createRefExp(video_id, data)
|
286 |
+
all_video_refs[video_id] = video_ref
|
287 |
+
|
288 |
+
json_obj = json.dumps(all_video_refs, indent=4)
|
289 |
+
with open('mbench/result.json', 'w') as file:
|
290 |
+
file.write(json_obj)
|
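As a side note, the json.dumps-then-write step above can be collapsed into a single call, which is what the *_revised script at the top of this listing does. A minimal equivalent sketch, reusing the script's existing json import and all_video_refs dict:

with open('mbench/result.json', 'w') as file:
    json.dump(all_video_refs, file, indent=4)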
.history/mbench/gpt_ref-ytvos_20250119073250.py
ADDED
@@ -0,0 +1,292 @@
import sys
from os import path as osp
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts

from pathlib import Path
import os
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle


import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64

os.environ['OPENAI_API_KEY'] = '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'

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def getCaption(video_id, json_data):
    # fetch the data
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']

    cat_names = set()
    for obj_id in list(video_data['annotations'][0].keys()):
        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])

    if len(cat_names) == 1:
        cat_name = next(iter(cat_names))
    else:
        print("more than 2 categories")
        return -1

    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
    image_captions = {}

    captioner = OpenAI()
    for i in range(len(image_paths)):
        image_path = image_paths[i]
        frame_name = frame_names[i]
        base64_image = encode_image(image_path)

        # Step 1: filtering
        response1 = captioner.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )
        response_content = response1.choices[0].message.content
        should_caption = True if "yes" in response_content.lower() else False

        # Step 2: build the dense caption
        if should_caption:
            response2 = captioner.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"""
                                Describe the image in detail focusing on the {cat_name}s' actions.
                                1. Each action should be prominent, clear and unique, describing the corresponding object only.
                                2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
                                3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
                                4. Do not include actions that needs to be guessed or suggested.""",
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
            )

            caption = response2.choices[0].message.content
        else:
            caption = None

        image_captions[frame_name] = caption
    return image_captions

def getRefExp(video_id, frame_name, caption, obj_id, json_data):
    # draw the bounding box of the target object on the image
    video_data = json_data[video_id]
    frame_names = video_data['frame_names']
    video_path = video_data['video_path']
    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
    frame_indx = frame_names.index(frame_name)
    obj_data = video_data['annotations'][frame_indx][obj_id]

    bbox = obj_data['bbox']
    cat_name = obj_data['category_name']
    valid = obj_data['valid']

    if valid == 0:
        print("Object not in this frame!")
        return {}

    x_min, y_min, x_max, y_max = bbox
    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
    # plt.figure()
    # plt.imshow(I)
    # plt.axis('off')
    # plt.show()
    pil_I = Image.fromarray(I)
    buff = BytesIO()
    pil_I.save(buff, format='JPEG')
    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")

    # build the referring expression
    generator = OpenAI()
    response = generator.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
                        1. The referring expression describes the action and does not contain information about appearance or location in the picture.
                        2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
                        3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
                        4. The referring expression should only describe the highlighted {cat_name} and not any other.
                        5. Use '{cat_name}' as the noun for the referring expressions.
                        Output only the referring expression.
                        {caption}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    ref_exp = response.choices[0].message.content

    # QA filtering
    # QA1: does the expression describe the intended object?
    filter = OpenAI()
    response1 = filter.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response1_content = response1.choices[0].message.content
    describesHighlighted = True if "yes" in response1_content.lower() else False

    # QA2: does the expression avoid describing the unintended objects?
    response2 = filter.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
                        {ref_exp}""",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
                    },
                ],
            }
        ],
    )

    response2_content = response2.choices[0].message.content
    describesNotHighlighted = True if "yes" in response2_content.lower() else False

    isValid = True if describesHighlighted and not describesNotHighlighted else False

    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")

    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}

def createRefExp(video_id, json_data):
    video_data = json_data[video_id]
    obj_ids = list(video_data['annotations'][0].keys())
    frame_names = video_data['frame_names']

    captions_per_frame = getCaption(video_id, json_data)

    if captions_per_frame == -1:
        print("There are more than 2 categories")
        return None

    video_ref_exps = {}

    for frame_name in frame_names:
        frame_caption = captions_per_frame[frame_name]

        if frame_caption is None:
            video_ref_exps[frame_name] = None
        else:
            frame_ref_exps = {}
            for obj_id in obj_ids:
                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
                frame_ref_exps[obj_id] = exp_per_obj
            video_ref_exps[frame_name] = frame_ref_exps

    return video_ref_exps

if __name__ == '__main__':
    with open('mbench/sampled_frame3.json', 'r') as file:
        data = json.load(file)

    videos = set()
    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
        manual_select = list(file)
    for frame in manual_select:
        result = json.loads(frame)
        videos.add(result['video'])
    videos = list(videos)

    all_video_refs = {}
    for i in range(10):
        video_id = videos[i]
        video_ref = createRefExp(video_id, data)
        all_video_refs[video_id] = video_ref

    json_obj = json.dumps(all_video_refs, indent=4)
    with open('mbench/result.json', 'w') as file:
        file.write(json_obj)
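The two-step captioning and QA filtering above call chat.completions.create many times with image payloads. A minimal retry sketch (illustrative only, not part of this file; treating every exception as retryable is an assumption) that could wrap those calls:

import time
from openai import OpenAI

client = OpenAI()

def chat_with_retry(messages, model="gpt-4o-mini", max_retries=3, backoff=2.0):
    # retry the same chat.completions.create call used throughout the script,
    # sleeping a little longer after each failed attempt
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(model=model, messages=messages)
        except Exception:
            if attempt == max_retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))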
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183735.py
ADDED
File without changes
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130183916.py
ADDED
@@ -0,0 +1,199 @@
import os
import sys
from os import path as osp
from io import BytesIO

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import textwrap


import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64

def number_objects_and_encode(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {} # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if color_mask == False:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        for i, contour in enumerate(contours):
                            # compute the contour center
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0: # check whether the center can be computed
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0] # fall back to the first contour point otherwise

                            # text background (black box)
                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            # draw the text background (black)
                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            # draw the text (white)
                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)

                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0] // 1 # horizontal center of the text
                            text_y = center_y
                            # text_y = center_y + text_size[1] // 2 # vertical center of the text

                            # coordinates of the text background rectangle
                            rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
                            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)

            plt.figure(figsize=(12, 8))
            plt.imshow(frame)
            plt.title(f"frame {frame_name}")
            plt.tight_layout()
            plt.axis('off')
            plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0) # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, vid_cat_cnts, contoured_frames

if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    #================== load the data ===================
    # full dataset
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)

    # metadata for the full dataset
    metas = train_dataset.metas

    # 8 candidate colors (RGB format)
    colors = [
        (255, 0, 0), # Red
        (0, 255, 0), # Green
        (0, 0, 255), # Blue
        (255, 255, 0), # Yellow
        (255, 0, 255), # Magenta
        (0, 255, 255), # Cyan
        (128, 0, 128), # Purple
        (255, 165, 0) # Orange
    ]
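number_objects_and_encode returns base64-encoded JPEG strings keyed by category. A small sketch (illustrative only; it assumes the module-level train_dataset, metas and colors are already set up as in the __main__ block above, and that 'person' is a category present in video 0, both hypothetical values) for decoding one frame back into a PIL image:

import base64
from io import BytesIO
from PIL import Image

# encode the numbered frames of video 0, then decode the first 'person' frame
encoded_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(0, color_mask=False)
first_frame_b64 = encoded_frames['person'][0]
img = Image.open(BytesIO(base64.b64decode(first_frame_b64)))
img.save('numbered_frame_preview.jpg')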
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130185048.py
ADDED
@@ -0,0 +1,422 @@
import os
import sys
from os import path as osp
from io import BytesIO

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import textwrap


import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64
import json

def number_objects_and_encode(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {} # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if color_mask == False:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        for i, contour in enumerate(contours):
                            # compute the contour center
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0: # check whether the center can be computed
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0] # fall back to the first contour point otherwise

                            # text background (black box)
                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            # draw the text background (black)
                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            # draw the text (white)
                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)

                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0] // 1 # horizontal center of the text
                            text_y = center_y
                            # text_y = center_y + text_size[1] // 2 # vertical center of the text

                            # coordinates of the text background rectangle
                            rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
                            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0) # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, vid_cat_cnts, contoured_frames


def getCaption(idx, color_mask=True):
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
    all_captions = dict()

    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
    marked = "mask with boundary" if color_mask else "boundary"

    for cat_name in list(cat_names) :

        is_movable = False
        if cat_name in ytvos_category_valid_list :
            is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]
        cont_base64_frames = contoured_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]
            cont_base64_image = cont_base64_frames[i]
            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')

            if is_movable and should_filter:
                # Step 1: filtering
                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are {cat_name}s in the image performing all different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
                3) Each action is unambiguously recognizable and distinct.

                - Respond with "NONE" if:
                1) The actions or pose are not clearly differentiable or too similar.
                2) They show no noticeable action beyond standing or minor movements.

                Answer strictly with either "YES" or "NONE".
                """

                response1 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = True if "yes" in response_content.lower() else False
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            # Step 2: build the dense caption
            dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Therefore, your expressions for these {cat_name}s should describe unique action of each object.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
            10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
            11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
            12. Do not mention object IDs.
            13. Use '{cat_name}' as the noun for the referring expressions.

            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            Output referring expressions for each object id.
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Please describe each {cat_name} using **clearly observable** and **specific** actions.

            ## Guidelines:
            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
            4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
               Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
            7. Base your description on the following action definitions:
               - Facial with object manipulation
               - General body movement, body position or pattern
               - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
               - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").

            ## Output Format:
            - For each labeled {cat_name}, output one line in the format:
              ID. action-oriented description

            Example:
            1. a bear grasping the edge of a wood with its front paws
            2. the bear pushing another bear, leaning forward

            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
            **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            For each labeled {cat_name}, output referring expressions for each object id.
            """
            if should_caption:
                response2 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": dense_caption_prompt,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )

                caption = response2.choices[0].message.content
                #print(f"{image_path} - {frame_name}: {caption}")
            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    #================== load the data ===================
    # full dataset
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)

    # metadata for the full dataset
    metas = train_dataset.metas

    # 8 candidate colors (RGB format)
    colors = [
        (255, 0, 0), # Red
        (0, 255, 0), # Green
        (0, 0, 255), # Blue
        (255, 255, 0), # Yellow
        (255, 0, 255), # Magenta
        (0, 255, 255), # Cyan
        (128, 0, 128), # Purple
        (255, 165, 0) # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    #================== run gpt ===================
    os.environ['OPENAI_API_KEY'] = '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'

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(370):
        vid_id, all_captions, valid_obj_ids = getCaption(i, True)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open("mbench/numbered_captions.json", "w") as file:
        json.dump(result_captions, file, indent=4)

    with open("mbench/numbered_valid_obj_ids.json", "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
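The dense-caption prompt above asks the model for one "ID. action-oriented description" line per object. A hedged sketch (not part of this file; it assumes the model actually follows that output format, and it silently skips lines that do not match) of a parser for those captions:

import re

def parse_ref_exps(caption_text):
    # map the leading numeric ID of each line to its referring expression
    ref_exps = {}
    if not caption_text:
        return ref_exps
    for line in caption_text.splitlines():
        match = re.match(r'\s*(\d+)\.\s*(.+)', line)
        if match:
            obj_id, expression = match.groups()
            ref_exps[obj_id] = expression.strip()
    return ref_exps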
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190055.py
ADDED
@@ -0,0 +1,428 @@
import os
import sys
from os import path as osp
from io import BytesIO

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import textwrap


import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64
import json

def number_objects_and_encode(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {} # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if color_mask == False:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        for i, contour in enumerate(contours):
                            # compute the contour center
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0: # check whether the center can be computed
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0] # fall back to the first contour point otherwise

                            # text background (black box)
                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            # draw the text background (black)
                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            # draw the text (white)
                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)

                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0] // 1 # horizontal center of the text
                            text_y = center_y
                            # text_y = center_y + text_size[1] // 2 # vertical center of the text

                            # coordinates of the text background rectangle
                            rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
                            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0) # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, vid_cat_cnts, contoured_frames


def getCaption(idx, color_mask=True):
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
    cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
    all_captions = dict()

    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
    marked = "mask with boundary" if color_mask else "boundary"

    for cat_name in list(cat_names) :

        is_movable = False
        if cat_name in ytvos_category_valid_list :
            is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]
        cont_base64_frames = contoured_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]
            cont_base64_image = cont_base64_frames[i]
            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')

            if is_movable and should_filter:
                # Step 1: filtering
                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are {cat_name}s in the image performing all different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
                3) Each action is unambiguously recognizable and distinct.

                - Respond with "NONE" if:
                1) The actions or pose are not clearly differentiable or too similar.
                2) They show no noticeable action beyond standing or minor movements.

                Answer strictly with either "YES" or "NONE".
                """

                response1 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = True if "yes" in response_content.lower() else False
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            # Step 2: build the dense caption
            dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Therefore, your expressions for these {cat_name}s should describe unique action of each object.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
            10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
            11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
            12. Do not mention object IDs.
            13. Use '{cat_name}' as the noun for the referring expressions.

            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            Output referring expressions for each object id.
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Please describe each {cat_name} using **clearly observable** and **specific** actions.

            ## Guidelines:
            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
            4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
               Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
            7. Base your description on the following action definitions:
               - Facial with object manipulation
               - General body movement, body position or pattern
               - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
               - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").

            ## Output Format:
            - For each labeled {cat_name}, output one line in the format:
              ID. action-oriented description

            Example:
            1. a bear grasping the edge of a wood with its front paws
            2. the bear pushing another bear, leaning forward

            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
            **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            For each labeled {cat_name}, output referring expressions for each object id.
            """
            if should_caption:
                response2 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": dense_caption_prompt,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )

                caption = response2.choices[0].message.content
                #print(f"{image_path} - {frame_name}: {caption}")
            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")

    args = parser.parse_args()

    print(args.save_caption_path, flush=True)
    print(args.save_valid_obj_ids_path, flush=True)

    #================== load the data ===================
    # full dataset
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)

    # metadata for the full dataset
    metas = train_dataset.metas

    # 8 candidate colors (RGB format)
    colors = [
        (255, 0, 0), # Red
        (0, 255, 0), # Green
        (0, 0, 255), # Blue
        (255, 255, 0), # Yellow
        (255, 0, 255), # Magenta
        (0, 255, 255), # Cyan
        (128, 0, 128), # Purple
        (255, 165, 0) # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    #================== run gpt ===================
    os.environ['OPENAI_API_KEY'] = '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'

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(370):
        vid_id, all_captions, valid_obj_ids = getCaption(i, True)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190447.py
ADDED
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
print(os.getcwd()) # 현재 작업 디렉토리 출력
|
3 |
+
|
4 |
+
import sys
|
5 |
+
from os import path as osp
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
9 |
+
import argparse
|
10 |
+
import opts
|
11 |
+
|
12 |
+
import sys
|
13 |
+
from pathlib import Path
|
14 |
+
import os
|
15 |
+
from os import path as osp
|
16 |
+
import skimage
|
17 |
+
from io import BytesIO
|
18 |
+
|
19 |
+
import numpy as np
|
20 |
+
import pandas as pd
|
21 |
+
import regex as re
|
22 |
+
import json
|
23 |
+
|
24 |
+
import cv2
|
25 |
+
from PIL import Image, ImageDraw
|
26 |
+
import torch
|
27 |
+
from torchvision.transforms import functional as F
|
28 |
+
|
29 |
+
from skimage import measure # (pip install scikit-image)
|
30 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
31 |
+
|
32 |
+
import matplotlib.pyplot as plt
|
33 |
+
import matplotlib.patches as patches
|
34 |
+
from matplotlib.collections import PatchCollection
|
35 |
+
from matplotlib.patches import Rectangle
|
36 |
+
import textwrap
|
37 |
+
|
38 |
+
|
39 |
+
import ipywidgets as widgets
|
40 |
+
from IPython.display import display, clear_output
|
41 |
+
|
42 |
+
from openai import OpenAI
|
43 |
+
import base64
|
44 |
+
import json
|
45 |
+
|
46 |
+
def number_objects_and_encode(idx, color_mask=False):
|
47 |
+
encoded_frames = {}
|
48 |
+
contoured_frames = {} # New dictionary for original images
|
49 |
+
vid_cat_cnts = {}
|
50 |
+
|
51 |
+
vid_meta = metas[idx]
|
52 |
+
vid_data = train_dataset[idx]
|
53 |
+
vid_id = vid_meta['video']
|
54 |
+
frame_indx = vid_meta['sample_indx']
|
55 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
56 |
+
imgs = vid_data[0]
|
57 |
+
|
58 |
+
for cat in cat_names:
|
59 |
+
cat_frames = []
|
60 |
+
contour_frames = []
|
61 |
+
frame_cat_cnts = {}
|
62 |
+
|
63 |
+
for i in range(imgs.size(0)):
|
64 |
+
frame_name = frame_indx[i]
|
65 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
66 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
67 |
+
|
68 |
+
frame_data = vid_data[2][frame_name]
|
69 |
+
obj_ids = list(frame_data.keys())
|
70 |
+
|
71 |
+
cat_cnt = 0
|
72 |
+
|
73 |
+
for j in range(len(obj_ids)):
|
74 |
+
obj_id = obj_ids[j]
|
75 |
+
obj_data = frame_data[obj_id]
|
76 |
+
obj_bbox = obj_data['bbox']
|
77 |
+
obj_valid = obj_data['valid']
|
78 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
79 |
+
obj_cat = obj_data['category_name']
|
80 |
+
|
81 |
+
if obj_cat == cat and obj_valid:
|
82 |
+
cat_cnt += 1
|
83 |
+
|
84 |
+
if color_mask == False:
|
85 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
86 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
87 |
+
for i, contour in enumerate(contours):
|
88 |
+
# 윤곽선 중심 계산
|
89 |
+
moments = cv2.moments(contour)
|
90 |
+
if moments["m00"] != 0: # 중심 계산 가능 여부 확인
|
91 |
+
cx = int(moments["m10"] / moments["m00"])
|
92 |
+
cy = int(moments["m01"] / moments["m00"])
|
93 |
+
else:
|
94 |
+
cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
|
95 |
+
|
96 |
+
# 텍스트 배경 (검은색 배경 만들기)
|
97 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
98 |
+
text = obj_id
|
99 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
100 |
+
text_w, text_h = text_size
|
101 |
+
|
102 |
+
# 텍스트 배경 그리기 (검은색 배경)
|
103 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
104 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
105 |
+
|
106 |
+
# 텍스트 그리기 (흰색 텍스트)
|
107 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
108 |
+
font, 1, (255, 255, 255), 2)
|
109 |
+
|
110 |
+
else:
|
111 |
+
alpha = 0.08
|
112 |
+
|
113 |
+
colored_obj_mask = np.zeros_like(frame)
|
114 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
115 |
+
frame[obj_mask == 1] = (
|
116 |
+
(1 - alpha) * frame[obj_mask == 1]
|
117 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
122 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
123 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
+
if len(contours) > 0:
|
128 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
129 |
+
M = cv2.moments(largest_contour)
|
130 |
+
if M["m00"] != 0:
|
131 |
+
center_x = int(M["m10"] / M["m00"])
|
132 |
+
center_y = int(M["m01"] / M["m00"])
|
133 |
+
else:
|
134 |
+
center_x, center_y = 0, 0
|
135 |
+
|
136 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
137 |
+
text = obj_id
|
138 |
+
|
139 |
+
font_scale = 0.9
|
140 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
141 |
+
text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
|
142 |
+
text_y = center_y
|
143 |
+
# text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
|
144 |
+
|
145 |
+
# 텍스트 배경 사각형 좌표 계산
|
146 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
|
147 |
+
# rect_end = (text_x + text_size[0] + 5, text_y + 5)
|
148 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
149 |
+
|
150 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
151 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
152 |
+
|
153 |
+
# plt.figure(figsize=(12, 8))
|
154 |
+
# plt.imshow(frame)
|
155 |
+
# plt.title(f"frame {frame_name}")
|
156 |
+
# plt.tight_layout()
|
157 |
+
# plt.axis('off')
|
158 |
+
# plt.show()
|
159 |
+
|
160 |
+
buffer = BytesIO()
|
161 |
+
frame = Image.fromarray(frame)
|
162 |
+
frame.save(buffer, format='jpeg')
|
163 |
+
buffer.seek(0)
|
164 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
165 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
166 |
+
|
167 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
168 |
+
buffer.truncate()
|
169 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
170 |
+
frame_for_contour.save(buffer, format='jpeg')
|
171 |
+
buffer.seek(0)
|
172 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
173 |
+
|
174 |
+
encoded_frames[cat] = cat_frames
|
175 |
+
contoured_frames[cat] = contour_frames
|
176 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
177 |
+
|
178 |
+
return encoded_frames, vid_cat_cnts, contoured_frames
|
179 |
+
|
180 |
+
|
181 |
+
def getCaption(idx, color_mask=True):
|
182 |
+
vid_meta = metas[idx]
|
183 |
+
vid_data = train_dataset[idx]
|
184 |
+
vid_id = vid_meta['video']
|
185 |
+
print(f"vid id: {vid_id}\n")
|
186 |
+
|
187 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
188 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
189 |
+
all_captions = dict()
|
190 |
+
|
191 |
+
base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
|
192 |
+
marked = "mask with boundary" if color_mask else "boundary"
|
193 |
+
|
194 |
+
for cat_name in list(cat_names) :
|
195 |
+
|
196 |
+
is_movable = False
|
197 |
+
if cat_name in ytvos_category_valid_list :
|
198 |
+
is_movable = True
|
199 |
+
|
200 |
+
if not is_movable:
|
201 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
202 |
+
|
203 |
+
|
204 |
+
image_captions = {}
|
205 |
+
captioner = OpenAI()
|
206 |
+
cat_base64_frames = base64_frames[cat_name]
|
207 |
+
cont_base64_frames = contoured_frames[cat_name]
|
208 |
+
|
209 |
+
for i in range(len(cat_base64_frames)):
|
210 |
+
frame_name = frame_indx[i]
|
211 |
+
cont_base64_image = cont_base64_frames[i]
|
212 |
+
base64_image = cat_base64_frames[i]
|
213 |
+
should_filter = False
|
214 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
215 |
+
|
216 |
+
if frame_cat_cnts >= 2:
|
217 |
+
should_filter = True
|
218 |
+
else:
|
219 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
220 |
+
|
221 |
+
if is_movable and should_filter:
|
222 |
+
#1단계: 필터링
|
223 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
224 |
+
caption_filter_text = f"""
|
225 |
+
You are a visual assistant analyzing a single frame from a video.
|
226 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
227 |
+
|
228 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
229 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
|
230 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
231 |
+
|
232 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
233 |
+
|
234 |
+
- Respond with "YES" if:
|
235 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
236 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
237 |
+
3) Each action is unambiguously recognizable and distinct.
|
238 |
+
|
239 |
+
- Respond with "NONE" if:
|
240 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
241 |
+
2) They show no noticeable action beyond standing or minor movements.
|
242 |
+
|
243 |
+
Answer strictly with either "YES" or "NONE".
|
244 |
+
"""
|
245 |
+
|
246 |
+
|
247 |
+
response1 = captioner.chat.completions.create(
|
248 |
+
model="chatgpt-4o-latest",
|
249 |
+
messages=[
|
250 |
+
{
|
251 |
+
"role": "user",
|
252 |
+
"content": [
|
253 |
+
{
|
254 |
+
"type": "text",
|
255 |
+
"text": caption_filter_text,
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"type": "image_url",
|
259 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
260 |
+
}
|
261 |
+
],
|
262 |
+
}
|
263 |
+
],
|
264 |
+
)
|
265 |
+
response_content = response1.choices[0].message.content
|
266 |
+
should_caption = True if "yes" in response_content.lower() else False
|
267 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
268 |
+
|
269 |
+
else:
|
270 |
+
should_caption = False
|
271 |
+
|
272 |
+
#2단계: dense caption 만들기
|
273 |
+
dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
|
274 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
275 |
+
I want to use your expressions to create a action-centric referring expression dataset.
|
276 |
+
Therefore, your expressions for these {cat_name}s should describe unique action of each object.
|
277 |
+
|
278 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
279 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
280 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
281 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
282 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
283 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
284 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
285 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
286 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
287 |
+
10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
|
288 |
+
11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
|
289 |
+
12. Do not mention object IDs.
|
290 |
+
13. Use '{cat_name}' as the noun for the referring expressions.
|
291 |
+
|
292 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
293 |
+
Output referring expressions for each object id.
|
294 |
+
"""
|
295 |
+
|
296 |
+
dense_caption_prompt = f"""
|
297 |
+
You are a visual assistant analyzing a single frame of a video.
|
298 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
299 |
+
I want to use your expressions to create a action-centric referring expression dataset.
|
300 |
+
Please describe each {cat_name} using **clearly observable** and **specific** actions.
|
301 |
+
|
302 |
+
## Guidelines:
|
303 |
+
1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
|
304 |
+
2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
|
305 |
+
3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
|
306 |
+
4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
|
307 |
+
Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
|
308 |
+
5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
|
309 |
+
6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
|
310 |
+
7. Base your description on the following action definitions:
|
311 |
+
- Facial with object manipulation
|
312 |
+
- General body movement, body position or pattern
|
313 |
+
- Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
|
314 |
+
- Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
|
315 |
+
|
316 |
+
## Output Format:
|
317 |
+
- For each labeled {cat_name}, output one line in the format:
|
318 |
+
ID. action-oriented description
|
319 |
+
|
320 |
+
Example:
|
321 |
+
1. a bear grasping the edge of a wood with its front paws
|
322 |
+
2. the bear pushing another bear, leaning forward
|
323 |
+
|
324 |
+
**Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
|
325 |
+
**Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
|
326 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
327 |
+
For each labeled {cat_name}, output referring expressions for each object id.
|
328 |
+
"""
|
329 |
+
if should_caption:
|
330 |
+
response2 = captioner.chat.completions.create(
|
331 |
+
model="chatgpt-4o-latest",
|
332 |
+
messages=[
|
333 |
+
{
|
334 |
+
"role": "user",
|
335 |
+
"content": [
|
336 |
+
{
|
337 |
+
"type": "text",
|
338 |
+
"text": dense_caption_prompt,
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"type": "image_url",
|
342 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
343 |
+
},
|
344 |
+
],
|
345 |
+
}
|
346 |
+
],
|
347 |
+
)
|
348 |
+
|
349 |
+
caption = response2.choices[0].message.content
|
350 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
351 |
+
else:
|
352 |
+
caption = None
|
353 |
+
|
354 |
+
image_captions[frame_name] = caption
|
355 |
+
all_captions[cat_name] = image_captions
|
356 |
+
|
357 |
+
# final : also prepare valid object ids
|
358 |
+
valid_obj_ids = dict()
|
359 |
+
|
360 |
+
for cat in cat_names:
|
361 |
+
if cat in ytvos_category_valid_list:
|
362 |
+
obj_id_cat = vid_meta['obj_id_cat']
|
363 |
+
valid_cat_ids = []
|
364 |
+
for obj_id in list(obj_id_cat.keys()):
|
365 |
+
if obj_id_cat[obj_id] == cat:
|
366 |
+
valid_cat_ids.append(obj_id)
|
367 |
+
valid_obj_ids[cat] = valid_cat_ids
|
368 |
+
|
369 |
+
return vid_id, all_captions, valid_obj_ids
|
370 |
+
|
371 |
+
|
372 |
+
if __name__ == '__main__':
|
373 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
374 |
+
parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
|
375 |
+
parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
|
376 |
+
|
377 |
+
args = parser.parse_args()
|
378 |
+
|
379 |
+
print(args.save_caption_path, flush=True)
|
380 |
+
print(args.save_valid_obj_ids_path, flush=True)
|
381 |
+
|
382 |
+
#==================데이터 불러오기===================
|
383 |
+
# 전체 데이터셋
|
384 |
+
train_dataset = build_ytvos_ref(image_set = 'train', args = args)
|
385 |
+
|
386 |
+
# 전체 데이터셋 메타데이터
|
387 |
+
metas = train_dataset.metas
|
388 |
+
|
389 |
+
# 색상 후보 8개 (RGB 형식)
|
390 |
+
colors = [
|
391 |
+
(255, 0, 0), # Red
|
392 |
+
(0, 255, 0), # Green
|
393 |
+
(0, 0, 255), # Blue
|
394 |
+
(255, 255, 0), # Yellow
|
395 |
+
(255, 0, 255), # Magenta
|
396 |
+
(0, 255, 255), # Cyan
|
397 |
+
(128, 0, 128), # Purple
|
398 |
+
(255, 165, 0) # Orange
|
399 |
+
]
|
400 |
+
|
401 |
+
ytvos_category_valid_list = [
|
402 |
+
'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
403 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
404 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
405 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
406 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
407 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
408 |
+
]
|
409 |
+
|
410 |
+
#==================gpt 돌리기===================
|
411 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
412 |
+
|
413 |
+
result_captions = {}
|
414 |
+
result_valid_obj_ids = {}
|
415 |
+
|
416 |
+
for i in range(370):
|
417 |
+
vid_id, all_captions, valid_obj_ids = getCaption(i, True)
|
418 |
+
|
419 |
+
if vid_id not in result_captions:
|
420 |
+
result_captions[vid_id] = all_captions
|
421 |
+
if vid_id not in result_valid_obj_ids:
|
422 |
+
result_valid_obj_ids[vid_id] = valid_obj_ids
|
423 |
+
|
424 |
+
print("Finished!", flush=True)
|
425 |
+
|
426 |
+
with open(args.save_caption_path, "w") as file:
|
427 |
+
json.dump(result_captions, file, indent=4)
|
428 |
+
|
429 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
430 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
.history/mbench/gpt_ref-ytvos_numbered_cy_20250130190713.py
ADDED
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from os import path as osp
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
9 |
+
import argparse
|
10 |
+
import opts
|
11 |
+
|
12 |
+
import sys
|
13 |
+
from pathlib import Path
|
14 |
+
import os
|
15 |
+
from os import path as osp
|
16 |
+
import skimage
|
17 |
+
from io import BytesIO
|
18 |
+
|
19 |
+
import numpy as np
|
20 |
+
import pandas as pd
|
21 |
+
import regex as re
|
22 |
+
import json
|
23 |
+
|
24 |
+
import cv2
|
25 |
+
from PIL import Image, ImageDraw
|
26 |
+
import torch
|
27 |
+
from torchvision.transforms import functional as F
|
28 |
+
|
29 |
+
from skimage import measure # (pip install scikit-image)
|
30 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
31 |
+
|
32 |
+
import matplotlib.pyplot as plt
|
33 |
+
import matplotlib.patches as patches
|
34 |
+
from matplotlib.collections import PatchCollection
|
35 |
+
from matplotlib.patches import Rectangle
|
36 |
+
import textwrap
|
37 |
+
|
38 |
+
|
39 |
+
import ipywidgets as widgets
|
40 |
+
from IPython.display import display, clear_output
|
41 |
+
|
42 |
+
from openai import OpenAI
|
43 |
+
import base64
|
44 |
+
import json
|
45 |
+
|
46 |
+
def number_objects_and_encode(idx, color_mask=False):
|
47 |
+
encoded_frames = {}
|
48 |
+
contoured_frames = {} # New dictionary for original images
|
49 |
+
vid_cat_cnts = {}
|
50 |
+
|
51 |
+
vid_meta = metas[idx]
|
52 |
+
vid_data = train_dataset[idx]
|
53 |
+
vid_id = vid_meta['video']
|
54 |
+
frame_indx = vid_meta['sample_indx']
|
55 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
56 |
+
imgs = vid_data[0]
|
57 |
+
|
58 |
+
for cat in cat_names:
|
59 |
+
cat_frames = []
|
60 |
+
contour_frames = []
|
61 |
+
frame_cat_cnts = {}
|
62 |
+
|
63 |
+
for i in range(imgs.size(0)):
|
64 |
+
frame_name = frame_indx[i]
|
65 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
66 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
67 |
+
|
68 |
+
frame_data = vid_data[2][frame_name]
|
69 |
+
obj_ids = list(frame_data.keys())
|
70 |
+
|
71 |
+
cat_cnt = 0
|
72 |
+
|
73 |
+
for j in range(len(obj_ids)):
|
74 |
+
obj_id = obj_ids[j]
|
75 |
+
obj_data = frame_data[obj_id]
|
76 |
+
obj_bbox = obj_data['bbox']
|
77 |
+
obj_valid = obj_data['valid']
|
78 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
79 |
+
obj_cat = obj_data['category_name']
|
80 |
+
|
81 |
+
if obj_cat == cat and obj_valid:
|
82 |
+
cat_cnt += 1
|
83 |
+
|
84 |
+
if color_mask == False:
|
85 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
86 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
87 |
+
for i, contour in enumerate(contours):
|
88 |
+
# 윤곽선 중심 계산
|
89 |
+
moments = cv2.moments(contour)
|
90 |
+
if moments["m00"] != 0: # 중심 계산 가능 여부 확인
|
91 |
+
cx = int(moments["m10"] / moments["m00"])
|
92 |
+
cy = int(moments["m01"] / moments["m00"])
|
93 |
+
else:
|
94 |
+
cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
|
95 |
+
|
96 |
+
# 텍스트 배경 (검은색 배경 만들기)
|
97 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
98 |
+
text = obj_id
|
99 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
100 |
+
text_w, text_h = text_size
|
101 |
+
|
102 |
+
# 텍스트 배경 그리기 (검은색 배경)
|
103 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
104 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
105 |
+
|
106 |
+
# 텍스트 그리기 (흰색 텍스트)
|
107 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
108 |
+
font, 1, (255, 255, 255), 2)
|
109 |
+
|
110 |
+
else:
|
111 |
+
alpha = 0.08
|
112 |
+
|
113 |
+
colored_obj_mask = np.zeros_like(frame)
|
114 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
115 |
+
frame[obj_mask == 1] = (
|
116 |
+
(1 - alpha) * frame[obj_mask == 1]
|
117 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
122 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
123 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
+
if len(contours) > 0:
|
128 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
129 |
+
M = cv2.moments(largest_contour)
|
130 |
+
if M["m00"] != 0:
|
131 |
+
center_x = int(M["m10"] / M["m00"])
|
132 |
+
center_y = int(M["m01"] / M["m00"])
|
133 |
+
else:
|
134 |
+
center_x, center_y = 0, 0
|
135 |
+
|
136 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
137 |
+
text = obj_id
|
138 |
+
|
139 |
+
font_scale = 0.9
|
140 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
141 |
+
text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
|
142 |
+
text_y = center_y
|
143 |
+
# text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
|
144 |
+
|
145 |
+
# 텍스트 배경 사각형 좌표 계산
|
146 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
|
147 |
+
# rect_end = (text_x + text_size[0] + 5, text_y + 5)
|
148 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
149 |
+
|
150 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
151 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
152 |
+
|
153 |
+
# plt.figure(figsize=(12, 8))
|
154 |
+
# plt.imshow(frame)
|
155 |
+
# plt.title(f"frame {frame_name}")
|
156 |
+
# plt.tight_layout()
|
157 |
+
# plt.axis('off')
|
158 |
+
# plt.show()
|
159 |
+
|
160 |
+
buffer = BytesIO()
|
161 |
+
frame = Image.fromarray(frame)
|
162 |
+
frame.save(buffer, format='jpeg')
|
163 |
+
buffer.seek(0)
|
164 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
165 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
166 |
+
|
167 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
168 |
+
buffer.truncate()
|
169 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
170 |
+
frame_for_contour.save(buffer, format='jpeg')
|
171 |
+
buffer.seek(0)
|
172 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
173 |
+
|
174 |
+
encoded_frames[cat] = cat_frames
|
175 |
+
contoured_frames[cat] = contour_frames
|
176 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
177 |
+
|
178 |
+
return encoded_frames, vid_cat_cnts, contoured_frames
|
179 |
+
|
180 |
+
|
181 |
+
def getCaption(idx, color_mask=True):
|
182 |
+
vid_meta = metas[idx]
|
183 |
+
vid_data = train_dataset[idx]
|
184 |
+
vid_id = vid_meta['video']
|
185 |
+
print(f"vid id: {vid_id}\n")
|
186 |
+
|
187 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
188 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
189 |
+
all_captions = dict()
|
190 |
+
|
191 |
+
base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
|
192 |
+
marked = "mask with boundary" if color_mask else "boundary"
|
193 |
+
|
194 |
+
for cat_name in list(cat_names) :
|
195 |
+
|
196 |
+
is_movable = False
|
197 |
+
if cat_name in ytvos_category_valid_list :
|
198 |
+
is_movable = True
|
199 |
+
|
200 |
+
if not is_movable:
|
201 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
202 |
+
|
203 |
+
|
204 |
+
image_captions = {}
|
205 |
+
captioner = OpenAI()
|
206 |
+
cat_base64_frames = base64_frames[cat_name]
|
207 |
+
cont_base64_frames = contoured_frames[cat_name]
|
208 |
+
|
209 |
+
for i in range(len(cat_base64_frames)):
|
210 |
+
frame_name = frame_indx[i]
|
211 |
+
cont_base64_image = cont_base64_frames[i]
|
212 |
+
base64_image = cat_base64_frames[i]
|
213 |
+
should_filter = False
|
214 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
215 |
+
|
216 |
+
if frame_cat_cnts >= 2:
|
217 |
+
should_filter = True
|
218 |
+
else:
|
219 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
220 |
+
|
221 |
+
if is_movable and should_filter:
|
222 |
+
#1단계: 필터링
|
223 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
224 |
+
caption_filter_text = f"""
|
225 |
+
You are a visual assistant analyzing a single frame from a video.
|
226 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
227 |
+
|
228 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
229 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
|
230 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
231 |
+
|
232 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
233 |
+
|
234 |
+
- Respond with "YES" if:
|
235 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
236 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
237 |
+
3) Each action is unambiguously recognizable and distinct.
|
238 |
+
|
239 |
+
- Respond with "NONE" if:
|
240 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
241 |
+
2) They show no noticeable action beyond standing or minor movements.
|
242 |
+
|
243 |
+
Answer strictly with either "YES" or "NONE".
|
244 |
+
"""
|
245 |
+
|
246 |
+
|
247 |
+
response1 = captioner.chat.completions.create(
|
248 |
+
model="chatgpt-4o-latest",
|
249 |
+
messages=[
|
250 |
+
{
|
251 |
+
"role": "user",
|
252 |
+
"content": [
|
253 |
+
{
|
254 |
+
"type": "text",
|
255 |
+
"text": caption_filter_text,
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"type": "image_url",
|
259 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
260 |
+
}
|
261 |
+
],
|
262 |
+
}
|
263 |
+
],
|
264 |
+
)
|
265 |
+
response_content = response1.choices[0].message.content
|
266 |
+
should_caption = True if "yes" in response_content.lower() else False
|
267 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
268 |
+
|
269 |
+
else:
|
270 |
+
should_caption = False
|
271 |
+
|
272 |
+
#2단계: dense caption 만들기
|
273 |
+
dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
|
274 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
275 |
+
I want to use your expressions to create a action-centric referring expression dataset.
|
276 |
+
Therefore, your expressions for these {cat_name}s should describe unique action of each object.
|
277 |
+
|
278 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
279 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
280 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
281 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
282 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
283 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
284 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
285 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
286 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
287 |
+
10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
|
288 |
+
11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
|
289 |
+
12. Do not mention object IDs.
|
290 |
+
13. Use '{cat_name}' as the noun for the referring expressions.
|
291 |
+
|
292 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
293 |
+
Output referring expressions for each object id.
|
294 |
+
"""
|
295 |
+
|
296 |
+
dense_caption_prompt = f"""
|
297 |
+
You are a visual assistant analyzing a single frame of a video.
|
298 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
299 |
+
I want to use your expressions to create a action-centric referring expression dataset.
|
300 |
+
Please describe each {cat_name} using **clearly observable** and **specific** actions.
|
301 |
+
|
302 |
+
## Guidelines:
|
303 |
+
1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
|
304 |
+
2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
|
305 |
+
3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
|
306 |
+
4. Do not use vague expressions like "interacting with something"** or "engaging with another object."
|
307 |
+
Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
|
308 |
+
5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
|
309 |
+
6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
|
310 |
+
7. Base your description on the following action definitions:
|
311 |
+
- Facial with object manipulation
|
312 |
+
- General body movement, body position or pattern
|
313 |
+
- Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
|
314 |
+
- Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
|
315 |
+
|
316 |
+
## Output Format:
|
317 |
+
- For each labeled {cat_name}, output one line in the format:
|
318 |
+
ID. action-oriented description
|
319 |
+
|
320 |
+
Example:
|
321 |
+
1. a bear grasping the edge of a wood with its front paws
|
322 |
+
2. the bear pushing another bear, leaning forward
|
323 |
+
|
324 |
+
**Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
|
325 |
+
**Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
|
326 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
327 |
+
For each labeled {cat_name}, output referring expressions for each object id.
|
328 |
+
"""
|
329 |
+
if should_caption:
|
330 |
+
response2 = captioner.chat.completions.create(
|
331 |
+
model="chatgpt-4o-latest",
|
332 |
+
messages=[
|
333 |
+
{
|
334 |
+
"role": "user",
|
335 |
+
"content": [
|
336 |
+
{
|
337 |
+
"type": "text",
|
338 |
+
"text": dense_caption_prompt,
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"type": "image_url",
|
342 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
343 |
+
},
|
344 |
+
],
|
345 |
+
}
|
346 |
+
],
|
347 |
+
)
|
348 |
+
|
349 |
+
caption = response2.choices[0].message.content
|
350 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
351 |
+
else:
|
352 |
+
caption = None
|
353 |
+
|
354 |
+
image_captions[frame_name] = caption
|
355 |
+
all_captions[cat_name] = image_captions
|
356 |
+
|
357 |
+
# final : also prepare valid object ids
|
358 |
+
valid_obj_ids = dict()
|
359 |
+
|
360 |
+
for cat in cat_names:
|
361 |
+
if cat in ytvos_category_valid_list:
|
362 |
+
obj_id_cat = vid_meta['obj_id_cat']
|
363 |
+
valid_cat_ids = []
|
364 |
+
for obj_id in list(obj_id_cat.keys()):
|
365 |
+
if obj_id_cat[obj_id] == cat:
|
366 |
+
valid_cat_ids.append(obj_id)
|
367 |
+
valid_obj_ids[cat] = valid_cat_ids
|
368 |
+
|
369 |
+
return vid_id, all_captions, valid_obj_ids
|
370 |
+
|
371 |
+
|
372 |
+
if __name__ == '__main__':
|
373 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
374 |
+
parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
|
375 |
+
parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
|
376 |
+
|
377 |
+
args = parser.parse_args()
|
378 |
+
|
379 |
+
print(args.save_caption_path, flush=True)
|
380 |
+
print(args.save_valid_obj_ids_path, flush=True)
|
381 |
+
|
382 |
+
#==================데이터 불러오기===================
|
383 |
+
# 전체 데이터셋
|
384 |
+
train_dataset = build_ytvos_ref(image_set = 'train', args = args)
|
385 |
+
|
386 |
+
# 전체 데이터셋 메타데이터
|
387 |
+
metas = train_dataset.metas
|
388 |
+
|
389 |
+
# 색상 후보 8개 (RGB 형식)
|
390 |
+
colors = [
|
391 |
+
(255, 0, 0), # Red
|
392 |
+
(0, 255, 0), # Green
|
393 |
+
(0, 0, 255), # Blue
|
394 |
+
(255, 255, 0), # Yellow
|
395 |
+
(255, 0, 255), # Magenta
|
396 |
+
(0, 255, 255), # Cyan
|
397 |
+
(128, 0, 128), # Purple
|
398 |
+
(255, 165, 0) # Orange
|
399 |
+
]
|
400 |
+
|
401 |
+
ytvos_category_valid_list = [
|
402 |
+
'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
403 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
404 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
405 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
406 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
407 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
408 |
+
]
|
409 |
+
|
410 |
+
#==================gpt 돌리기===================
|
411 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
412 |
+
|
413 |
+
result_captions = {}
|
414 |
+
result_valid_obj_ids = {}
|
415 |
+
|
416 |
+
for i in range(370):
|
417 |
+
vid_id, all_captions, valid_obj_ids = getCaption(i, True)
|
418 |
+
|
419 |
+
if vid_id not in result_captions:
|
420 |
+
result_captions[vid_id] = all_captions
|
421 |
+
if vid_id not in result_valid_obj_ids:
|
422 |
+
result_valid_obj_ids[vid_id] = valid_obj_ids
|
423 |
+
|
424 |
+
print("Finished!", flush=True)
|
425 |
+
|
426 |
+
with open(args.save_caption_path, "w") as file:
|
427 |
+
json.dump(result_captions, file, indent=4)
|
428 |
+
|
429 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
430 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
.history/mbench/gpt_ref-ytvos_numbered_cy_20250131124156.py
ADDED
@@ -0,0 +1,427 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from os import path as osp
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
9 |
+
import argparse
|
10 |
+
import opts
|
11 |
+
|
12 |
+
import sys
|
13 |
+
from pathlib import Path
|
14 |
+
import os
|
15 |
+
from os import path as osp
|
16 |
+
import skimage
|
17 |
+
from io import BytesIO
|
18 |
+
|
19 |
+
import numpy as np
|
20 |
+
import pandas as pd
|
21 |
+
import regex as re
|
22 |
+
import json
|
23 |
+
|
24 |
+
import cv2
|
25 |
+
from PIL import Image, ImageDraw
|
26 |
+
import torch
|
27 |
+
from torchvision.transforms import functional as F
|
28 |
+
|
29 |
+
from skimage import measure # (pip install scikit-image)
|
30 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
31 |
+
|
32 |
+
import matplotlib.pyplot as plt
|
33 |
+
import matplotlib.patches as patches
|
34 |
+
from matplotlib.collections import PatchCollection
|
35 |
+
from matplotlib.patches import Rectangle
|
36 |
+
import textwrap
|
37 |
+
|
38 |
+
|
39 |
+
import ipywidgets as widgets
|
40 |
+
from IPython.display import display, clear_output
|
41 |
+
|
42 |
+
from openai import OpenAI
|
43 |
+
import base64
|
44 |
+
import json
|
45 |
+
|
46 |
+
def number_objects_and_encode(idx, color_mask=False):
|
47 |
+
encoded_frames = {}
|
48 |
+
contoured_frames = {} # New dictionary for original images
|
49 |
+
vid_cat_cnts = {}
|
50 |
+
|
51 |
+
vid_meta = metas[idx]
|
52 |
+
vid_data = train_dataset[idx]
|
53 |
+
vid_id = vid_meta['video']
|
54 |
+
frame_indx = vid_meta['sample_indx']
|
55 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
56 |
+
imgs = vid_data[0]
|
57 |
+
|
58 |
+
for cat in cat_names:
|
59 |
+
cat_frames = []
|
60 |
+
contour_frames = []
|
61 |
+
frame_cat_cnts = {}
|
62 |
+
|
63 |
+
for i in range(imgs.size(0)):
|
64 |
+
frame_name = frame_indx[i]
|
65 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
66 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
67 |
+
|
68 |
+
frame_data = vid_data[2][frame_name]
|
69 |
+
obj_ids = list(frame_data.keys())
|
70 |
+
|
71 |
+
cat_cnt = 0
|
72 |
+
|
73 |
+
for j in range(len(obj_ids)):
|
74 |
+
obj_id = obj_ids[j]
|
75 |
+
obj_data = frame_data[obj_id]
|
76 |
+
obj_bbox = obj_data['bbox']
|
77 |
+
obj_valid = obj_data['valid']
|
78 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
79 |
+
obj_cat = obj_data['category_name']
|
80 |
+
|
81 |
+
if obj_cat == cat and obj_valid:
|
82 |
+
cat_cnt += 1
|
83 |
+
|
84 |
+
if color_mask == False:
|
85 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
86 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
87 |
+
for i, contour in enumerate(contours):
|
88 |
+
# 윤곽선 중심 계산
|
89 |
+
moments = cv2.moments(contour)
|
90 |
+
if moments["m00"] != 0: # 중심 계산 가능 여부 확인
|
91 |
+
cx = int(moments["m10"] / moments["m00"])
|
92 |
+
cy = int(moments["m01"] / moments["m00"])
|
93 |
+
else:
|
94 |
+
cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
|
95 |
+
|
96 |
+
# 텍스트 배경 (검은색 배경 만들기)
|
97 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
98 |
+
text = obj_id
|
99 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
100 |
+
text_w, text_h = text_size
|
101 |
+
|
102 |
+
# 텍스트 배경 그리기 (검은색 배경)
|
103 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
104 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
105 |
+
|
106 |
+
# 텍스트 그리기 (흰색 텍스트)
|
107 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
108 |
+
font, 1, (255, 255, 255), 2)
|
109 |
+
|
110 |
+
else:
|
111 |
+
alpha = 0.08
|
112 |
+
|
113 |
+
colored_obj_mask = np.zeros_like(frame)
|
114 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
115 |
+
frame[obj_mask == 1] = (
|
116 |
+
(1 - alpha) * frame[obj_mask == 1]
|
117 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
122 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
123 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
+
if len(contours) > 0:
|
128 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
129 |
+
M = cv2.moments(largest_contour)
|
130 |
+
if M["m00"] != 0:
|
131 |
+
center_x = int(M["m10"] / M["m00"])
|
132 |
+
center_y = int(M["m01"] / M["m00"])
|
133 |
+
else:
|
134 |
+
center_x, center_y = 0, 0
|
135 |
+
|
136 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
137 |
+
text = obj_id
|
138 |
+
|
139 |
+
font_scale = 0.9
|
140 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
141 |
+
text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
|
142 |
+
text_y = center_y
|
143 |
+
# text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
|
144 |
+
|
145 |
+
# 텍스트 배경 사각형 좌표 계산
|
146 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
|
147 |
+
# rect_end = (text_x + text_size[0] + 5, text_y + 5)
|
148 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
149 |
+
|
150 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
151 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
152 |
+
|
153 |
+
# plt.figure(figsize=(12, 8))
|
154 |
+
# plt.imshow(frame)
|
155 |
+
# plt.title(f"frame {frame_name}")
|
156 |
+
# plt.tight_layout()
|
157 |
+
# plt.axis('off')
|
158 |
+
# plt.show()
|
159 |
+
|
160 |
+
buffer = BytesIO()
|
161 |
+
frame = Image.fromarray(frame)
|
162 |
+
frame.save(buffer, format='jpeg')
|
163 |
+
buffer.seek(0)
|
164 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
165 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
166 |
+
|
167 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
168 |
+
buffer.truncate()
|
169 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
170 |
+
frame_for_contour.save(buffer, format='jpeg')
|
171 |
+
buffer.seek(0)
|
172 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
173 |
+
|
174 |
+
encoded_frames[cat] = cat_frames
|
175 |
+
contoured_frames[cat] = contour_frames
|
176 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
177 |
+
|
178 |
+
return encoded_frames, vid_cat_cnts, contoured_frames
|
179 |
+
|
180 |
+
|
181 |
+
def getCaption(idx, color_mask=True):
|
182 |
+
vid_meta = metas[idx]
|
183 |
+
vid_data = train_dataset[idx]
|
184 |
+
vid_id = vid_meta['video']
|
185 |
+
print(f"vid id: {vid_id}\n")
|
186 |
+
|
187 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
188 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
189 |
+
all_captions = dict()
|
190 |
+
|
191 |
+
base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
|
192 |
+
marked = "mask with boundary" if color_mask else "boundary"
|
193 |
+
|
194 |
+
for cat_name in list(cat_names) :
|
195 |
+
|
196 |
+
is_movable = False
|
197 |
+
if cat_name in ytvos_category_valid_list :
|
198 |
+
is_movable = True
|
199 |
+
|
200 |
+
if not is_movable:
|
201 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
202 |
+
|
203 |
+
|
204 |
+
image_captions = {}
|
205 |
+
captioner = OpenAI()
|
206 |
+
cat_base64_frames = base64_frames[cat_name]
|
207 |
+
cont_base64_frames = contoured_frames[cat_name]
|
208 |
+
|
209 |
+
for i in range(len(cat_base64_frames)):
|
210 |
+
frame_name = frame_indx[i]
|
211 |
+
cont_base64_image = cont_base64_frames[i]
|
212 |
+
base64_image = cat_base64_frames[i]
|
213 |
+
should_filter = False
|
214 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
215 |
+
|
216 |
+
if frame_cat_cnts >= 2:
|
217 |
+
should_filter = True
|
218 |
+
else:
|
219 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
220 |
+
|
221 |
+
if is_movable and should_filter:
|
222 |
+
#1단계: 필터링
|
223 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
224 |
+
caption_filter_text = f"""
|
225 |
+
You are a visual assistant analyzing a single frame from a video.
|
226 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
227 |
+
|
228 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
229 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
|
230 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
231 |
+
|
232 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
233 |
+
|
234 |
+
- Respond with "YES" if:
|
235 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
236 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
237 |
+
3) Each action is unambiguously recognizable and distinct.
|
238 |
+
|
239 |
+
- Respond with "NONE" if:
|
240 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
241 |
+
2) They show no noticeable action beyond standing or minor movements.
|
242 |
+
|
243 |
+
Answer strictly with either "YES" or "NONE".
|
244 |
+
"""
|
245 |
+
|
246 |
+
|
247 |
+
response1 = captioner.chat.completions.create(
|
248 |
+
model="chatgpt-4o-latest",
|
249 |
+
messages=[
|
250 |
+
{
|
251 |
+
"role": "user",
|
252 |
+
"content": [
|
253 |
+
{
|
254 |
+
"type": "text",
|
255 |
+
"text": caption_filter_text,
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"type": "image_url",
|
259 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
260 |
+
}
|
261 |
+
],
|
262 |
+
}
|
263 |
+
],
|
264 |
+
)
|
265 |
+
response_content = response1.choices[0].message.content
|
266 |
+
should_caption = True if "yes" in response_content.lower() else False
|
267 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
268 |
+
|
269 |
+
else:
|
270 |
+
should_caption = False
|
271 |
+
|
272 |
+
            # Step 2: build the dense caption
            dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Therefore, your expressions for these {cat_name}s should describe unique action of each object.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
            10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
            11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
            12. Do not mention object IDs.
            13. Use '{cat_name}' as the noun for the referring expressions.

            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            Output referring expressions for each object id.
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Please describe each {cat_name} using **clearly observable** and **specific** actions.

            ## Guidelines:
            1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
            2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
            3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
            4. Do not use vague expressions like "interacting with something" or "engaging with another object."
               Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
            5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
            6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
            7. Base your description on the following action definitions:
               - Facial with object manipulation
               - General body movement, body position or pattern
               - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
               - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").

            ## Output Format:
            - For each labeled {cat_name}, output one line in the format:
              ID. action-oriented description

            Example:
            1. a bear grasping the edge of a wood with its front paws
            2. the bear pushing another bear, leaning forward

            **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
            **Do not mention object IDs** in the text of your sentence, just use them as labels for your output lines.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            For each labeled {cat_name}, output referring expressions for each object id.
            """
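            # --- Editor's note (not part of the original file): the prompt above asks
            # for one "object id. description" line per labeled object. Assuming the
            # model follows that format, the reply can be split into per-object
            # expressions with a small regex, e.g.:
            #
            #     import re
            #     expressions = {
            #         m.group(1): m.group(2).strip()
            #         for m in (re.match(r"\s*(\d+)\.\s*(.+)", line)
            #                   for line in caption.splitlines())
            #         if m
            #     }
            #
            # which would map "1", "2", ... to their action-centric descriptions.
            # The script below simply stores the raw caption string per frame. ---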
            if should_caption:
                response2 = captioner.chat.completions.create(
                    model="chatgpt-4o-latest",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": dense_caption_prompt,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                },
                            ],
                        }
                    ],
                )

                caption = response2.choices[0].message.content
                #print(f"{image_path} - {frame_name}: {caption}")
            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids

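# Editor's sketch, not part of the original file: the filtering step above treats
# any reply containing the substring "yes" as a positive answer. Assuming the model
# honors the strict YES/NONE instruction in the prompt, a tighter check could look
# like the helper below (the name `parse_filter_reply` is hypothetical).
def parse_filter_reply(response_content: str) -> bool:
    """Map a strict YES/NONE reply onto a boolean, with a permissive fallback."""
    reply = response_content.strip().strip('"').strip("'").upper()
    if reply.startswith("YES"):
        return True
    if reply.startswith("NONE"):
        return False
    # Unexpected wording: fall back to the original substring check.
    return "yes" in response_content.lower()
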
if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")

    args = parser.parse_args()

    # ================== Load data ==================
    # full dataset
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)

    # metadata for the full dataset
    metas = train_dataset.metas

    # 8 candidate colors (RGB format)
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 0, 128),  # Purple
        (255, 165, 0)   # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    # ================== Run GPT ==================
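    # Editor's note (not part of the original file): OpenAI() reads OPENAI_API_KEY
    # from the environment, so the hardcoded assignment below can be replaced by
    # exporting the key before launching the script, e.g.
    #
    #     export OPENAI_API_KEY=<your key>
    #     python mbench/gpt_ref-ytvos_numbered_cy.py   # script path assumed
    #
    # or by passing it explicitly: OpenAI(api_key=os.environ["OPENAI_API_KEY"]).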
os.environ['OPENAI_API_KEY'] = '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'

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(370):
        vid_id, all_captions, valid_obj_ids = getCaption(i, True)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
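A minimal post-processing sketch, not part of the repository: it reads the two JSON files written above (paths follow the argparse defaults) and pairs each non-empty caption with the valid object ids of its category. The variable names are illustrative.

import json

with open("mbench/numbered_captions.json") as f:
    captions = json.load(f)      # {video_id: {category: {frame: caption or None}}}
with open("mbench/numbered_valid_obj_ids.json") as f:
    valid_ids = json.load(f)     # {video_id: {category: [object ids]}}

for vid_id, per_cat in captions.items():
    for cat, frames in per_cat.items():
        ids = valid_ids.get(vid_id, {}).get(cat, [])
        for frame, caption in frames.items():
            if caption:
                print(vid_id, cat, frame, ids)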
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140343.py  ADDED  @@ -0,0 +1,460 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from os import path as osp
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
9 |
+
import argparse
|
10 |
+
import opts
|
11 |
+
|
12 |
+
import sys
|
13 |
+
from pathlib import Path
|
14 |
+
import os
|
15 |
+
from os import path as osp
|
16 |
+
import skimage
|
17 |
+
from io import BytesIO
|
18 |
+
|
19 |
+
import numpy as np
|
20 |
+
import pandas as pd
|
21 |
+
import regex as re
|
22 |
+
import json
|
23 |
+
|
24 |
+
import cv2
|
25 |
+
from PIL import Image, ImageDraw
|
26 |
+
import torch
|
27 |
+
from torchvision.transforms import functional as F
|
28 |
+
|
29 |
+
from skimage import measure # (pip install scikit-image)
|
30 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
31 |
+
|
32 |
+
import matplotlib.pyplot as plt
|
33 |
+
import matplotlib.patches as patches
|
34 |
+
from matplotlib.collections import PatchCollection
|
35 |
+
from matplotlib.patches import Rectangle
|
36 |
+
import textwrap
|
37 |
+
|
38 |
+
|
39 |
+
import ipywidgets as widgets
|
40 |
+
from IPython.display import display, clear_output
|
41 |
+
|
42 |
+
from openai import OpenAI
|
43 |
+
import base64
|
44 |
+
import json
|
45 |
+
|
46 |
+
def number_objects_and_encode(idx, color_mask=False):
|
47 |
+
encoded_frames = {}
|
48 |
+
contoured_frames = {} # New dictionary for original images
|
49 |
+
vid_cat_cnts = {}
|
50 |
+
|
51 |
+
vid_meta = metas[idx]
|
52 |
+
vid_data = train_dataset[idx]
|
53 |
+
vid_id = vid_meta['video']
|
54 |
+
frame_indx = vid_meta['sample_indx']
|
55 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
56 |
+
imgs = vid_data[0]
|
57 |
+
|
58 |
+
for cat in cat_names:
|
59 |
+
cat_frames = []
|
60 |
+
contour_frames = []
|
61 |
+
frame_cat_cnts = {}
|
62 |
+
|
63 |
+
for i in range(imgs.size(0)):
|
64 |
+
frame_name = frame_indx[i]
|
65 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
66 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
67 |
+
|
68 |
+
frame_data = vid_data[2][frame_name]
|
69 |
+
obj_ids = list(frame_data.keys())
|
70 |
+
|
71 |
+
cat_cnt = 0
|
72 |
+
|
73 |
+
for j in range(len(obj_ids)):
|
74 |
+
obj_id = obj_ids[j]
|
75 |
+
obj_data = frame_data[obj_id]
|
76 |
+
obj_bbox = obj_data['bbox']
|
77 |
+
obj_valid = obj_data['valid']
|
78 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
79 |
+
obj_cat = obj_data['category_name']
|
80 |
+
|
81 |
+
if obj_cat == cat and obj_valid:
|
82 |
+
cat_cnt += 1
|
83 |
+
|
84 |
+
if color_mask == False:
|
85 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
86 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
87 |
+
for i, contour in enumerate(contours):
|
88 |
+
# 윤곽선 중심 계산
|
89 |
+
moments = cv2.moments(contour)
|
90 |
+
if moments["m00"] != 0: # 중심 계산 가능 여부 확인
|
91 |
+
cx = int(moments["m10"] / moments["m00"])
|
92 |
+
cy = int(moments["m01"] / moments["m00"])
|
93 |
+
else:
|
94 |
+
cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
|
95 |
+
|
96 |
+
# 텍스트 배경 (검은색 배경 만들기)
|
97 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
98 |
+
text = obj_id
|
99 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
100 |
+
text_w, text_h = text_size
|
101 |
+
|
102 |
+
# 텍스트 배경 그리기 (검은색 배경)
|
103 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
104 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
105 |
+
|
106 |
+
# 텍스트 그리기 (흰색 텍스트)
|
107 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
108 |
+
font, 1, (255, 255, 255), 2)
|
109 |
+
|
110 |
+
else:
|
111 |
+
alpha = 0.08
|
112 |
+
|
113 |
+
colored_obj_mask = np.zeros_like(frame)
|
114 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
115 |
+
frame[obj_mask == 1] = (
|
116 |
+
(1 - alpha) * frame[obj_mask == 1]
|
117 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
122 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
123 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
+
if len(contours) > 0:
|
128 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
129 |
+
M = cv2.moments(largest_contour)
|
130 |
+
if M["m00"] != 0:
|
131 |
+
center_x = int(M["m10"] / M["m00"])
|
132 |
+
center_y = int(M["m01"] / M["m00"])
|
133 |
+
else:
|
134 |
+
center_x, center_y = 0, 0
|
135 |
+
|
136 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
137 |
+
text = obj_id
|
138 |
+
|
139 |
+
font_scale = 0.9
|
140 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
141 |
+
text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
|
142 |
+
text_y = center_y
|
143 |
+
# text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
|
144 |
+
|
145 |
+
# 텍스트 배경 사각형 좌표 계산
|
146 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
|
147 |
+
# rect_end = (text_x + text_size[0] + 5, text_y + 5)
|
148 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
149 |
+
|
150 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
151 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
152 |
+
|
153 |
+
# plt.figure(figsize=(12, 8))
|
154 |
+
# plt.imshow(frame)
|
155 |
+
# plt.title(f"frame {frame_name}")
|
156 |
+
# plt.tight_layout()
|
157 |
+
# plt.axis('off')
|
158 |
+
# plt.show()
|
159 |
+
|
160 |
+
buffer = BytesIO()
|
161 |
+
frame = Image.fromarray(frame)
|
162 |
+
frame.save(buffer, format='jpeg')
|
163 |
+
buffer.seek(0)
|
164 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
165 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
166 |
+
|
167 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
168 |
+
buffer.truncate()
|
169 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
170 |
+
frame_for_contour.save(buffer, format='jpeg')
|
171 |
+
buffer.seek(0)
|
172 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
173 |
+
|
174 |
+
encoded_frames[cat] = cat_frames
|
175 |
+
contoured_frames[cat] = contour_frames
|
176 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
177 |
+
|
178 |
+
return encoded_frames, vid_cat_cnts, contoured_frames
|
179 |
+
|
180 |
+
|
181 |
+
def getCaption(idx, model='gpt-4o-mini', color_mask=True):
|
182 |
+
vid_meta = metas[idx]
|
183 |
+
vid_data = train_dataset[idx]
|
184 |
+
vid_id = vid_meta['video']
|
185 |
+
print(f"vid id: {vid_id}\n")
|
186 |
+
|
187 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
188 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
189 |
+
all_captions = dict()
|
190 |
+
|
191 |
+
base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
|
192 |
+
marked = "mask with boundary" if color_mask else "boundary"
|
193 |
+
|
194 |
+
for cat_name in list(cat_names) :
|
195 |
+
|
196 |
+
is_movable = False
|
197 |
+
if cat_name in ytvos_category_valid_list :
|
198 |
+
is_movable = True
|
199 |
+
|
200 |
+
if not is_movable:
|
201 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
202 |
+
|
203 |
+
|
204 |
+
image_captions = {}
|
205 |
+
captioner = OpenAI()
|
206 |
+
cat_base64_frames = base64_frames[cat_name]
|
207 |
+
cont_base64_frames = contoured_frames[cat_name]
|
208 |
+
|
209 |
+
for i in range(len(cat_base64_frames)):
|
210 |
+
frame_name = frame_indx[i]
|
211 |
+
cont_base64_image = cont_base64_frames[i]
|
212 |
+
base64_image = cat_base64_frames[i]
|
213 |
+
should_filter = False
|
214 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
215 |
+
|
216 |
+
if frame_cat_cnts >= 2:
|
217 |
+
should_filter = True
|
218 |
+
else:
|
219 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
220 |
+
|
221 |
+
if is_movable and should_filter:
|
222 |
+
#1단계: 필터링
|
223 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
224 |
+
caption_filter_text = f"""
|
225 |
+
You are a visual assistant analyzing a single frame from a video.
|
226 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
227 |
+
|
228 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
229 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
|
230 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
231 |
+
|
232 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
233 |
+
|
234 |
+
- Respond with "YES" if:
|
235 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
236 |
+
(e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
|
237 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
238 |
+
3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
|
239 |
+
|
240 |
+
- Respond with "NONE" if:
|
241 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
242 |
+
2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
|
243 |
+
3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
|
244 |
+
|
245 |
+
Answer strictly with either "YES" or "NONE".
|
246 |
+
"""
|
247 |
+
|
248 |
+
response1 = captioner.chat.completions.create(
|
249 |
+
# model="chatgpt-4o-latest",
|
250 |
+
model=model,
|
251 |
+
messages=[
|
252 |
+
{
|
253 |
+
"role": "user",
|
254 |
+
"content": [
|
255 |
+
{
|
256 |
+
"type": "text",
|
257 |
+
"text": caption_filter_text,
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"type": "image_url",
|
261 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
262 |
+
}
|
263 |
+
],
|
264 |
+
}
|
265 |
+
],
|
266 |
+
)
|
267 |
+
response_content = response1.choices[0].message.content
|
268 |
+
should_caption = True if "yes" in response_content.lower() else False
|
269 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
270 |
+
|
271 |
+
else:
|
272 |
+
should_caption = False
|
273 |
+
|
274 |
+
#2단계: dense caption 만들기
|
275 |
+
dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
|
276 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
277 |
+
I want to use your expressions to create a action-centric referring expression dataset.
|
278 |
+
Therefore, your expressions for these {cat_name}s should describe unique action of each object.
|
279 |
+
|
280 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
281 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
282 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
283 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
284 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
285 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
286 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
287 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
288 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
289 |
+
10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
|
290 |
+
11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
|
291 |
+
12. Do not mention object IDs.
|
292 |
+
13. Use '{cat_name}' as the noun for the referring expressions.
|
293 |
+
|
294 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
295 |
+
Output referring expressions for each object id.
|
296 |
+
"""
|
297 |
+
|
298 |
+
dense_caption_prompt = f"""
|
299 |
+
You are a visual assistant analyzing a single frame of a video.
|
300 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
301 |
+
|
302 |
+
I want to use your expressions to create an **action-centric referring expression** dataset.
|
303 |
+
Please describe each {cat_name} using **clearly observable** and **specific** actions.
|
304 |
+
|
305 |
+
---
|
306 |
+
## Guidelines:
|
307 |
+
1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
|
308 |
+
2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
|
309 |
+
3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
|
310 |
+
4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
|
311 |
+
5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
|
312 |
+
6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
|
313 |
+
7. Base your description on these action definitions:
|
314 |
+
- Avoid using term 'minimal' or 'slightly'.
|
315 |
+
- General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
|
316 |
+
- details such as motion and intention, facial with object manipulation
|
317 |
+
- movements with objects or other entities when they are prominent and observable. expression should be specific.
|
318 |
+
(e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
|
319 |
+
---
|
320 |
+
|
321 |
+
## Output Format:
|
322 |
+
- For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
|
323 |
+
object id. using {cat_name} as subject noun, action-oriented description
|
324 |
+
(e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
|
325 |
+
- **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
|
326 |
+
|
327 |
+
### Example
|
328 |
+
If the frame has 2 labeled bears, your output should look like:
|
329 |
+
1. the bear reaching his right arm while leaning forward to capture the prey
|
330 |
+
2. a bear standing upright facing right, touching the bike aside
|
331 |
+
|
332 |
+
---
|
333 |
+
**Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
|
334 |
+
**Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
|
335 |
+
**Do not include markdown** in the output.
|
336 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
337 |
+
For each labeled {cat_name}, output referring expressions for each object id.
|
338 |
+
"""
|
339 |
+
MAX_RETRIES = 2
|
340 |
+
retry_count = 0
|
341 |
+
|
342 |
+
if should_caption:
|
343 |
+
while retry_count < MAX_RETRIES:
|
344 |
+
|
345 |
+
response2 = captioner.chat.completions.create(
|
346 |
+
model=model,
|
347 |
+
messages=[
|
348 |
+
{
|
349 |
+
"role": "user",
|
350 |
+
"content": [
|
351 |
+
{
|
352 |
+
"type": "text",
|
353 |
+
"text": dense_caption_prompt,
|
354 |
+
},
|
355 |
+
{
|
356 |
+
"type": "image_url",
|
357 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
358 |
+
},
|
359 |
+
],
|
360 |
+
}
|
361 |
+
],
|
362 |
+
)
|
363 |
+
|
364 |
+
# caption = response2.choices[0].message.content
|
365 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
366 |
+
|
367 |
+
caption = response2.choices[0].message.content.strip()
|
368 |
+
caption_lower = caption.lower().lstrip()
|
369 |
+
|
370 |
+
if caption_lower.startswith("1.") and not any(
|
371 |
+
phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
|
372 |
+
):
|
373 |
+
break
|
374 |
+
|
375 |
+
print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
|
376 |
+
retry_count += 1
|
377 |
+
time.sleep(2)
|
378 |
+
|
379 |
+
if retry_count == MAX_RETRIES:
|
380 |
+
caption = None
|
381 |
+
print("Max retries reached. Caption generation failed.")
|
382 |
+
|
383 |
+
else:
|
384 |
+
caption = None
|
385 |
+
|
386 |
+
image_captions[frame_name] = caption
|
387 |
+
all_captions[cat_name] = image_captions
|
388 |
+
|
389 |
+
# final : also prepare valid object ids
|
390 |
+
valid_obj_ids = dict()
|
391 |
+
|
392 |
+
for cat in cat_names:
|
393 |
+
if cat in ytvos_category_valid_list:
|
394 |
+
obj_id_cat = vid_meta['obj_id_cat']
|
395 |
+
valid_cat_ids = []
|
396 |
+
for obj_id in list(obj_id_cat.keys()):
|
397 |
+
if obj_id_cat[obj_id] == cat:
|
398 |
+
valid_cat_ids.append(obj_id)
|
399 |
+
valid_obj_ids[cat] = valid_cat_ids
|
400 |
+
|
401 |
+
return all_captions, valid_obj_ids
|
402 |
+
|
403 |
+
|
404 |
+
|
405 |
+
if __name__ == '__main__':
|
406 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
407 |
+
parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
|
408 |
+
parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
|
409 |
+
|
410 |
+
args = parser.parse_args()
|
411 |
+
|
412 |
+
#==================데이터 불러오기===================
|
413 |
+
# 전체 데이터셋
|
414 |
+
train_dataset = build_ytvos_ref(image_set = 'train', args = args)
|
415 |
+
|
416 |
+
# 전체 데이터셋 메타데이터
|
417 |
+
metas = train_dataset.metas
|
418 |
+
|
419 |
+
# 색상 후보 8개 (RGB 형식)
|
420 |
+
colors = [
|
421 |
+
(255, 0, 0), # Red
|
422 |
+
(0, 255, 0), # Green
|
423 |
+
(0, 0, 255), # Blue
|
424 |
+
(255, 255, 0), # Yellow
|
425 |
+
(255, 0, 255), # Magenta
|
426 |
+
(0, 255, 255), # Cyan
|
427 |
+
(128, 0, 128), # Purple
|
428 |
+
(255, 165, 0) # Orange
|
429 |
+
]
|
430 |
+
|
431 |
+
ytvos_category_valid_list = [
|
432 |
+
'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
433 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
434 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
435 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
436 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
437 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
438 |
+
]
|
439 |
+
|
440 |
+
#==================gpt 돌리기===================
|
441 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
442 |
+
|
443 |
+
result_captions = {}
|
444 |
+
result_valid_obj_ids = {}
|
445 |
+
|
446 |
+
for i in range(370):
|
447 |
+
vid_id, all_captions, valid_obj_ids = getCaption(i, True)
|
448 |
+
|
449 |
+
if vid_id not in result_captions:
|
450 |
+
result_captions[vid_id] = all_captions
|
451 |
+
if vid_id not in result_valid_obj_ids:
|
452 |
+
result_valid_obj_ids[vid_id] = valid_obj_ids
|
453 |
+
|
454 |
+
print("Finished!", flush=True)
|
455 |
+
|
456 |
+
with open(args.save_caption_path, "w") as file:
|
457 |
+
json.dump(result_captions, file, indent=4)
|
458 |
+
|
459 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
460 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201140413.py  ADDED  @@ -0,0 +1,460 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from os import path as osp
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
9 |
+
import argparse
|
10 |
+
import opts
|
11 |
+
|
12 |
+
import sys
|
13 |
+
from pathlib import Path
|
14 |
+
import os
|
15 |
+
from os import path as osp
|
16 |
+
import skimage
|
17 |
+
from io import BytesIO
|
18 |
+
|
19 |
+
import numpy as np
|
20 |
+
import pandas as pd
|
21 |
+
import regex as re
|
22 |
+
import json
|
23 |
+
|
24 |
+
import cv2
|
25 |
+
from PIL import Image, ImageDraw
|
26 |
+
import torch
|
27 |
+
from torchvision.transforms import functional as F
|
28 |
+
|
29 |
+
from skimage import measure # (pip install scikit-image)
|
30 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
31 |
+
|
32 |
+
import matplotlib.pyplot as plt
|
33 |
+
import matplotlib.patches as patches
|
34 |
+
from matplotlib.collections import PatchCollection
|
35 |
+
from matplotlib.patches import Rectangle
|
36 |
+
import textwrap
|
37 |
+
|
38 |
+
|
39 |
+
import ipywidgets as widgets
|
40 |
+
from IPython.display import display, clear_output
|
41 |
+
|
42 |
+
from openai import OpenAI
|
43 |
+
import base64
|
44 |
+
import json
|
45 |
+
|
46 |
+
def number_objects_and_encode(idx, color_mask=False):
|
47 |
+
encoded_frames = {}
|
48 |
+
contoured_frames = {} # New dictionary for original images
|
49 |
+
vid_cat_cnts = {}
|
50 |
+
|
51 |
+
vid_meta = metas[idx]
|
52 |
+
vid_data = train_dataset[idx]
|
53 |
+
vid_id = vid_meta['video']
|
54 |
+
frame_indx = vid_meta['sample_indx']
|
55 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
56 |
+
imgs = vid_data[0]
|
57 |
+
|
58 |
+
for cat in cat_names:
|
59 |
+
cat_frames = []
|
60 |
+
contour_frames = []
|
61 |
+
frame_cat_cnts = {}
|
62 |
+
|
63 |
+
for i in range(imgs.size(0)):
|
64 |
+
frame_name = frame_indx[i]
|
65 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
66 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
67 |
+
|
68 |
+
frame_data = vid_data[2][frame_name]
|
69 |
+
obj_ids = list(frame_data.keys())
|
70 |
+
|
71 |
+
cat_cnt = 0
|
72 |
+
|
73 |
+
for j in range(len(obj_ids)):
|
74 |
+
obj_id = obj_ids[j]
|
75 |
+
obj_data = frame_data[obj_id]
|
76 |
+
obj_bbox = obj_data['bbox']
|
77 |
+
obj_valid = obj_data['valid']
|
78 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
79 |
+
obj_cat = obj_data['category_name']
|
80 |
+
|
81 |
+
if obj_cat == cat and obj_valid:
|
82 |
+
cat_cnt += 1
|
83 |
+
|
84 |
+
if color_mask == False:
|
85 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
86 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
87 |
+
for i, contour in enumerate(contours):
|
88 |
+
# 윤곽선 중심 계산
|
89 |
+
moments = cv2.moments(contour)
|
90 |
+
if moments["m00"] != 0: # 중심 계산 가능 여부 확인
|
91 |
+
cx = int(moments["m10"] / moments["m00"])
|
92 |
+
cy = int(moments["m01"] / moments["m00"])
|
93 |
+
else:
|
94 |
+
cx, cy = contour[0][0] # 중심 계산 불가시 대체 좌표 사용
|
95 |
+
|
96 |
+
# 텍스트 배경 (검은색 배경 만들기)
|
97 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
98 |
+
text = obj_id
|
99 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
100 |
+
text_w, text_h = text_size
|
101 |
+
|
102 |
+
# 텍스트 배경 그리기 (검은색 배경)
|
103 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
104 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
105 |
+
|
106 |
+
# 텍스트 그리기 (흰색 텍스트)
|
107 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
108 |
+
font, 1, (255, 255, 255), 2)
|
109 |
+
|
110 |
+
else:
|
111 |
+
alpha = 0.08
|
112 |
+
|
113 |
+
colored_obj_mask = np.zeros_like(frame)
|
114 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
115 |
+
frame[obj_mask == 1] = (
|
116 |
+
(1 - alpha) * frame[obj_mask == 1]
|
117 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
122 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
123 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
+
if len(contours) > 0:
|
128 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
129 |
+
M = cv2.moments(largest_contour)
|
130 |
+
if M["m00"] != 0:
|
131 |
+
center_x = int(M["m10"] / M["m00"])
|
132 |
+
center_y = int(M["m01"] / M["m00"])
|
133 |
+
else:
|
134 |
+
center_x, center_y = 0, 0
|
135 |
+
|
136 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
137 |
+
text = obj_id
|
138 |
+
|
139 |
+
font_scale = 0.9
|
140 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
141 |
+
text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
|
142 |
+
text_y = center_y
|
143 |
+
# text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
|
144 |
+
|
145 |
+
# 텍스트 배경 사각형 좌표 계산
|
146 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
|
147 |
+
# rect_end = (text_x + text_size[0] + 5, text_y + 5)
|
148 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
149 |
+
|
150 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
151 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
152 |
+
|
153 |
+
# plt.figure(figsize=(12, 8))
|
154 |
+
# plt.imshow(frame)
|
155 |
+
# plt.title(f"frame {frame_name}")
|
156 |
+
# plt.tight_layout()
|
157 |
+
# plt.axis('off')
|
158 |
+
# plt.show()
|
159 |
+
|
160 |
+
buffer = BytesIO()
|
161 |
+
frame = Image.fromarray(frame)
|
162 |
+
frame.save(buffer, format='jpeg')
|
163 |
+
buffer.seek(0)
|
164 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
165 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
166 |
+
|
167 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
168 |
+
buffer.truncate()
|
169 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
170 |
+
frame_for_contour.save(buffer, format='jpeg')
|
171 |
+
buffer.seek(0)
|
172 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
173 |
+
|
174 |
+
encoded_frames[cat] = cat_frames
|
175 |
+
contoured_frames[cat] = contour_frames
|
176 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
177 |
+
|
178 |
+
return encoded_frames, vid_cat_cnts, contoured_frames
|
179 |
+
|
180 |
+
|
181 |
+
def getCaption(idx, model='gpt-4o', color_mask=True):
|
182 |
+
vid_meta = metas[idx]
|
183 |
+
vid_data = train_dataset[idx]
|
184 |
+
vid_id = vid_meta['video']
|
185 |
+
print(f"vid id: {vid_id}\n")
|
186 |
+
|
187 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
188 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
189 |
+
all_captions = dict()
|
190 |
+
|
191 |
+
base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
|
192 |
+
marked = "mask with boundary" if color_mask else "boundary"
|
193 |
+
|
194 |
+
for cat_name in list(cat_names) :
|
195 |
+
|
196 |
+
is_movable = False
|
197 |
+
if cat_name in ytvos_category_valid_list :
|
198 |
+
is_movable = True
|
199 |
+
|
200 |
+
if not is_movable:
|
201 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
202 |
+
|
203 |
+
|
204 |
+
image_captions = {}
|
205 |
+
captioner = OpenAI()
|
206 |
+
cat_base64_frames = base64_frames[cat_name]
|
207 |
+
cont_base64_frames = contoured_frames[cat_name]
|
208 |
+
|
209 |
+
for i in range(len(cat_base64_frames)):
|
210 |
+
frame_name = frame_indx[i]
|
211 |
+
cont_base64_image = cont_base64_frames[i]
|
212 |
+
base64_image = cat_base64_frames[i]
|
213 |
+
should_filter = False
|
214 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
215 |
+
|
216 |
+
if frame_cat_cnts >= 2:
|
217 |
+
should_filter = True
|
218 |
+
else:
|
219 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
220 |
+
|
221 |
+
if is_movable and should_filter:
|
222 |
+
#1단계: 필터링
|
223 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
224 |
+
caption_filter_text = f"""
|
225 |
+
You are a visual assistant analyzing a single frame from a video.
|
226 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
227 |
+
|
228 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
229 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
|
230 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
231 |
+
|
232 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
233 |
+
|
234 |
+
- Respond with "YES" if:
|
235 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
236 |
+
(e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
|
237 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
238 |
+
3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
|
239 |
+
|
240 |
+
- Respond with "NONE" if:
|
241 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
242 |
+
2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
|
243 |
+
3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
|
244 |
+
|
245 |
+
Answer strictly with either "YES" or "NONE".
|
246 |
+
"""
|
247 |
+
|
248 |
+
response1 = captioner.chat.completions.create(
|
249 |
+
# model="chatgpt-4o-latest",
|
250 |
+
model=model,
|
251 |
+
messages=[
|
252 |
+
{
|
253 |
+
"role": "user",
|
254 |
+
"content": [
|
255 |
+
{
|
256 |
+
"type": "text",
|
257 |
+
"text": caption_filter_text,
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"type": "image_url",
|
261 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
262 |
+
}
|
263 |
+
],
|
264 |
+
}
|
265 |
+
],
|
266 |
+
)
|
267 |
+
response_content = response1.choices[0].message.content
|
268 |
+
should_caption = True if "yes" in response_content.lower() else False
|
269 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
270 |
+
|
271 |
+
else:
|
272 |
+
should_caption = False
|
273 |
+
|
274 |
+
#2단계: dense caption 만들기
|
275 |
+
dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
|
276 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
277 |
+
I want to use your expressions to create a action-centric referring expression dataset.
|
278 |
+
Therefore, your expressions for these {cat_name}s should describe unique action of each object.
|
279 |
+
|
280 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
281 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
282 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
283 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
284 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
285 |
+
6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
286 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
287 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
288 |
+
9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
289 |
+
10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
|
290 |
+
11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
|
291 |
+
12. Do not mention object IDs.
|
292 |
+
13. Use '{cat_name}' as the noun for the referring expressions.
|
293 |
+
|
294 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
295 |
+
Output referring expressions for each object id.
|
296 |
+
"""
|
297 |
+
|
298 |
+
dense_caption_prompt = f"""
|
299 |
+
You are a visual assistant analyzing a single frame of a video.
|
300 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
|
301 |
+
|
302 |
+
I want to use your expressions to create an **action-centric referring expression** dataset.
|
303 |
+
Please describe each {cat_name} using **clearly observable** and **specific** actions.
|
304 |
+
|
305 |
+
---
|
306 |
+
## Guidelines:
|
307 |
+
1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
|
308 |
+
2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
|
309 |
+
3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
|
310 |
+
4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
|
311 |
+
5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
|
312 |
+
6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
|
313 |
+
7. Base your description on these action definitions:
|
314 |
+
- Avoid using term 'minimal' or 'slightly'.
|
315 |
+
- General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
|
316 |
+
- details such as motion and intention, facial with object manipulation
|
317 |
+
- movements with objects or other entities when they are prominent and observable. expression should be specific.
|
318 |
+
(e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
|
319 |
+
---
|
320 |
+
|
321 |
+
## Output Format:
|
322 |
+
- For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
|
323 |
+
object id. using {cat_name} as subject noun, action-oriented description
|
324 |
+
(e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
|
325 |
+
- **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
|
326 |
+
|
327 |
+
### Example
|
328 |
+
If the frame has 2 labeled bears, your output should look like:
|
329 |
+
1. the bear reaching his right arm while leaning forward to capture the prey
|
330 |
+
2. a bear standing upright facing right, touching the bike aside
|
331 |
+
|
332 |
+
---
|
333 |
+
**Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
|
334 |
+
**Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
|
335 |
+
**Do not include markdown** in the output.
|
336 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
337 |
+
For each labeled {cat_name}, output referring expressions for each object id.
|
338 |
+
"""
|
339 |
+
MAX_RETRIES = 2
|
340 |
+
retry_count = 0
|
341 |
+
|
342 |
+
if should_caption:
|
343 |
+
while retry_count < MAX_RETRIES:
|
344 |
+
|
345 |
+
response2 = captioner.chat.completions.create(
|
346 |
+
model=model,
|
347 |
+
messages=[
|
348 |
+
{
|
349 |
+
"role": "user",
|
350 |
+
"content": [
|
351 |
+
{
|
352 |
+
"type": "text",
|
353 |
+
"text": dense_caption_prompt,
|
354 |
+
},
|
355 |
+
{
|
356 |
+
"type": "image_url",
|
357 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
358 |
+
},
|
359 |
+
],
|
360 |
+
}
|
361 |
+
],
|
362 |
+
)
|
363 |
+
|
364 |
+
# caption = response2.choices[0].message.content
|
365 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
366 |
+
|
367 |
+
caption = response2.choices[0].message.content.strip()
|
368 |
+
caption_lower = caption.lower().lstrip()
|
369 |
+
|
370 |
+
if caption_lower.startswith("1.") and not any(
|
371 |
+
phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
|
372 |
+
):
|
373 |
+
break
|
374 |
+
|
375 |
+
print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
|
376 |
+
retry_count += 1
|
377 |
+
time.sleep(2)
|
378 |
+
|
379 |
+
if retry_count == MAX_RETRIES:
|
380 |
+
caption = None
|
381 |
+
print("Max retries reached. Caption generation failed.")
|
382 |
+
|
383 |
+
else:
|
384 |
+
caption = None
|
385 |
+
|
386 |
+
image_captions[frame_name] = caption
|
387 |
+
all_captions[cat_name] = image_captions
|
388 |
+
|
389 |
+
# final : also prepare valid object ids
|
390 |
+
valid_obj_ids = dict()
|
391 |
+
|
392 |
+
for cat in cat_names:
|
393 |
+
if cat in ytvos_category_valid_list:
|
394 |
+
obj_id_cat = vid_meta['obj_id_cat']
|
395 |
+
valid_cat_ids = []
|
396 |
+
for obj_id in list(obj_id_cat.keys()):
|
397 |
+
if obj_id_cat[obj_id] == cat:
|
398 |
+
valid_cat_ids.append(obj_id)
|
399 |
+
valid_obj_ids[cat] = valid_cat_ids
|
400 |
+
|
401 |
+
return all_captions, valid_obj_ids
|
402 |
+
|
403 |
+
|
404 |
+
|
405 |
+
if __name__ == '__main__':
|
406 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
407 |
+
parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
|
408 |
+
parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
|
409 |
+
|
410 |
+
args = parser.parse_args()
|
411 |
+
|
412 |
+
#==================데이터 불러오기===================
|
413 |
+
# 전체 데이터셋
|
414 |
+
train_dataset = build_ytvos_ref(image_set = 'train', args = args)
|
415 |
+
|
416 |
+
# 전체 데이터셋 메타데이터
|
417 |
+
metas = train_dataset.metas
|
418 |
+
|
419 |
+
# 색상 후보 8개 (RGB 형식)
|
420 |
+
colors = [
|
421 |
+
(255, 0, 0), # Red
|
422 |
+
(0, 255, 0), # Green
|
423 |
+
(0, 0, 255), # Blue
|
424 |
+
(255, 255, 0), # Yellow
|
425 |
+
(255, 0, 255), # Magenta
|
426 |
+
(0, 255, 255), # Cyan
|
427 |
+
(128, 0, 128), # Purple
|
428 |
+
(255, 165, 0) # Orange
|
429 |
+
]
|
430 |
+
|
431 |
+
ytvos_category_valid_list = [
|
432 |
+
'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
433 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
434 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
435 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
436 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
437 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
438 |
+
]
|
439 |
+
|
440 |
+
#==================gpt 돌리기===================
|
441 |
+
os.environ['OPENAI_API_KEY'] = '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'
|
442 |
+
|
443 |
+
result_captions = {}
|
444 |
+
result_valid_obj_ids = {}
|
445 |
+
|
446 |
+
for i in range(370):
|
447 |
+
vid_id, all_captions, valid_obj_ids = getCaption(i, True)
|
448 |
+
|
449 |
+
if vid_id not in result_captions:
|
450 |
+
result_captions[vid_id] = all_captions
|
451 |
+
if vid_id not in result_valid_obj_ids:
|
452 |
+
result_valid_obj_ids[vid_id] = valid_obj_ids
|
453 |
+
|
454 |
+
print("Finished!", flush=True)
|
455 |
+
|
456 |
+
with open(args.save_caption_path, "w") as file:
|
457 |
+
json.dump(result_captions, file, indent=4)
|
458 |
+
|
459 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
460 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141847.py  ADDED  @@ -0,0 +1,460 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import time

from os import path as osp
from io import BytesIO

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import textwrap

import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64
import json


# Draw per-object markers (contour + numeric ID) on every sampled frame and return them base64-encoded.
def number_objects_and_encode(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {}  # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if color_mask == False:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        for contour in contours:
                            # compute the contour centroid
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0:  # centroid is defined only for non-zero area
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0]  # fall back to the first contour point

                            # text background (black box behind the ID)
                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            # draw the black background box
                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            # draw the ID text in white
                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)

                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0] // 1  # horizontal anchor for the text
                            text_y = center_y
                            # text_y = center_y + text_size[1] // 2  # vertical center of the text

                            # background rectangle coordinates for the text
                            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background box
                            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)  # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, vid_cat_cnts, contoured_frames


# Run the two-stage GPT pipeline (filter frame, then generate dense captions) for one video.
def getCaption(idx, model='gpt-4o', color_mask=True):
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx']  # e.g. [4, 7, 9, 16]
    cat_names = set(vid_meta['obj_id_cat'].values())  # e.g. {"person", "elephant", ...}
    all_captions = dict()

    base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
    # marked = "mask with boundary" if color_mask else "boundary"

    for cat_name in list(cat_names):

        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]
        cont_base64_frames = contoured_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]
            cont_base64_image = cont_base64_frames[i]
            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')

            if is_movable and should_filter:
                # Step 1: filtering
                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are {cat_name}s in the image performing all different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
                (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.

                - Respond with "NONE" if:
                1) The actions or pose are not clearly differentiable or too similar.
                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
                3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion

                Answer strictly with either "YES" or "NONE".
                """

                response1 = captioner.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = True if "yes" in response_content.lower() else False
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            # Step 2: build dense captions
            dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
            I want to use your expressions to create a action-centric referring expression dataset.
            Therefore, your expressions for these {cat_name}s should describe unique action of each object.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
            10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
            11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
            12. Do not mention object IDs.
            13. Use '{cat_name}' as the noun for the referring expressions.

            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            Output referring expressions for each object id.
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.

            I want to use your expressions to create an **action-centric referring expression** dataset.
            Please describe each {cat_name} using **clearly observable** and **specific** actions.

            ---
            ## Guidelines:
            1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
            2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
            3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
            4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
            5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
            6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
            7. Base your description on these action definitions:
            - Avoid using term 'minimal' or 'slightly'.
            - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
            - details such as motion and intention, facial with object manipulation
            - movements with objects or other entities when they are prominent and observable. expression should be specific.
            (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
            ---

            ## Output Format:
            - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow the following format :
            object id. using {cat_name} as subject noun, action-oriented description
            (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
            - **Only include the currently labeled category** in each line (e.g., if it's a person, do not suddenly label it as other object/animal).

            ### Example
            If the frame has 2 labeled bears, your output should look like:
            1. the bear reaching his right arm while leaning forward to capture the prey
            2. a bear standing upright facing right, touching the bike aside

            ---
            **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
            **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
            **Do not include markdown** in the output.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            For each labeled {cat_name}, output referring expressions for each object id.
            """
            MAX_RETRIES = 2
            retry_count = 0

            if should_caption:
                while retry_count < MAX_RETRIES:

                    response2 = captioner.chat.completions.create(
                        model=model,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {
                                        "type": "text",
                                        "text": dense_caption_prompt,
                                    },
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                    },
                                ],
                            }
                        ],
                    )

                    # caption = response2.choices[0].message.content
                    # print(f"{image_path} - {frame_name}: {caption}")

                    caption = response2.choices[0].message.content.strip()
                    caption_lower = caption.lower().lstrip()

                    if caption_lower.startswith("1.") and not any(
                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
                    ):
                        break

                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
                    retry_count += 1
                    time.sleep(2)

                if retry_count == MAX_RETRIES:
                    caption = None
                    print("Max retries reached. Caption generation failed.")

            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    # return the video id as well, since the caller unpacks three values
    return vid_id, all_captions, valid_obj_ids


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")

    args = parser.parse_args()

    # ================== load data ==================
    # full dataset
    train_dataset = build_ytvos_ref(image_set='train', args=args)

    # metadata for the full dataset
    metas = train_dataset.metas

    # 8 color candidates (RGB format)
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 0, 128),  # Purple
        (255, 165, 0)   # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    # ================== run GPT ==================
    os.environ['OPENAI_API_KEY'] = 'sk-proj-...'  # hardcoded key redacted; export OPENAI_API_KEY before running

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(370):
        vid_id, all_captions, valid_obj_ids = getCaption(i)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
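The captions saved above are raw numbered strings such as "1. the person is holding ski poles ...". As a minimal illustrative sketch (not part of the original script; the helper name and regex are assumptions), this is one way such output could later be split into per-object expressions:

import re

def parse_numbered_caption(caption):
    """Illustrative helper: split a numbered caption string into {object_id: expression}."""
    expressions = {}
    if not caption:
        return expressions
    # one expression per line, each line expected to start with "<number>."
    for line in caption.splitlines():
        match = re.match(r"\s*(\d+)\.\s*(.+)", line)
        if match:
            expressions[match.group(1)] = match.group(2).strip()
    return expressions

# example: maps "1" and "2" to their action descriptions
print(parse_numbered_caption(
    "1. the bear reaching his right arm while leaning forward\n"
    "2. a bear standing upright facing right"
))

Keyed this way, each expression can be matched later against the valid object ids that the script stores alongside the captions.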
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250206153011.py
ADDED
@@ -0,0 +1,644 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import time

from os import path as osp
from io import BytesIO
import random

from mbench.ytvos_ref import build as build_ytvos_ref
import argparse
import opts

import sys
from pathlib import Path
import os
from os import path as osp
import skimage
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure  # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import textwrap

import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI
import base64
import json


def number_objects_and_encode_old(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {}  # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if color_mask == False:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        for contour in contours:
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0:
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0]

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)

                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0] // 1
                            text_y = center_y

                            rect_start = (text_x - 5, text_y - text_size[1] - 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)  # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, contoured_frames, vid_cat_cnts


def number_objects_and_encode(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {}  # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                    cv2.drawContours(frame, contours, -1, colors[j], 3)
                    cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                    if len(contours) > 0:
                        largest_contour = max(contours, key=cv2.contourArea)
                        M = cv2.moments(largest_contour)
                        if M["m00"] != 0:
                            center_x = int(M["m10"] / M["m00"])
                            center_y = int(M["m01"] / M["m00"])
                        else:
                            center_x, center_y = 0, 0

                        font = cv2.FONT_HERSHEY_SIMPLEX
                        text = obj_id
                        font_scale = 1.2
                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                        text_x = center_x - text_size[0] // 1
                        text_y = center_y

                        rect_start = (text_x - 5, text_y - text_size[1] - 5)
                        rect_end = (text_x + text_size[0] + 5, text_y + 3)

                        contour_thickness = 1
                        rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
                        rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)

                        cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                        cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)

                    if color_mask:
                        alpha = 0.08
                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)  # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, contoured_frames, vid_cat_cnts


def getCaption(idx, model='gpt-4o'):
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx']  # e.g. [4, 7, 9, 16]
    cat_names = set(vid_meta['obj_id_cat'].values())  # e.g. {"person", "elephant", ...}
    all_captions = dict()

    # color_mask = random.choice([True, False])
    color_mask = random.choices([False, True], weights=[60, 40])[0]

    base64_frames, _, vid_cat_cnts = number_objects_and_encode(idx, color_mask)
    # marked = "mask with boundary" if color_mask else "boundary"

    for cat_name in list(cat_names):

        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]
        # cont_base64_frames = contoured_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]
            # cont_base64_image = cont_base64_frames[i]
            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')

            if is_movable and should_filter:
                # Step 1: filtering
                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are {cat_name}s in the image performing all different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
                (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.

                - Respond with "NONE" if:
                1) The actions or pose are not clearly differentiable or too similar.
                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
                3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion

                Answer strictly with either "YES" or "NONE".
                """

                response1 = captioner.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = True if "yes" in response_content.lower() else False
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            # Step 2: build dense captions
            dense_caption_prompt_1 = f"""
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.

            Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
            - expressions like 'seems to be', 'appears to be' are BANNED!
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
            10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
            11. Do not mention object IDs.
            12. Use '{cat_name}' as the noun for the referring expressions.

            Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.

            - Your answer should contain details, and follow the following format:
            object id. action-oriented description
            (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
            2. a person bending over and touching his boots to tie the shoelace.)
            - for action-oriented description, use {cat_name} as subject noun

            **Only include the currently labeled category** in each line (e.g., if it's a person, do not suddenly label it as other object/animal).
            Please pay attention to the categories of these objects and don't change them.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
            Output referring expressions for each object id. Please start your answer:"""

            dense_caption_prompt_2 = f"""
            You are an advanced visual language model analyzing a video frame.
            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.

            Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
            Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.

            ---
            ## Key Guidelines:
            1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
            - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
            - Avoid: "moving slightly to the side" (**(X) Too vague**)

            2. **Do not describe appearance, color, or position**—focus purely on the action.
            - (X) "A large brown bear standing on the left"
            - (O) "The bear is lifting its front paws and swiping forward."

            3. **Use dynamic, action-specific verbs** rather than passive descriptions.
            - (O) "The giraffe is tilting its head and sniffing the ground."
            - (X) "The giraffe is near a tree and looking around."

            4. **Avoid assumptions, emotions, or speculative phrasing.**
            - (X) "The person seems excited" / "The person might be preparing to jump."
            - (O) "The person is pushing its front legs against the rock and leaping forward."

            5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
            - expressions like 'seems to be', 'appears to be' are BANNED!
            6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.

            7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
            - **Each object should have a unique, descriptive action.**
            - (X) "Two dogs are running."
            - (O) "1. One dog is chasing another, its legs stretched mid-air.
                   2. The other dog is looking back while speeding up."

            ---
            ## Output Format:
            - Each labeled **{cat_name}** should have exactly **one line of description**.
            - Format: `ID. {cat_name} + action-based description`
            - (O) Example:
            ```
            1. The person is leaning forward while opening a bag with both hands.
            2. The person is holding onto a rope and pulling themselves up.
            ```
            - **Ensure that each object is described individually.**
            - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

            ---
            ## Additional Instructions:
            - **Do NOT** use expressions like "it appears that..." or "it seems like...".
            - **Do NOT** mention object IDs in the description (only use the provided format).
            - **DO NOT** include markdown formatting (no bullet points, no asterisks).
            - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.

            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.

            I am building an **action-centric referring expression** dataset.
            Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.

            ---
            ## Guidelines:
            1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
            2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
            3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
            4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
            5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
            6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
            7. Base your descriptions on these principles:
            - **Avoid words like 'minimal' or 'slightly'.**
            - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
            - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
            - **Specify actions with other objects or entities** only when they are clear and observable.
            - (O) "pushing another person"
            - (X) "interacting with another object"

            ---
            ## Output Format:
            - Each labeled **{cat_name}** must have **exactly one line**.
            - Format: `ID. {cat_name} + action-based description`
            - (O) Example:
            ```
            1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
            2. The person is pulling a baby carriage while smiling.
            ```
            - **Ensure each object is described individually.**
            - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

            ---
            ## Example:
            If the frame has two labeled **bears**, your output should be:
            ```
            1. The bear is reaching out its right paw while leaning forward to catch prey.
            2. A bear is standing upright, facing right, and touching the bike beside it.
            ```

            ---
            ## Additional Instructions:
            - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
            - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
            - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
            - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.

            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""

            MAX_RETRIES = 3
            retry_count = 0

            if should_caption:
                while retry_count < MAX_RETRIES:
                    selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])

                    response2 = captioner.chat.completions.create(
                        model=model,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {
                                        "type": "text",
                                        "text": selected_prompt,
                                    },
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                    },
                                ],
                            }
                        ],
                    )

                    # caption = response2.choices[0].message.content
                    # print(f"{image_path} - {frame_name}: {caption}")

                    caption = response2.choices[0].message.content.strip()
                    caption_lower = caption.lower().lstrip()

                    if caption_lower.startswith("1.") and not any(
                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
                    ):
                        break

                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
                    retry_count += 1
                    time.sleep(2)

                if retry_count == MAX_RETRIES:
                    caption = None
                    print("Max retries reached. Caption generation failed.")

            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final : also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")

    args = parser.parse_args()

    # ================== load data ==================
    # full dataset
    train_dataset = build_ytvos_ref(image_set='train', args=args)

    # metadata for the full dataset
    metas = train_dataset.metas

    # 8 color candidates (RGB format)
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 0, 128),  # Purple
        (255, 165, 0)   # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    # ================== run GPT ==================
    os.environ['OPENAI_API_KEY'] = 'sk-proj-...'  # hardcoded key redacted; export OPENAI_API_KEY before running

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(370):
        vid_id, all_captions, valid_obj_ids = getCaption(i)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
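Both number_objects_and_encode variants above anchor the numeric ID at the centroid of the largest mask contour (via cv2.moments) and draw a filled black box behind white text. The following self-contained sketch reproduces that placement step on a synthetic mask; the mask shape, canvas size, and output filename are illustrative assumptions, not part of the original scripts:

import cv2
import numpy as np

# synthetic binary mask: a filled rectangle stands in for one object's segmentation
mask = np.zeros((240, 320), dtype=np.uint8)
cv2.rectangle(mask, (100, 80), (220, 180), 1, -1)

canvas = np.zeros((240, 320, 3), dtype=np.uint8)
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cv2.drawContours(canvas, contours, -1, (0, 255, 0), 3)

# centroid of the largest contour, exactly as in the scripts
largest = max(contours, key=cv2.contourArea)
M = cv2.moments(largest)
cx, cy = (int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"])) if M["m00"] != 0 else (0, 0)

# black background box plus white ID text at the centroid
text = "1"
size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1.2, 2)[0]
cv2.rectangle(canvas, (cx - size[0] - 5, cy - size[1] - 5), (cx + 5, cy + 3), (0, 0, 0), -1)
cv2.putText(canvas, text, (cx - size[0], cy), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
cv2.imwrite("numbered_mask_demo.jpg", canvas)

Running this on a real mask is a quick way to sanity-check that the ID box stays inside the object before spending API calls on the captioning step.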
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171300.py
ADDED
@@ -0,0 +1,644 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
import time
|
5 |
+
|
6 |
+
from os import path as osp
|
7 |
+
from io import BytesIO
|
8 |
+
import random
|
9 |
+
|
10 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
11 |
+
import argparse
|
12 |
+
import opts
|
13 |
+
|
14 |
+
import sys
|
15 |
+
from pathlib import Path
|
16 |
+
import os
|
17 |
+
from os import path as osp
|
18 |
+
import skimage
|
19 |
+
from io import BytesIO
|
20 |
+
|
21 |
+
import numpy as np
|
22 |
+
import pandas as pd
|
23 |
+
import regex as re
|
24 |
+
import json
|
25 |
+
|
26 |
+
import cv2
|
27 |
+
from PIL import Image, ImageDraw
|
28 |
+
import torch
|
29 |
+
from torchvision.transforms import functional as F
|
30 |
+
|
31 |
+
from skimage import measure # (pip install scikit-image)
|
32 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
33 |
+
|
34 |
+
import matplotlib.pyplot as plt
|
35 |
+
import matplotlib.patches as patches
|
36 |
+
from matplotlib.collections import PatchCollection
|
37 |
+
from matplotlib.patches import Rectangle
|
38 |
+
import textwrap
|
39 |
+
|
40 |
+
|
41 |
+
import ipywidgets as widgets
|
42 |
+
from IPython.display import display, clear_output
|
43 |
+
|
44 |
+
from openai import OpenAI
|
45 |
+
import base64
|
46 |
+
import json
|
47 |
+
|
48 |
+
def number_objects_and_encode_old(idx, color_mask=False):
|
49 |
+
encoded_frames = {}
|
50 |
+
    contoured_frames = {}  # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if not color_mask:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        # Label every contour at its centroid (the loop variable is kept
                        # distinct from the frame index `i` above to avoid shadowing).
                        for contour in contours:
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0:
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0]

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)

                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0] // 1
                            text_y = center_y

                            rect_start = (text_x - 5, text_y - text_size[1] - 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            # Draw the label at the same scale used to size its background box.
                            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)  # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, contoured_frames, vid_cat_cnts


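# A minimal decoding sketch (assumes the base64 JPEG strings produced by the
# encoders in this file; relies only on imports already present here):
def _decode_b64_frame(b64_jpeg):
    """Decode a base64-encoded JPEG string back into a PIL image for inspection."""
    return Image.open(BytesIO(base64.b64decode(b64_jpeg)))

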
def number_objects_and_encode(idx, color_mask=False):
    encoded_frames = {}
    contoured_frames = {}  # New dictionary for original images
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                    cv2.drawContours(frame, contours, -1, colors[j], 3)
                    cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                    if len(contours) > 0:
                        largest_contour = max(contours, key=cv2.contourArea)
                        M = cv2.moments(largest_contour)
                        if M["m00"] != 0:
                            center_x = int(M["m10"] / M["m00"])
                            center_y = int(M["m01"] / M["m00"])
                        else:
                            center_x, center_y = 0, 0

                        font = cv2.FONT_HERSHEY_SIMPLEX
                        text = obj_id
                        font_scale = 1.2
                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                        text_x = center_x - text_size[0] // 1
                        text_y = center_y

                        rect_start = (text_x - 5, text_y - text_size[1] - 5)
                        rect_end = (text_x + text_size[0] + 5, text_y + 3)

                        contour_thickness = 1
                        rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
                        rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)

                        cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                        # Draw the label at the same scale used to size its background box.
                        cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

                    if color_mask:
                        alpha = 0.08
                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

            # plt.figure(figsize=(12, 8))
            # plt.imshow(frame)
            # plt.title(f"frame {frame_name}")
            # plt.tight_layout()
            # plt.axis('off')
            # plt.show()

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)  # Reuse buffer instead of creating a new one
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, contoured_frames, vid_cat_cnts


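# A minimal standalone sketch of the centroid logic used above (assumes a binary
# uint8 mask like `obj_mask`; illustrative only, not called by the pipeline):
def _mask_centroid(mask):
    """Return (cx, cy) of the largest external contour of `mask`, or (0, 0) if none."""
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return 0, 0
    m = cv2.moments(max(contours, key=cv2.contourArea))
    if m["m00"] == 0:
        return 0, 0
    return int(m["m10"] / m["m00"]), int(m["m01"] / m["m00"])

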
def getCaption(idx, model='gpt-4o'):
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx']  # e.g. [4, 7, 9, 16]
    cat_names = set(vid_meta['obj_id_cat'].values())  # e.g. {"person", "elephant", ...}
    all_captions = dict()

    # color_mask = random.choice([True, False])
    color_mask = random.choices([False, True], weights=[60, 40])[0]

    base64_frames, _, vid_cat_cnts = number_objects_and_encode(idx, color_mask)
    # marked = "mask with boundary" if color_mask else "boundary"

    for cat_name in list(cat_names):

        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]
        # cont_base64_frames = contoured_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]
            # cont_base64_image = cont_base64_frames[i]
            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: There is a single object or no object.", end='\n\n')

            if is_movable and should_filter:
                # Step 1: filter out frames where the objects are not distinguishable by action
                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are {cat_name}s in the image performing all different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
                (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
                2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.

                - Respond with "NONE" if:
                1) The actions or poses are not clearly differentiable or are too similar.
                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
                3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.

                Answer strictly with either "YES" or "NONE".
                """

                response1 = captioner.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = "yes" in response_content.lower()
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            # Step 2: build the dense captions
            dense_caption_prompt_1 = f"""
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is: {cat_name}.

            Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
            - expressions like 'seems to be', 'appears to be' are BANNED!
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
            10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
            11. Do not mention object IDs.
            12. Use '{cat_name}' as the noun for the referring expressions.

            Note that I want to use your description to create a grounding dataset; therefore, your descriptions for different objects should be unique, i.e., if the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.

            - Your answer should contain details, and follow the following format:
            object id. action-oriented description
            (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
            2. a person bending over and touching his boots to tie the shoelace.)
            - for action-oriented description, use {cat_name} as subject noun

            **Only include the currently labeled category** in each line (e.g., if it's a person, do not suddenly label it as another object/animal).
            Please pay attention to the categories of these objects and don't change them.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
            Output referring expressions for each object id. Please start your answer:"""


            dense_caption_prompt_2 = f"""
            You are an advanced visual language model analyzing a video frame.
            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.

            Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
            Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.

            ---
            ## Key Guidelines:
            1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
            - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
            - Avoid: "moving slightly to the side" (**(X) Too vague**)

            2. **Do not describe appearance, color, or position**—focus purely on the action.
            - (X) "A large brown bear standing on the left"
            - (O) "The bear is lifting its front paws and swiping forward."

            3. **Use dynamic, action-specific verbs** rather than passive descriptions.
            - (O) "The giraffe is tilting its head and sniffing the ground."
            - (X) "The giraffe is near a tree and looking around."

            4. **Avoid assumptions, emotions, or speculative phrasing.**
            - (X) "The person seems excited" / "The person might be preparing to jump."
            - (O) "The person is pushing its front legs against the rock and leaping forward."

            5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
            - expressions like 'seems to be', 'appears to be' are BANNED!
            6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.

            7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
            - **Each object should have a unique, descriptive action.**
            - (X) "Two dogs are running."
            - (O) "1. One dog is chasing another, its legs stretched mid-air.
                   2. The other dog is looking back while speeding up."

            ---
            ## Output Format:
            - Each labeled **{cat_name}** should have exactly **one line of description**.
            - Format: `ID. {cat_name} + action-based description`
            - (O) Example:
            ```
            1. The person is leaning forward while opening a bag with both hands.
            2. The person is holding onto a rope and pulling themselves up.
            ```
            - **Ensure that each object is described individually.**
            - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

            ---
            ## Additional Instructions:
            - **Do NOT** use expressions like "it appears that..." or "it seems like...".
            - **Do NOT** mention object IDs in the description (only use the provided format).
            - **DO NOT** include markdown formatting (no bullet points, no asterisks).
            - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.

            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
            """


            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.

            I am building an **action-centric referring expression** dataset.
            Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.

            ---
            ## Guidelines:
            1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
            2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
            3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
            4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
            5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
            6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
            7. Base your descriptions on these principles:
            - **Avoid words like 'minimal' or 'slightly'.**
            - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
            - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
            - **Specify actions with other objects or entities** only when they are clear and observable.
            - (O) "pushing another person"
            - (X) "interacting with another object"

            ---
            ## Output Format:
            - Each labeled **{cat_name}** must have **exactly one line**.
            - Format: `ID. {cat_name} + action-based description`
            - (O) Example:
            ```
            1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
            2. The person is pulling a baby carriage while smiling.
            ```
            - **Ensure each object is described individually.**
            - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

            ---
            ## Example:
            If the frame has two labeled **bears**, your output should be:
            ```
            1. The bear is reaching out its right paw while leaning forward to catch prey.
            2. A bear is standing upright, facing right, and touching the bike beside it.
            ```

            ---
            ## Additional Instructions:
            - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
            - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
            - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
            - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.

            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""

            MAX_RETRIES = 3
            retry_count = 0

            if should_caption:
                while retry_count < MAX_RETRIES:
                    selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])

                    response2 = captioner.chat.completions.create(
                        model=model,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {
                                        "type": "text",
                                        "text": selected_prompt,
                                    },
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                    },
                                ],
                            }
                        ],
                    )

                    # caption = response2.choices[0].message.content
                    # print(f"{image_path} - {frame_name}: {caption}")

                    caption = response2.choices[0].message.content.strip()
                    caption_lower = caption.lower().lstrip()

                    if caption_lower.startswith("1.") and not any(
                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
                    ):
                        break

                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
                    retry_count += 1
                    time.sleep(2)

                if retry_count == MAX_RETRIES:
                    caption = None
                    print("Max retries reached. Caption generation failed.")

            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    # final: also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids


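# A minimal parsing sketch (assumes captions follow the numbered "1. ..." format
# requested by the prompts above; illustrative helper, not used by getCaption):
def _parse_numbered_captions(caption):
    """Split a numbered caption string into {object_id: description}."""
    if caption is None:
        return {}
    parsed = {}
    for line in caption.splitlines():
        match = re.match(r"\s*(\d+)\.\s*(.+)", line)
        if match:
            parsed[match.group(1)] = match.group(2).strip()
    return parsed

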
if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")

    args = parser.parse_args()

    # ================== Load the data ==================
    # Full training dataset
    train_dataset = build_ytvos_ref(image_set='train', args=args)

    # Metadata for the full dataset
    metas = train_dataset.metas

    # 8 candidate colors (RGB)
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 0, 128),  # Purple
        (255, 165, 0)   # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    # ================== Run GPT ==================
    os.environ['OPENAI_API_KEY'] = 'sk-proj-...'  # hardcoded key redacted; supply your own key via the environment

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(len(metas)):
        vid_id, all_captions, valid_obj_ids = getCaption(i)

        if vid_id not in result_captions:
            result_captions[vid_id] = all_captions
        if vid_id not in result_valid_obj_ids:
            result_valid_obj_ids[vid_id] = valid_obj_ids

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
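
# A minimal read-back sketch (assumes the default --save_caption_path and
# --save_valid_obj_ids_path values defined above; illustrative only):
def _load_results(caption_path="mbench/numbered_captions_gpt-4o_randcap.json",
                  valid_ids_path="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json"):
    """Load the caption and valid-object-id JSON files written by the main loop."""
    with open(caption_path) as f:
        captions = json.load(f)
    with open(valid_ids_path) as f:
        valid_ids = json.load(f)
    return captions, valid_ids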
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207171416.py
ADDED
@@ -0,0 +1,644 @@
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173350.py
ADDED
@@ -0,0 +1,677 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
import time
|
5 |
+
|
6 |
+
from os import path as osp
|
7 |
+
from io import BytesIO
|
8 |
+
import random
|
9 |
+
|
10 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
11 |
+
import argparse
|
12 |
+
import opts
|
13 |
+
|
14 |
+
import sys
|
15 |
+
from pathlib import Path
|
16 |
+
import os
|
17 |
+
from os import path as osp
|
18 |
+
import skimage
|
19 |
+
from io import BytesIO
|
20 |
+
|
21 |
+
import numpy as np
|
22 |
+
import pandas as pd
|
23 |
+
import regex as re
|
24 |
+
import json
|
25 |
+
|
26 |
+
import cv2
|
27 |
+
from PIL import Image, ImageDraw
|
28 |
+
import torch
|
29 |
+
from torchvision.transforms import functional as F
|
30 |
+
|
31 |
+
from skimage import measure # (pip install scikit-image)
|
32 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
33 |
+
|
34 |
+
import matplotlib.pyplot as plt
|
35 |
+
import matplotlib.patches as patches
|
36 |
+
from matplotlib.collections import PatchCollection
|
37 |
+
from matplotlib.patches import Rectangle
|
38 |
+
import textwrap
|
39 |
+
|
40 |
+
|
41 |
+
import ipywidgets as widgets
|
42 |
+
from IPython.display import display, clear_output
|
43 |
+
|
44 |
+
from openai import OpenAI
|
45 |
+
import base64
|
46 |
+
import json
|
47 |
+
import requests
|
48 |
+
from openai import APIConnectionError, OpenAIError  # openai>=1.0 exposes these at the top level; openai.error no longer exists
|
49 |
+
|
50 |
+
def number_objects_and_encode_old(idx, color_mask=False):
|
51 |
+
encoded_frames = {}
|
52 |
+
contoured_frames = {} # New dictionary for original images
|
53 |
+
vid_cat_cnts = {}
|
54 |
+
|
55 |
+
vid_meta = metas[idx]
|
56 |
+
vid_data = train_dataset[idx]
|
57 |
+
vid_id = vid_meta['video']
|
58 |
+
frame_indx = vid_meta['sample_indx']
|
59 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
60 |
+
imgs = vid_data[0]
|
61 |
+
|
62 |
+
for cat in cat_names:
|
63 |
+
cat_frames = []
|
64 |
+
contour_frames = []
|
65 |
+
frame_cat_cnts = {}
|
66 |
+
|
67 |
+
for i in range(imgs.size(0)):
|
68 |
+
frame_name = frame_indx[i]
|
69 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
70 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
71 |
+
|
72 |
+
frame_data = vid_data[2][frame_name]
|
73 |
+
obj_ids = list(frame_data.keys())
|
74 |
+
|
75 |
+
cat_cnt = 0
|
76 |
+
|
77 |
+
for j in range(len(obj_ids)):
|
78 |
+
obj_id = obj_ids[j]
|
79 |
+
obj_data = frame_data[obj_id]
|
80 |
+
obj_bbox = obj_data['bbox']
|
81 |
+
obj_valid = obj_data['valid']
|
82 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
83 |
+
obj_cat = obj_data['category_name']
|
84 |
+
|
85 |
+
if obj_cat == cat and obj_valid:
|
86 |
+
cat_cnt += 1
|
87 |
+
|
88 |
+
if color_mask == False:
|
89 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
90 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
91 |
+
for i, contour in enumerate(contours):
|
92 |
+
moments = cv2.moments(contour)
|
93 |
+
if moments["m00"] != 0:
|
94 |
+
cx = int(moments["m10"] / moments["m00"])
|
95 |
+
cy = int(moments["m01"] / moments["m00"])
|
96 |
+
else:
|
97 |
+
cx, cy = contour[0][0]
|
98 |
+
|
99 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
100 |
+
text = obj_id
|
101 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
102 |
+
text_w, text_h = text_size
|
103 |
+
|
104 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
105 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
106 |
+
|
107 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
108 |
+
font, 1, (255, 255, 255), 2)
|
109 |
+
|
110 |
+
else:
|
111 |
+
alpha = 0.08
|
112 |
+
|
113 |
+
colored_obj_mask = np.zeros_like(frame)
|
114 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
115 |
+
frame[obj_mask == 1] = (
|
116 |
+
(1 - alpha) * frame[obj_mask == 1]
|
117 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
122 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
123 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
124 |
+
|
125 |
+
if len(contours) > 0:
|
126 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
127 |
+
M = cv2.moments(largest_contour)
|
128 |
+
if M["m00"] != 0:
|
129 |
+
center_x = int(M["m10"] / M["m00"])
|
130 |
+
center_y = int(M["m01"] / M["m00"])
|
131 |
+
else:
|
132 |
+
center_x, center_y = 0, 0
|
133 |
+
|
134 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
135 |
+
text = obj_id
|
136 |
+
|
137 |
+
font_scale = 0.9
|
138 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
139 |
+
text_x = center_x - text_size[0] // 1
|
140 |
+
text_y = center_y
|
141 |
+
|
142 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5)
|
143 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
144 |
+
|
145 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
146 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
147 |
+
|
148 |
+
# plt.figure(figsize=(12, 8))
|
149 |
+
# plt.imshow(frame)
|
150 |
+
# plt.title(f"frame {frame_name}")
|
151 |
+
# plt.tight_layout()
|
152 |
+
# plt.axis('off')
|
153 |
+
# plt.show()
|
154 |
+
|
155 |
+
buffer = BytesIO()
|
156 |
+
frame = Image.fromarray(frame)
|
157 |
+
frame.save(buffer, format='jpeg')
|
158 |
+
buffer.seek(0)
|
159 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
160 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
161 |
+
|
162 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
163 |
+
buffer.truncate()
|
164 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
165 |
+
frame_for_contour.save(buffer, format='jpeg')
|
166 |
+
buffer.seek(0)
|
167 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
168 |
+
|
169 |
+
encoded_frames[cat] = cat_frames
|
170 |
+
contoured_frames[cat] = contour_frames
|
171 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
172 |
+
|
173 |
+
return encoded_frames, contoured_frames, vid_cat_cnts
|
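Both marker functions in this file finish each frame by re-encoding the annotated array as a base64 JPEG so it can be attached to the chat request as a data URL. The sketch below isolates just that step, assuming an HxWx3 uint8 RGB array like the `frame` built above; `encode_frame` and the dummy input are illustrative names, not part of the repository.

```
import base64
from io import BytesIO

import numpy as np
from PIL import Image

def encode_frame(frame: np.ndarray) -> str:
    """Hypothetical helper mirroring the serialization above: RGB uint8 array -> base64 JPEG string."""
    buffer = BytesIO()
    Image.fromarray(frame).save(buffer, format="jpeg")
    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode("utf-8")

# Usage: the string is embedded in an image_url content part of the chat request.
dummy = np.zeros((64, 64, 3), dtype=np.uint8)          # stand-in for an annotated frame
image_url = f"data:image/jpeg;base64,{encode_frame(dummy)}"
```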
174 |
+
|
175 |
+
|
176 |
+
def number_objects_and_encode(idx, color_mask=False):
|
177 |
+
encoded_frames = {}
|
178 |
+
contoured_frames = {} # New dictionary for original images
|
179 |
+
vid_cat_cnts = {}
|
180 |
+
|
181 |
+
vid_meta = metas[idx]
|
182 |
+
vid_data = train_dataset[idx]
|
183 |
+
vid_id = vid_meta['video']
|
184 |
+
frame_indx = vid_meta['sample_indx']
|
185 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
186 |
+
imgs = vid_data[0]
|
187 |
+
|
188 |
+
for cat in cat_names:
|
189 |
+
cat_frames = []
|
190 |
+
contour_frames = []
|
191 |
+
frame_cat_cnts = {}
|
192 |
+
|
193 |
+
for i in range(imgs.size(0)):
|
194 |
+
frame_name = frame_indx[i]
|
195 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
196 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
197 |
+
|
198 |
+
frame_data = vid_data[2][frame_name]
|
199 |
+
obj_ids = list(frame_data.keys())
|
200 |
+
|
201 |
+
cat_cnt = 0
|
202 |
+
|
203 |
+
for j in range(len(obj_ids)):
|
204 |
+
obj_id = obj_ids[j]
|
205 |
+
obj_data = frame_data[obj_id]
|
206 |
+
obj_bbox = obj_data['bbox']
|
207 |
+
obj_valid = obj_data['valid']
|
208 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
209 |
+
obj_cat = obj_data['category_name']
|
210 |
+
|
211 |
+
if obj_cat == cat and obj_valid:
|
212 |
+
cat_cnt += 1
|
213 |
+
|
214 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
215 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
216 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
217 |
+
|
218 |
+
if len(contours) > 0:
|
219 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
220 |
+
M = cv2.moments(largest_contour)
|
221 |
+
if M["m00"] != 0:
|
222 |
+
center_x = int(M["m10"] / M["m00"])
|
223 |
+
center_y = int(M["m01"] / M["m00"])
|
224 |
+
else:
|
225 |
+
center_x, center_y = 0, 0
|
226 |
+
|
227 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
228 |
+
text = obj_id
|
229 |
+
font_scale = 1.2
|
230 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
231 |
+
text_x = center_x - text_size[0] // 1
|
232 |
+
text_y = center_y
|
233 |
+
|
234 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5)
|
235 |
+
rect_end = (text_x + text_size[0] + 5, text_y + 3)
|
236 |
+
|
237 |
+
contour_thickness = 1
|
238 |
+
rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
|
239 |
+
rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
|
240 |
+
|
241 |
+
cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
|
242 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
243 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
244 |
+
|
245 |
+
|
246 |
+
if color_mask:
|
247 |
+
alpha = 0.08
|
248 |
+
colored_obj_mask = np.zeros_like(frame)
|
249 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
250 |
+
frame[obj_mask == 1] = (
|
251 |
+
(1 - alpha) * frame[obj_mask == 1]
|
252 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
253 |
+
)
|
254 |
+
|
255 |
+
# plt.figure(figsize=(12, 8))
|
256 |
+
# plt.imshow(frame)
|
257 |
+
# plt.title(f"frame {frame_name}")
|
258 |
+
# plt.tight_layout()
|
259 |
+
# plt.axis('off')
|
260 |
+
# plt.show()
|
261 |
+
|
262 |
+
buffer = BytesIO()
|
263 |
+
frame = Image.fromarray(frame)
|
264 |
+
frame.save(buffer, format='jpeg')
|
265 |
+
buffer.seek(0)
|
266 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
267 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
268 |
+
|
269 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
270 |
+
buffer.truncate()
|
271 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
272 |
+
frame_for_contour.save(buffer, format='jpeg')
|
273 |
+
buffer.seek(0)
|
274 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
275 |
+
|
276 |
+
encoded_frames[cat] = cat_frames
|
277 |
+
contoured_frames[cat] = contour_frames
|
278 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
279 |
+
|
280 |
+
return encoded_frames, contoured_frames, vid_cat_cnts
|
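The `color_mask` branch above blends a translucent color over the object's pixels and then re-draws its contour. A minimal, self-contained sketch of that overlay is given below; `overlay_mask` is a hypothetical helper, and the toy frame and mask exist only to make the snippet runnable.

```
import cv2
import numpy as np

def overlay_mask(frame: np.ndarray, mask: np.ndarray, color=(255, 0, 0), alpha=0.08):
    """Hypothetical helper: blend a translucent color over mask pixels and outline the region,
    mirroring the color_mask branch above."""
    colored = np.zeros_like(frame)
    colored[mask == 1] = color
    blended = frame.copy()
    blended[mask == 1] = ((1 - alpha) * frame[mask == 1] + alpha * colored[mask == 1]).astype(frame.dtype)

    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(blended, contours, -1, color, 3)
    return blended

# Toy usage: a 100x100 gray frame with a square object mask.
frame = np.full((100, 100, 3), 128, dtype=np.uint8)
mask = np.zeros((100, 100), dtype=np.uint8)
mask[30:70, 30:70] = 1
annotated = overlay_mask(frame, mask)
```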
281 |
+
|
282 |
+
|
283 |
+
|
284 |
+
def getCaption(idx, model='gpt-4o'):
|
285 |
+
vid_meta = metas[idx]
|
286 |
+
vid_data = train_dataset[idx]
|
287 |
+
vid_id = vid_meta['video']
|
288 |
+
print(f"vid id: {vid_id}\n")
|
289 |
+
|
290 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
291 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
292 |
+
all_captions = dict()
|
293 |
+
|
294 |
+
# color_mask = random.choice([True, False])
|
295 |
+
color_mask = random.choices([False, True], weights=[60, 40])[0]
|
296 |
+
|
297 |
+
base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
|
298 |
+
#marked = "mask with boundary" if color_mask else "boundary"
|
299 |
+
|
300 |
+
for cat_name in list(cat_names) :
|
301 |
+
|
302 |
+
is_movable = False
|
303 |
+
if cat_name in ytvos_category_valid_list :
|
304 |
+
is_movable = True
|
305 |
+
|
306 |
+
if not is_movable:
|
307 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
308 |
+
|
309 |
+
|
310 |
+
image_captions = {}
|
311 |
+
captioner = OpenAI()
|
312 |
+
cat_base64_frames = base64_frames[cat_name]
|
313 |
+
# cont_base64_frames = contoured_frames[cat_name]
|
314 |
+
|
315 |
+
for i in range(len(cat_base64_frames)):
|
316 |
+
frame_name = frame_indx[i]
|
317 |
+
# cont_base64_image = cont_base64_frames[i]
|
318 |
+
base64_image = cat_base64_frames[i]
|
319 |
+
should_filter = False
|
320 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
321 |
+
|
322 |
+
if frame_cat_cnts >= 2:
|
323 |
+
should_filter = True
|
324 |
+
else:
|
325 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
326 |
+
|
327 |
+
|
328 |
+
if is_movable and should_filter:
|
329 |
+
# Step 1: filtering
|
330 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
331 |
+
caption_filter_text = f"""
|
332 |
+
You are a visual assistant analyzing a single frame from a video.
|
333 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
334 |
+
|
335 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
336 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
|
337 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
338 |
+
|
339 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
340 |
+
|
341 |
+
- Respond with "YES" if:
|
342 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
343 |
+
(e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
|
344 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
345 |
+
3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
|
346 |
+
|
347 |
+
- Respond with "NONE" if:
|
348 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
349 |
+
2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
|
350 |
+
3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
|
351 |
+
|
352 |
+
Answer strictly with either "YES" or "NONE".
|
353 |
+
"""
|
354 |
+
|
355 |
+
response1 = captioner.chat.completions.create(
|
356 |
+
model=model,
|
357 |
+
messages=[
|
358 |
+
{
|
359 |
+
"role": "user",
|
360 |
+
"content": [
|
361 |
+
{
|
362 |
+
"type": "text",
|
363 |
+
"text": caption_filter_text,
|
364 |
+
},
|
365 |
+
{
|
366 |
+
"type": "image_url",
|
367 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
368 |
+
}
|
369 |
+
],
|
370 |
+
}
|
371 |
+
],
|
372 |
+
)
|
373 |
+
response_content = response1.choices[0].message.content
|
374 |
+
should_caption = True if "yes" in response_content.lower() else False
|
375 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
376 |
+
|
377 |
+
else:
|
378 |
+
should_caption = False
|
379 |
+
|
380 |
+
# Step 2: build dense captions
|
381 |
+
dense_caption_prompt_1 = f"""
|
382 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
|
383 |
+
|
384 |
+
Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
|
385 |
+
|
386 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
387 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
388 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
389 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
390 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
391 |
+
6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
392 |
+
- expressions like 'seems to be', 'appears to be' are BANNED!
|
393 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
394 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
395 |
+
9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
|
396 |
+
10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
|
397 |
+
11. Do not mention object IDs.
|
398 |
+
12. Use '{cat_name}' as the noun for the referring expressions.
|
399 |
+
|
400 |
+
Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
401 |
+
|
402 |
+
- Your answer should contain details, and follow the following format:
|
403 |
+
object id. action-oriented description
|
404 |
+
(e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
|
405 |
+
2. a person bending over and touching his boots to tie the shoelace.)
|
406 |
+
- for action-oriented description, use {cat_name} as subject noun
|
407 |
+
|
408 |
+
**Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
|
409 |
+
Please pay attention to the categories of these objects and don’t change them.
|
410 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
411 |
+
Output referring expressions for each object id. Please start your answer:"""
|
412 |
+
|
413 |
+
|
414 |
+
dense_caption_prompt_2 = f"""
|
415 |
+
You are an advanced visual language model analyzing a video frame.
|
416 |
+
In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
|
417 |
+
|
418 |
+
Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
|
419 |
+
Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
|
420 |
+
|
421 |
+
---
|
422 |
+
## Key Guidelines:
|
423 |
+
1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
|
424 |
+
- Example: "grabbing a branch and pulling it down" (**(O) Specific**)
|
425 |
+
- Avoid: "moving slightly to the side" (**(X) Too vague**)
|
426 |
+
|
427 |
+
2. **Do not describe appearance, color, or position**—focus purely on the action.
|
428 |
+
- (X) "A large brown bear standing on the left"
|
429 |
+
- (O) "The bear is lifting its front paws and swiping forward."
|
430 |
+
|
431 |
+
3. **Use dynamic, action-specific verbs** rather than passive descriptions.
|
432 |
+
- (O) "The giraffe is tilting its head and sniffing the ground."
|
433 |
+
- (X) "The giraffe is near a tree and looking around."
|
434 |
+
|
435 |
+
4. **Avoid assumptions, emotions, or speculative phrasing.**
|
436 |
+
- (X) "The person seems excited" / "The person might be preparing to jump."
|
437 |
+
- (O) "The person is pushing its front legs against the rock and leaping forward."
|
438 |
+
|
439 |
+
5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
440 |
+
- expressions like 'seems to be', 'appears to be' are BANNED!
|
441 |
+
6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
442 |
+
|
443 |
+
7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
|
444 |
+
- **Each object should have a unique, descriptive action.**
|
445 |
+
- (X) "Two dogs are running."
|
446 |
+
- (O) "1. One dog is chasing another, its legs stretched mid-air.
|
447 |
+
2. The other dog is looking back while speeding up."
|
448 |
+
|
449 |
+
---
|
450 |
+
## Output Format:
|
451 |
+
- Each labeled **{cat_name}** should have exactly **one line of description**.
|
452 |
+
- Format: `ID. {cat_name} + action-based description`
|
453 |
+
- (O) Example:
|
454 |
+
```
|
455 |
+
1. The person is leaning forward while opening a bag with both hands.
|
456 |
+
2. The person is holding onto a rope and pulling themselves up.
|
457 |
+
```
|
458 |
+
- **Ensure that each object is described individually.**
|
459 |
+
- **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
|
460 |
+
|
461 |
+
---
|
462 |
+
## Additional Instructions:
|
463 |
+
- **Do NOT** use expressions like "it appears that..." or "it seems like...".
|
464 |
+
- **Do NOT** mention object IDs in the description (only use the provided format).
|
465 |
+
- **DO NOT** include markdown formatting (no bullet points, no asterisks).
|
466 |
+
- **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
|
467 |
+
|
468 |
+
Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
|
469 |
+
"""
|
470 |
+
|
471 |
+
|
472 |
+
dense_caption_prompt = f"""
|
473 |
+
You are a visual assistant analyzing a single frame of a video.
|
474 |
+
In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
|
475 |
+
|
476 |
+
I am building an **action-centric referring expression** dataset.
|
477 |
+
Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
|
478 |
+
|
479 |
+
---
|
480 |
+
## Guidelines:
|
481 |
+
1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
|
482 |
+
2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
|
483 |
+
3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
|
484 |
+
4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
|
485 |
+
5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
|
486 |
+
6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
|
487 |
+
7. Base your descriptions on these principles:
|
488 |
+
- **Avoid words like 'minimal' or 'slightly'.**
|
489 |
+
- Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
|
490 |
+
- Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
|
491 |
+
- **Specify actions with other objects or entities** only when they are clear and observable.
|
492 |
+
- (O) "pushing another person"
|
493 |
+
- (X) "interacting with another object"
|
494 |
+
|
495 |
+
---
|
496 |
+
## Output Format:
|
497 |
+
- Each labeled **{cat_name}** must have **exactly one line**.
|
498 |
+
- Format: `ID. {cat_name} + action-based description`
|
499 |
+
- (O) Example:
|
500 |
+
```
|
501 |
+
1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
|
502 |
+
2. The person is pulling a baby carriage while smiling.
|
503 |
+
```
|
504 |
+
- **Ensure each object is described individually.**
|
505 |
+
- **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
|
506 |
+
|
507 |
+
---
|
508 |
+
## Example:
|
509 |
+
If the frame has two labeled **bears**, your output should be:
|
510 |
+
```
|
511 |
+
1. The bear is reaching out its right paw while leaning forward to catch prey.
|
512 |
+
2. A bear is standing upright, facing right, and touching the bike beside it.
|
513 |
+
```
|
514 |
+
|
515 |
+
---
|
516 |
+
## Additional Instructions:
|
517 |
+
- **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
|
518 |
+
- **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
|
519 |
+
- **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
|
520 |
+
- **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
|
521 |
+
|
522 |
+
Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
|
523 |
+
|
524 |
+
|
525 |
+
MAX_RETRIES = 3
|
526 |
+
retry_count = 0
|
527 |
+
|
528 |
+
if should_caption:
|
529 |
+
while retry_count < MAX_RETRIES:
|
530 |
+
selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
|
531 |
+
|
532 |
+
response2 = captioner.chat.completions.create(
|
533 |
+
model=model,
|
534 |
+
messages=[
|
535 |
+
{
|
536 |
+
"role": "user",
|
537 |
+
"content": [
|
538 |
+
{
|
539 |
+
"type": "text",
|
540 |
+
"text": selected_prompt,
|
541 |
+
},
|
542 |
+
{
|
543 |
+
"type": "image_url",
|
544 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
545 |
+
},
|
546 |
+
],
|
547 |
+
}
|
548 |
+
],
|
549 |
+
)
|
550 |
+
|
551 |
+
# caption = response2.choices[0].message.content
|
552 |
+
#print(f"{image_path} - {frame_name}: {caption}")
|
553 |
+
|
554 |
+
caption = response2.choices[0].message.content.strip()
|
555 |
+
caption_lower = caption.lower().lstrip()
|
556 |
+
|
557 |
+
if caption_lower.startswith("1.") and not any(
|
558 |
+
phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
|
559 |
+
):
|
560 |
+
break
|
561 |
+
|
562 |
+
print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
|
563 |
+
retry_count += 1
|
564 |
+
time.sleep(2)
|
565 |
+
|
566 |
+
if retry_count == MAX_RETRIES:
|
567 |
+
caption = None
|
568 |
+
print("Max retries reached. Caption generation failed.")
|
569 |
+
|
570 |
+
else:
|
571 |
+
caption = None
|
572 |
+
|
573 |
+
image_captions[frame_name] = caption
|
574 |
+
all_captions[cat_name] = image_captions
|
575 |
+
|
576 |
+
# final : also prepare valid object ids
|
577 |
+
valid_obj_ids = dict()
|
578 |
+
|
579 |
+
for cat in cat_names:
|
580 |
+
if cat in ytvos_category_valid_list:
|
581 |
+
obj_id_cat = vid_meta['obj_id_cat']
|
582 |
+
valid_cat_ids = []
|
583 |
+
for obj_id in list(obj_id_cat.keys()):
|
584 |
+
if obj_id_cat[obj_id] == cat:
|
585 |
+
valid_cat_ids.append(obj_id)
|
586 |
+
valid_obj_ids[cat] = valid_cat_ids
|
587 |
+
|
588 |
+
return vid_id, all_captions, valid_obj_ids
|
589 |
+
|
590 |
+
|
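`getCaption` stores the model output as one numbered text block per frame ("1. ...", "2. ..."), keyed by category and frame index. If those blocks are later matched to individual object ids for grounding, a small parser along the following lines could split them apart; `split_numbered_caption` is a hypothetical post-processing helper and assumes the "N. description" format the prompts request.

```
import re

def split_numbered_caption(caption: str) -> dict:
    """Hypothetical post-processing step: split a numbered caption block
    into {object_id: referring expression}."""
    expressions = {}
    if not caption:
        return expressions
    for line in caption.splitlines():
        match = re.match(r"\s*(\d+)\.\s*(.+)", line)
        if match:
            expressions[match.group(1)] = match.group(2).strip()
    return expressions

example = "1. The person is holding ski poles and skiing down a slope.\n2. The person is pulling a sled."
print(split_numbered_caption(example))   # {'1': '...', '2': '...'}
```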
591 |
+
if __name__ == '__main__':
|
592 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
593 |
+
parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
|
594 |
+
parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
|
595 |
+
|
596 |
+
args = parser.parse_args()
|
597 |
+
|
598 |
+
#================== Load data ===================
|
599 |
+
# Full dataset
|
600 |
+
train_dataset = build_ytvos_ref(image_set = 'train', args = args)
|
601 |
+
|
602 |
+
# Metadata for the full dataset
|
603 |
+
metas = train_dataset.metas
|
604 |
+
|
605 |
+
# 8 color candidates (RGB format)
|
606 |
+
colors = [
|
607 |
+
(255, 0, 0), # Red
|
608 |
+
(0, 255, 0), # Green
|
609 |
+
(0, 0, 255), # Blue
|
610 |
+
(255, 255, 0), # Yellow
|
611 |
+
(255, 0, 255), # Magenta
|
612 |
+
(0, 255, 255), # Cyan
|
613 |
+
(128, 0, 128), # Purple
|
614 |
+
(255, 165, 0) # Orange
|
615 |
+
]
|
616 |
+
|
617 |
+
ytvos_category_valid_list = [
|
618 |
+
'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
|
619 |
+
'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
|
620 |
+
'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
|
621 |
+
'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
|
622 |
+
'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
|
623 |
+
'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
|
624 |
+
]
|
625 |
+
|
626 |
+
#================== Run GPT ===================
|
627 |
+
os.environ['OPENAI_API_KEY'] = '<REDACTED>'  # hard-coded key removed; provide the key via the environment instead
|
628 |
+
|
629 |
+
result_captions = {}
|
630 |
+
result_valid_obj_ids = {}
|
631 |
+
|
632 |
+
for i in range(len(metas)):
|
633 |
+
try:
|
634 |
+
vid_id, all_captions, valid_obj_ids = getCaption(i)
|
635 |
+
|
636 |
+
if vid_id not in result_captions:
|
637 |
+
result_captions[vid_id] = all_captions
|
638 |
+
if vid_id not in result_valid_obj_ids:
|
639 |
+
result_valid_obj_ids[vid_id] = valid_obj_ids
|
640 |
+
|
641 |
+
except (requests.exceptions.ConnectionError, APIConnectionError) as e:
|
642 |
+
print(f"created caption until {i-1}", flush=True)
|
643 |
+
print("인터넷 연결 문제로 요청을 처리할 수 없습니다:", e, flush=True)
|
644 |
+
|
645 |
+
with open(args.save_caption_path, "w") as file:
|
646 |
+
json.dump(result_captions, file, indent=4)
|
647 |
+
|
648 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
649 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
650 |
+
|
651 |
+
except OpenAIError as e:
|
652 |
+
print(f"created caption until {i-1}", flush=True)
|
653 |
+
print("OpenAI API 관련 오류가 발생했습니다:", e, flush=True)
|
654 |
+
|
655 |
+
with open(args.save_caption_path, "w") as file:
|
656 |
+
json.dump(result_captions, file, indent=4)
|
657 |
+
|
658 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
659 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
660 |
+
|
661 |
+
except Exception as e:
|
662 |
+
print(f"created caption until {i-1}", flush=True)
|
663 |
+
print("알 수 없는 오류 발생:", e, flush=True)
|
664 |
+
|
665 |
+
with open(args.save_caption_path, "w") as file:
|
666 |
+
json.dump(result_captions, file, indent=4)
|
667 |
+
|
668 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
669 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
670 |
+
|
671 |
+
print("Finished!", flush=True)
|
672 |
+
|
673 |
+
with open(args.save_caption_path, "w") as file:
|
674 |
+
json.dump(result_captions, file, indent=4)
|
675 |
+
|
676 |
+
with open(args.save_valid_obj_ids_path, "w") as file:
|
677 |
+
json.dump(result_valid_obj_ids, file, indent=4)
|
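This snapshot wraps the per-video loop in try/except blocks that dump whatever has been collected once a connection or API error occurs. An alternative worth considering is to retry the failing call itself with exponential backoff before giving up; the sketch below shows one way to do that. `call_with_backoff` is a hypothetical wrapper, and the exception classes are assumed to come from the openai>=1.0 package as imported at the top of this file.

```
import time

from openai import APIConnectionError, OpenAIError

def call_with_backoff(fn, *args, max_retries=3, base_delay=2.0, **kwargs):
    """Hypothetical wrapper: retry a flaky API call with exponential backoff
    instead of only dumping partial results when it fails."""
    for attempt in range(max_retries):
        try:
            return fn(*args, **kwargs)
        except (APIConnectionError, OpenAIError) as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt)
            print(f"API call failed ({e!r}); retrying in {delay:.0f}s", flush=True)
            time.sleep(delay)

# Illustrative usage against the loop above:
#   vid_id, all_captions, valid_obj_ids = call_with_backoff(getCaption, i)
```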
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207184812.py
ADDED
@@ -0,0 +1,676 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
import time
|
5 |
+
|
6 |
+
from os import path as osp
|
7 |
+
from io import BytesIO
|
8 |
+
import random
|
9 |
+
|
10 |
+
from mbench.ytvos_ref import build as build_ytvos_ref
|
11 |
+
import argparse
|
12 |
+
import opts
|
13 |
+
|
14 |
+
import sys
|
15 |
+
from pathlib import Path
|
16 |
+
import os
|
17 |
+
from os import path as osp
|
18 |
+
import skimage
|
19 |
+
from io import BytesIO
|
20 |
+
|
21 |
+
import numpy as np
|
22 |
+
import pandas as pd
|
23 |
+
import regex as re
|
24 |
+
import json
|
25 |
+
|
26 |
+
import cv2
|
27 |
+
from PIL import Image, ImageDraw
|
28 |
+
import torch
|
29 |
+
from torchvision.transforms import functional as F
|
30 |
+
|
31 |
+
from skimage import measure # (pip install scikit-image)
|
32 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
33 |
+
|
34 |
+
import matplotlib.pyplot as plt
|
35 |
+
import matplotlib.patches as patches
|
36 |
+
from matplotlib.collections import PatchCollection
|
37 |
+
from matplotlib.patches import Rectangle
|
38 |
+
import textwrap
|
39 |
+
|
40 |
+
|
41 |
+
import ipywidgets as widgets
|
42 |
+
from IPython.display import display, clear_output
|
43 |
+
|
44 |
+
from openai import OpenAI, APIConnectionError, OpenAIError
|
45 |
+
import base64
|
46 |
+
import json
|
47 |
+
import requests
|
48 |
+
|
49 |
+
def number_objects_and_encode_old(idx, color_mask=False):
|
50 |
+
encoded_frames = {}
|
51 |
+
contoured_frames = {} # New dictionary for original images
|
52 |
+
vid_cat_cnts = {}
|
53 |
+
|
54 |
+
vid_meta = metas[idx]
|
55 |
+
vid_data = train_dataset[idx]
|
56 |
+
vid_id = vid_meta['video']
|
57 |
+
frame_indx = vid_meta['sample_indx']
|
58 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
59 |
+
imgs = vid_data[0]
|
60 |
+
|
61 |
+
for cat in cat_names:
|
62 |
+
cat_frames = []
|
63 |
+
contour_frames = []
|
64 |
+
frame_cat_cnts = {}
|
65 |
+
|
66 |
+
for i in range(imgs.size(0)):
|
67 |
+
frame_name = frame_indx[i]
|
68 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
69 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
70 |
+
|
71 |
+
frame_data = vid_data[2][frame_name]
|
72 |
+
obj_ids = list(frame_data.keys())
|
73 |
+
|
74 |
+
cat_cnt = 0
|
75 |
+
|
76 |
+
for j in range(len(obj_ids)):
|
77 |
+
obj_id = obj_ids[j]
|
78 |
+
obj_data = frame_data[obj_id]
|
79 |
+
obj_bbox = obj_data['bbox']
|
80 |
+
obj_valid = obj_data['valid']
|
81 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
82 |
+
obj_cat = obj_data['category_name']
|
83 |
+
|
84 |
+
if obj_cat == cat and obj_valid:
|
85 |
+
cat_cnt += 1
|
86 |
+
|
87 |
+
if color_mask == False:
|
88 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
89 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
90 |
+
for i, contour in enumerate(contours):
|
91 |
+
moments = cv2.moments(contour)
|
92 |
+
if moments["m00"] != 0:
|
93 |
+
cx = int(moments["m10"] / moments["m00"])
|
94 |
+
cy = int(moments["m01"] / moments["m00"])
|
95 |
+
else:
|
96 |
+
cx, cy = contour[0][0]
|
97 |
+
|
98 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
99 |
+
text = obj_id
|
100 |
+
text_size = cv2.getTextSize(text, font, 1, 2)[0]
|
101 |
+
text_w, text_h = text_size
|
102 |
+
|
103 |
+
cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
|
104 |
+
(cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
|
105 |
+
|
106 |
+
cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
|
107 |
+
font, 1, (255, 255, 255), 2)
|
108 |
+
|
109 |
+
else:
|
110 |
+
alpha = 0.08
|
111 |
+
|
112 |
+
colored_obj_mask = np.zeros_like(frame)
|
113 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
114 |
+
frame[obj_mask == 1] = (
|
115 |
+
(1 - alpha) * frame[obj_mask == 1]
|
116 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
117 |
+
)
|
118 |
+
|
119 |
+
|
120 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
121 |
+
cv2.drawContours(frame, contours, -1, colors[j], 2)
|
122 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
123 |
+
|
124 |
+
if len(contours) > 0:
|
125 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
126 |
+
M = cv2.moments(largest_contour)
|
127 |
+
if M["m00"] != 0:
|
128 |
+
center_x = int(M["m10"] / M["m00"])
|
129 |
+
center_y = int(M["m01"] / M["m00"])
|
130 |
+
else:
|
131 |
+
center_x, center_y = 0, 0
|
132 |
+
|
133 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
134 |
+
text = obj_id
|
135 |
+
|
136 |
+
font_scale = 0.9
|
137 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
138 |
+
text_x = center_x - text_size[0] // 1
|
139 |
+
text_y = center_y
|
140 |
+
|
141 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5)
|
142 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
143 |
+
|
144 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
145 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
146 |
+
|
147 |
+
# plt.figure(figsize=(12, 8))
|
148 |
+
# plt.imshow(frame)
|
149 |
+
# plt.title(f"frame {frame_name}")
|
150 |
+
# plt.tight_layout()
|
151 |
+
# plt.axis('off')
|
152 |
+
# plt.show()
|
153 |
+
|
154 |
+
buffer = BytesIO()
|
155 |
+
frame = Image.fromarray(frame)
|
156 |
+
frame.save(buffer, format='jpeg')
|
157 |
+
buffer.seek(0)
|
158 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
159 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
160 |
+
|
161 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
162 |
+
buffer.truncate()
|
163 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
164 |
+
frame_for_contour.save(buffer, format='jpeg')
|
165 |
+
buffer.seek(0)
|
166 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
167 |
+
|
168 |
+
encoded_frames[cat] = cat_frames
|
169 |
+
contoured_frames[cat] = contour_frames
|
170 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
171 |
+
|
172 |
+
return encoded_frames, contoured_frames, vid_cat_cnts
|
173 |
+
|
174 |
+
|
175 |
+
def number_objects_and_encode(idx, color_mask=False):
|
176 |
+
encoded_frames = {}
|
177 |
+
contoured_frames = {} # New dictionary for original images
|
178 |
+
vid_cat_cnts = {}
|
179 |
+
|
180 |
+
vid_meta = metas[idx]
|
181 |
+
vid_data = train_dataset[idx]
|
182 |
+
vid_id = vid_meta['video']
|
183 |
+
frame_indx = vid_meta['sample_indx']
|
184 |
+
cat_names = set(vid_meta['obj_id_cat'].values())
|
185 |
+
imgs = vid_data[0]
|
186 |
+
|
187 |
+
for cat in cat_names:
|
188 |
+
cat_frames = []
|
189 |
+
contour_frames = []
|
190 |
+
frame_cat_cnts = {}
|
191 |
+
|
192 |
+
for i in range(imgs.size(0)):
|
193 |
+
frame_name = frame_indx[i]
|
194 |
+
frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
195 |
+
frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
|
196 |
+
|
197 |
+
frame_data = vid_data[2][frame_name]
|
198 |
+
obj_ids = list(frame_data.keys())
|
199 |
+
|
200 |
+
cat_cnt = 0
|
201 |
+
|
202 |
+
for j in range(len(obj_ids)):
|
203 |
+
obj_id = obj_ids[j]
|
204 |
+
obj_data = frame_data[obj_id]
|
205 |
+
obj_bbox = obj_data['bbox']
|
206 |
+
obj_valid = obj_data['valid']
|
207 |
+
obj_mask = obj_data['mask'].numpy().astype(np.uint8)
|
208 |
+
obj_cat = obj_data['category_name']
|
209 |
+
|
210 |
+
if obj_cat == cat and obj_valid:
|
211 |
+
cat_cnt += 1
|
212 |
+
|
213 |
+
contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
214 |
+
cv2.drawContours(frame, contours, -1, colors[j], 3)
|
215 |
+
cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
|
216 |
+
|
217 |
+
if len(contours) > 0:
|
218 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
219 |
+
M = cv2.moments(largest_contour)
|
220 |
+
if M["m00"] != 0:
|
221 |
+
center_x = int(M["m10"] / M["m00"])
|
222 |
+
center_y = int(M["m01"] / M["m00"])
|
223 |
+
else:
|
224 |
+
center_x, center_y = 0, 0
|
225 |
+
|
226 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
227 |
+
text = obj_id
|
228 |
+
font_scale = 1.2
|
229 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
230 |
+
text_x = center_x - text_size[0] // 1
|
231 |
+
text_y = center_y
|
232 |
+
|
233 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5)
|
234 |
+
rect_end = (text_x + text_size[0] + 5, text_y + 3)
|
235 |
+
|
236 |
+
contour_thickness = 1
|
237 |
+
rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
|
238 |
+
rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
|
239 |
+
|
240 |
+
cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
|
241 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
242 |
+
cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
|
243 |
+
|
244 |
+
|
245 |
+
if color_mask:
|
246 |
+
alpha = 0.08
|
247 |
+
colored_obj_mask = np.zeros_like(frame)
|
248 |
+
colored_obj_mask[obj_mask == 1] = colors[j]
|
249 |
+
frame[obj_mask == 1] = (
|
250 |
+
(1 - alpha) * frame[obj_mask == 1]
|
251 |
+
+ alpha * colored_obj_mask[obj_mask == 1]
|
252 |
+
)
|
253 |
+
|
254 |
+
# plt.figure(figsize=(12, 8))
|
255 |
+
# plt.imshow(frame)
|
256 |
+
# plt.title(f"frame {frame_name}")
|
257 |
+
# plt.tight_layout()
|
258 |
+
# plt.axis('off')
|
259 |
+
# plt.show()
|
260 |
+
|
261 |
+
buffer = BytesIO()
|
262 |
+
frame = Image.fromarray(frame)
|
263 |
+
frame.save(buffer, format='jpeg')
|
264 |
+
buffer.seek(0)
|
265 |
+
cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
266 |
+
frame_cat_cnts[frame_name] = cat_cnt
|
267 |
+
|
268 |
+
buffer.seek(0) # Reuse buffer instead of creating a new one
|
269 |
+
buffer.truncate()
|
270 |
+
frame_for_contour = Image.fromarray(frame_for_contour)
|
271 |
+
frame_for_contour.save(buffer, format='jpeg')
|
272 |
+
buffer.seek(0)
|
273 |
+
contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
|
274 |
+
|
275 |
+
encoded_frames[cat] = cat_frames
|
276 |
+
contoured_frames[cat] = contour_frames
|
277 |
+
vid_cat_cnts[cat] = frame_cat_cnts
|
278 |
+
|
279 |
+
return encoded_frames, contoured_frames, vid_cat_cnts
|
280 |
+
|
281 |
+
|
282 |
+
|
283 |
+
def getCaption(idx, model='gpt-4o'):
|
284 |
+
vid_meta = metas[idx]
|
285 |
+
vid_data = train_dataset[idx]
|
286 |
+
vid_id = vid_meta['video']
|
287 |
+
print(f"vid id: {vid_id}\n")
|
288 |
+
|
289 |
+
frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
|
290 |
+
cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
|
291 |
+
all_captions = dict()
|
292 |
+
|
293 |
+
# color_mask = random.choice([True, False])
|
294 |
+
color_mask = random.choices([False, True], weights=[60, 40])[0]
|
295 |
+
|
296 |
+
base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
|
297 |
+
#marked = "mask with boundary" if color_mask else "boundary"
|
298 |
+
|
299 |
+
for cat_name in list(cat_names) :
|
300 |
+
|
301 |
+
is_movable = False
|
302 |
+
if cat_name in ytvos_category_valid_list :
|
303 |
+
is_movable = True
|
304 |
+
|
305 |
+
if not is_movable:
|
306 |
+
print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
|
307 |
+
|
308 |
+
|
309 |
+
image_captions = {}
|
310 |
+
captioner = OpenAI()
|
311 |
+
cat_base64_frames = base64_frames[cat_name]
|
312 |
+
# cont_base64_frames = contoured_frames[cat_name]
|
313 |
+
|
314 |
+
for i in range(len(cat_base64_frames)):
|
315 |
+
frame_name = frame_indx[i]
|
316 |
+
# cont_base64_image = cont_base64_frames[i]
|
317 |
+
base64_image = cat_base64_frames[i]
|
318 |
+
should_filter = False
|
319 |
+
frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
|
320 |
+
|
321 |
+
if frame_cat_cnts >= 2:
|
322 |
+
should_filter = True
|
323 |
+
else:
|
324 |
+
print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
|
325 |
+
|
326 |
+
|
327 |
+
if is_movable and should_filter:
|
328 |
+
# Step 1: filtering
|
329 |
+
print(f"-----------category name: {cat_name}, frame name: {frame_name}")
|
330 |
+
caption_filter_text = f"""
|
331 |
+
You are a visual assistant analyzing a single frame from a video.
|
332 |
+
In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
|
333 |
+
|
334 |
+
Are {cat_name}s in the image performing all different and recognizable actions or postures?
|
335 |
+
Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
|
336 |
+
facial expressions, and any notable interactions with objects or other {cat_name}s or people.
|
337 |
+
|
338 |
+
Only focus on obvious, prominent actions that can be reliably identified from this single frame.
|
339 |
+
|
340 |
+
- Respond with "YES" if:
|
341 |
+
1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
|
342 |
+
(e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
|
343 |
+
2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
|
344 |
+
3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
|
345 |
+
|
346 |
+
- Respond with "NONE" if:
|
347 |
+
1) The actions or pose are not clearly differentiable or too similar.
|
348 |
+
2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
|
349 |
+
3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
|
350 |
+
|
351 |
+
Answer strictly with either "YES" or "NONE".
|
352 |
+
"""
|
353 |
+
|
354 |
+
response1 = captioner.chat.completions.create(
|
355 |
+
model=model,
|
356 |
+
messages=[
|
357 |
+
{
|
358 |
+
"role": "user",
|
359 |
+
"content": [
|
360 |
+
{
|
361 |
+
"type": "text",
|
362 |
+
"text": caption_filter_text,
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"type": "image_url",
|
366 |
+
"image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
|
367 |
+
}
|
368 |
+
],
|
369 |
+
}
|
370 |
+
],
|
371 |
+
)
|
372 |
+
response_content = response1.choices[0].message.content
|
373 |
+
should_caption = True if "yes" in response_content.lower() else False
|
374 |
+
print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
|
375 |
+
|
376 |
+
else:
|
377 |
+
should_caption = False
|
378 |
+
|
379 |
+
# Step 2: build dense captions
|
380 |
+
dense_caption_prompt_1 = f"""
|
381 |
+
In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
|
382 |
+
|
383 |
+
Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
|
384 |
+
|
385 |
+
1. Focus only on clear, unique, and prominent actions that distinguish each object.
|
386 |
+
2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
|
387 |
+
3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
|
388 |
+
4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
|
389 |
+
5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
|
390 |
+
6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
|
391 |
+
- expressions like 'seems to be', 'appears to be' are BANNED!
|
392 |
+
7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
|
393 |
+
8. Include interactions with objects or other entities when they are prominent and observable.
|
394 |
+
9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
|
395 |
+
10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
|
396 |
+
11. Do not mention object IDs.
|
397 |
+
12. Use '{cat_name}' as the noun for the referring expressions.
|
398 |
+
|
399 |
+
Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
|
400 |
+
|
401 |
+
- Your answer should contain details, and follow the following format:
|
402 |
+
object id. action-oriented description
|
403 |
+
(e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
|
404 |
+
2. a person bending over and touching his boots to tie the shoelace.)
|
405 |
+
- for action-oriented description, use {cat_name} as subject noun
|
406 |
+
|
407 |
+
**Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
|
408 |
+
Please pay attention to the categories of these objects and don’t change them.
|
409 |
+
Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
|
410 |
+
Output referring expressions for each object id. Please start your answer:"""
|
411 |
+
|
412 |
+
|
413 |
+
dense_caption_prompt_2 = f"""
|
414 |
+
You are an advanced visual language model analyzing a video frame.
|
415 |
+
In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
|
416 |
+
|
417 |
+
Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
|
418 |
+
Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
|
419 |
+
|
420 |
+
---
|
421 |
+
## Key Guidelines:
|
422 |
+
1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
|
423 |
+
- Example: "grabbing a branch and pulling it down" (**(O) Specific**)
|
424 |
+
- Avoid: "moving slightly to the side" (**(X) Too vague**)
|
425 |
+
|
426 |
+
2. **Do not describe appearance, color, or position**—focus purely on the action.
   - (X) "A large brown bear standing on the left"
   - (O) "The bear is lifting its front paws and swiping forward."

3. **Use dynamic, action-specific verbs** rather than passive descriptions.
   - (O) "The giraffe is tilting its head and sniffing the ground."
   - (X) "The giraffe is near a tree and looking around."

4. **Avoid assumptions, emotions, or speculative phrasing.**
   - (X) "The person seems excited" / "The person might be preparing to jump."
   - (O) "The person is pushing its front legs against the rock and leaping forward."

5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
   - Expressions like 'seems to be' and 'appears to be' are BANNED!
6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.

7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
   - **Each object should have a unique, descriptive action.**
   - (X) "Two dogs are running."
   - (O) "1. One dog is chasing another, its legs stretched mid-air.
          2. The other dog is looking back while speeding up."

---
## Output Format:
- Each labeled **{cat_name}** should have exactly **one line of description**.
- Format: `ID. {cat_name} + action-based description`
- (O) Example:
```
1. The person is leaning forward while opening a bag with both hands.
2. The person is holding onto a rope and pulling themselves up.
```
- **Ensure that each object is described individually.**
- **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

---
## Additional Instructions:
- **Do NOT** use expressions like "it appears that..." or "it seems like...".
- **Do NOT** mention object IDs in the description (only use the provided format).
- **Do NOT** include markdown formatting (no bullet points, no asterisks).
- **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.

Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
"""


    dense_caption_prompt = f"""
You are a visual assistant analyzing a single frame of a video.
In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.

I am building an **action-centric referring expression** dataset.
Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.

---
## Guidelines:
1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
7. Base your descriptions on these principles:
   - **Avoid words like 'minimal' or 'slightly'.**
   - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
   - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
   - **Specify actions with other objects or entities** only when they are clear and observable.
     - (O) "pushing another person"
     - (X) "interacting with another object"

---
## Output Format:
- Each labeled **{cat_name}** must have **exactly one line**.
- Format: `ID. {cat_name} + action-based description`
- (O) Example:
```
1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
2. The person is pulling a baby carriage while smiling.
```
- **Ensure each object is described individually.**
- **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

---
## Example:
If the frame has two labeled **bears**, your output should be:
```
1. The bear is reaching out its right paw while leaning forward to catch prey.
2. A bear is standing upright, facing right, and touching the bike beside it.
```

---
## Additional Instructions:
- **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
- **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
- **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
- **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.

Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""


    # Query GPT, retrying until the reply starts with a numbered list and is not a refusal.
    MAX_RETRIES = 3
    retry_count = 0

    if should_caption:
        while retry_count < MAX_RETRIES:
            selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])

            response2 = captioner.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": selected_prompt,
                            },
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                            },
                        ],
                    }
                ],
            )

            # caption = response2.choices[0].message.content
            # print(f"{image_path} - {frame_name}: {caption}")

            caption = response2.choices[0].message.content.strip()
            caption_lower = caption.lower().lstrip()

            if caption_lower.startswith("1.") and not any(
                phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
            ):
                break

            print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
            retry_count += 1
            time.sleep(2)

        if retry_count == MAX_RETRIES:
            caption = None
            print("Max retries reached. Caption generation failed.")

    else:
        caption = None

    image_captions[frame_name] = caption
    all_captions[cat_name] = image_captions

    # final: also prepare valid object ids
    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")

    args = parser.parse_args()

    #================== Load data ===================
    # Full dataset
    train_dataset = build_ytvos_ref(image_set = 'train', args = args)

    # Full dataset metadata
    metas = train_dataset.metas

    # 8 candidate colors (RGB)
    colors = [
        (255, 0, 0),    # Red
        (0, 255, 0),    # Green
        (0, 0, 255),    # Blue
        (255, 255, 0),  # Yellow
        (255, 0, 255),  # Magenta
        (0, 255, 255),  # Cyan
        (128, 0, 128),  # Purple
        (255, 165, 0)   # Orange
    ]

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    #================== Run GPT ===================
    os.environ['OPENAI_API_KEY'] = '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'

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(len(metas)):
        try:
            vid_id, all_captions, valid_obj_ids = getCaption(i)

            if vid_id not in result_captions:
                result_captions[vid_id] = all_captions
            if vid_id not in result_valid_obj_ids:
                result_valid_obj_ids[vid_id] = valid_obj_ids

        except (requests.exceptions.ConnectionError, APIConnectionError) as e:
            print(f"created caption until {i-1}", flush=True)
            print("Could not process the request because of a network connection problem:", e, flush=True)

            with open(args.save_caption_path, "w") as file:
                json.dump(result_captions, file, indent=4)

            with open(args.save_valid_obj_ids_path, "w") as file:
                json.dump(result_valid_obj_ids, file, indent=4)

        except OpenAIError as e:
            print(f"created caption until {i-1}", flush=True)
            print("An OpenAI API error occurred:", e, flush=True)

            with open(args.save_caption_path, "w") as file:
                json.dump(result_captions, file, indent=4)

            with open(args.save_valid_obj_ids_path, "w") as file:
                json.dump(result_valid_obj_ids, file, indent=4)

        except Exception as e:
            print(f"created caption until {i-1}", flush=True)
            print("An unknown error occurred:", e, flush=True)

            with open(args.save_caption_path, "w") as file:
                json.dump(result_captions, file, indent=4)

            with open(args.save_valid_obj_ids_path, "w") as file:
                json.dump(result_valid_obj_ids, file, indent=4)

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)
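For orientation, here is a minimal sketch (not part of the uploaded files) of how the two JSON files written above could be consumed. It assumes the default `--save_caption_path` and `--save_valid_obj_ids_path` values, and the line-splitting regex is only an illustration of the `ID. {cat_name} + description` format the prompts ask for.

```python
import json
import re

# vid_id -> {cat_name: {frame_name: caption or None}}
with open("mbench/numbered_captions_gpt-4o_randcap.json") as f:
    captions = json.load(f)
# vid_id -> {cat_name: [obj_id, ...]}
with open("mbench/numbered_valid_obj_ids_gpt-4o_randcap.json") as f:
    valid_obj_ids = json.load(f)

for vid_id, per_cat in captions.items():
    for cat_name, per_frame in per_cat.items():
        for frame_name, caption in per_frame.items():
            if caption is None:  # retries exhausted, or should_caption was False
                continue
            # "1. The person is ..." -> {"1": "The person is ..."}
            lines = re.findall(r"^\s*(\d+)\.\s*(.+)$", caption, flags=re.MULTILINE)
            per_object = {obj_id: text for obj_id, text in lines}
            print(vid_id, cat_name, frame_name, per_object)
```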
.history/mbench/make_ref-ytvos_json_20250113183250.py
ADDED
@@ -0,0 +1,103 @@
import sys
from os import path as osp
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts


from pathlib import Path
import io

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure                         # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle


import ipywidgets as widgets
from IPython.display import display, clear_output

#================== Build JSON ===================
def createJson(train_dataset, metas):
    entire_json = {}

    # Initialization
    data_idx = 0

    while data_idx < 10:

        # For one video
        video_data = {}
        video_id = metas[data_idx]['video']
        video_data['bins'] = metas[data_idx]['bins']
        annotation_data = []
        frame_names = []

        while metas[data_idx]['video'] == video_id:

            obj_id = metas[data_idx]['obj_id']
            sample_id = metas[data_idx]['sample_id']
            sample_frames_id = metas[data_idx]['sample_frames_id']
            sample_frame_idx = sample_frames_id.index(sample_id)

            frames = metas[data_idx]['frames']

            frame_name = frames[sample_id]
            cat_name = metas[data_idx]['category']

            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]

            obj_data = {obj_id: {
                "category_name": cat_name,
                "bbox": bbox
            }}

            annotation_data.append(obj_data)

            frame_names.append(frame_name)

            data_idx += 1

        video_data['annotations'] = annotation_data
        video_data['frame_names'] = frame_names
        video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)

        entire_json[video_id] = video_data

    return entire_json


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    #================== Load data ===================
    # Full dataset
    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)

    # Full dataset metadata
    metas = train_dataset.metas

    #================== Build JSON ===================
    entire_json_dict = createJson(train_dataset, metas)
    print(type(entire_json_dict))
    entire_json = json.dumps(entire_json_dict, indent=4)

    with open('mbench/sampled_frame.json', mode='w') as file:
        file.write(entire_json)
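For reference, one entry of the `mbench/sampled_frame.json` that `createJson` above produces looks roughly like the sketch below (video id, values, and paths are made up). Note that this early snapshot stores `bbox` as a tensor, which `json.dumps` cannot serialize; that appears to be why the subsequent snapshots in this folder convert it with `.numpy()` / `.tolist()` before dumping.

```python
# Illustrative shape of one video entry (all values are placeholders).
example_entry = {
    "0a2f2bd294": {
        "bins": [[2, 10], [10, 18], [18, 26], [26, 33]],
        "annotations": [
            {"1": {"category_name": "person", "bbox": [12.0, 34.0, 156.0, 240.0]}},
            {"2": {"category_name": "dog",    "bbox": [60.0, 80.0, 210.0, 300.0]}},
        ],
        "frame_names": ["00005", "00045"],
        "video_path": "<ytvos_root>/train/JPEGImages/0a2f2bd294",
    }
}
```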
.history/mbench/make_ref-ytvos_json_20250113183335.py
ADDED
@@ -0,0 +1,103 @@
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from datasets import build_dataset
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
|
10 |
+
from pathlib import Path
|
11 |
+
import io
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
import pandas as pd
|
15 |
+
import regex as re
|
16 |
+
import json
|
17 |
+
|
18 |
+
import cv2
|
19 |
+
from PIL import Image, ImageDraw
|
20 |
+
import torch
|
21 |
+
from torchvision.transforms import functional as F
|
22 |
+
|
23 |
+
from skimage import measure # (pip install scikit-image)
|
24 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
25 |
+
|
26 |
+
import matplotlib.pyplot as plt
|
27 |
+
import matplotlib.patches as patches
|
28 |
+
from matplotlib.collections import PatchCollection
|
29 |
+
from matplotlib.patches import Rectangle
|
30 |
+
|
31 |
+
|
32 |
+
import ipywidgets as widgets
|
33 |
+
from IPython.display import display, clear_output
|
34 |
+
|
35 |
+
#==================json 만들기===================
|
36 |
+
def createJson(train_dataset, metas):
|
37 |
+
entire_json = {}
|
38 |
+
|
39 |
+
#초기화
|
40 |
+
data_idx = 0
|
41 |
+
|
42 |
+
while data_idx < 10:
|
43 |
+
|
44 |
+
#하나의 비디오에 대해
|
45 |
+
video_data = {}
|
46 |
+
video_id = metas[data_idx]['video']
|
47 |
+
video_data['bins'] = metas[data_idx]['bins']
|
48 |
+
annotation_data = []
|
49 |
+
frame_names = []
|
50 |
+
|
51 |
+
while metas[data_idx]['video'] == video_id:
|
52 |
+
|
53 |
+
obj_id = metas[data_idx]['obj_id']
|
54 |
+
sample_id = metas[data_idx]['sample_id']
|
55 |
+
sample_frames_id = metas[data_idx]['sample_frames_id']
|
56 |
+
sample_frame_idx = sample_frames_id.index(sample_id)
|
57 |
+
|
58 |
+
frames = metas[data_idx]['frames']
|
59 |
+
|
60 |
+
frame_name = frames[sample_id]
|
61 |
+
cat_name = metas[data_idx]['category']
|
62 |
+
|
63 |
+
bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].numpy()
|
64 |
+
|
65 |
+
obj_data = {obj_id: {
|
66 |
+
"category_name" : cat_name,
|
67 |
+
"bbox": bbox
|
68 |
+
}}
|
69 |
+
|
70 |
+
|
71 |
+
annotation_data.append(obj_data)
|
72 |
+
|
73 |
+
frame_names.append(frame_name)
|
74 |
+
|
75 |
+
data_idx += 1
|
76 |
+
|
77 |
+
video_data['annotations'] = annotation_data
|
78 |
+
video_data['frame_names'] = frame_names
|
79 |
+
video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
80 |
+
|
81 |
+
entire_json[video_id] = video_data
|
82 |
+
|
83 |
+
return entire_json
|
84 |
+
|
85 |
+
|
86 |
+
if __name__ == '__main__':
|
87 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
88 |
+
args = parser.parse_args()
|
89 |
+
|
90 |
+
#==================데이터 불러오기===================
|
91 |
+
# 전체 데이터셋
|
92 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
93 |
+
|
94 |
+
# 전체 데이터셋 메타데이터
|
95 |
+
metas = train_dataset.metas
|
96 |
+
|
97 |
+
#==================json 만들기===================
|
98 |
+
entire_json_dict = createJson(train_dataset, metas)
|
99 |
+
print(type(entire_json_dict))
|
100 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
101 |
+
|
102 |
+
with open('mbench/sampled_frame.json', mode='w') as file:
|
103 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250113183413.py
ADDED
@@ -0,0 +1,103 @@
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from datasets import build_dataset
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
|
10 |
+
from pathlib import Path
|
11 |
+
import io
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
import pandas as pd
|
15 |
+
import regex as re
|
16 |
+
import json
|
17 |
+
|
18 |
+
import cv2
|
19 |
+
from PIL import Image, ImageDraw
|
20 |
+
import torch
|
21 |
+
from torchvision.transforms import functional as F
|
22 |
+
|
23 |
+
from skimage import measure # (pip install scikit-image)
|
24 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
25 |
+
|
26 |
+
import matplotlib.pyplot as plt
|
27 |
+
import matplotlib.patches as patches
|
28 |
+
from matplotlib.collections import PatchCollection
|
29 |
+
from matplotlib.patches import Rectangle
|
30 |
+
|
31 |
+
|
32 |
+
import ipywidgets as widgets
|
33 |
+
from IPython.display import display, clear_output
|
34 |
+
|
35 |
+
#==================json 만들기===================
|
36 |
+
def createJson(train_dataset, metas):
|
37 |
+
entire_json = {}
|
38 |
+
|
39 |
+
#초기화
|
40 |
+
data_idx = 0
|
41 |
+
|
42 |
+
while data_idx < 10:
|
43 |
+
|
44 |
+
#하나의 비디오에 대해
|
45 |
+
video_data = {}
|
46 |
+
video_id = metas[data_idx]['video']
|
47 |
+
video_data['bins'] = metas[data_idx]['bins']
|
48 |
+
annotation_data = []
|
49 |
+
frame_names = []
|
50 |
+
|
51 |
+
while metas[data_idx]['video'] == video_id:
|
52 |
+
|
53 |
+
obj_id = metas[data_idx]['obj_id']
|
54 |
+
sample_id = metas[data_idx]['sample_id']
|
55 |
+
sample_frames_id = metas[data_idx]['sample_frames_id']
|
56 |
+
sample_frame_idx = sample_frames_id.index(sample_id)
|
57 |
+
|
58 |
+
frames = metas[data_idx]['frames']
|
59 |
+
|
60 |
+
frame_name = frames[sample_id]
|
61 |
+
cat_name = metas[data_idx]['category']
|
62 |
+
|
63 |
+
bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
|
64 |
+
|
65 |
+
obj_data = {obj_id: {
|
66 |
+
"category_name" : cat_name,
|
67 |
+
"bbox": bbox
|
68 |
+
}}
|
69 |
+
|
70 |
+
|
71 |
+
annotation_data.append(obj_data)
|
72 |
+
|
73 |
+
frame_names.append(frame_name)
|
74 |
+
|
75 |
+
data_idx += 1
|
76 |
+
|
77 |
+
video_data['annotations'] = annotation_data
|
78 |
+
video_data['frame_names'] = frame_names
|
79 |
+
video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
80 |
+
|
81 |
+
entire_json[video_id] = video_data
|
82 |
+
|
83 |
+
return entire_json
|
84 |
+
|
85 |
+
|
86 |
+
if __name__ == '__main__':
|
87 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
88 |
+
args = parser.parse_args()
|
89 |
+
|
90 |
+
#==================데이터 불러오기===================
|
91 |
+
# 전체 데이터셋
|
92 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
93 |
+
|
94 |
+
# 전체 데이터셋 메타데이터
|
95 |
+
metas = train_dataset.metas
|
96 |
+
|
97 |
+
#==================json 만들기===================
|
98 |
+
entire_json_dict = createJson(train_dataset, metas)
|
99 |
+
print(type(entire_json_dict))
|
100 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
101 |
+
|
102 |
+
with open('mbench/sampled_frame.json', mode='w') as file:
|
103 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250113195227.py
ADDED
@@ -0,0 +1,103 @@
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from datasets import build_dataset
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
|
10 |
+
from pathlib import Path
|
11 |
+
import io
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
import pandas as pd
|
15 |
+
import regex as re
|
16 |
+
import json
|
17 |
+
|
18 |
+
import cv2
|
19 |
+
from PIL import Image, ImageDraw
|
20 |
+
import torch
|
21 |
+
from torchvision.transforms import functional as F
|
22 |
+
|
23 |
+
from skimage import measure # (pip install scikit-image)
|
24 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
25 |
+
|
26 |
+
import matplotlib.pyplot as plt
|
27 |
+
import matplotlib.patches as patches
|
28 |
+
from matplotlib.collections import PatchCollection
|
29 |
+
from matplotlib.patches import Rectangle
|
30 |
+
|
31 |
+
|
32 |
+
import ipywidgets as widgets
|
33 |
+
from IPython.display import display, clear_output
|
34 |
+
|
35 |
+
#==================json 만들기===================
|
36 |
+
def createJson(train_dataset, metas):
|
37 |
+
entire_json = {}
|
38 |
+
|
39 |
+
#초기화
|
40 |
+
data_idx = 0
|
41 |
+
print(len(train_dataset), len(metas))
|
42 |
+
while data_idx < len(train_dataset):
|
43 |
+
|
44 |
+
#하나의 비디오에 대해
|
45 |
+
video_data = {}
|
46 |
+
video_id = metas[data_idx]['video']
|
47 |
+
video_data['bins'] = metas[data_idx]['bins']
|
48 |
+
annotation_data = []
|
49 |
+
frame_names = []
|
50 |
+
|
51 |
+
while metas[data_idx]['video'] == video_id:
|
52 |
+
|
53 |
+
obj_id = metas[data_idx]['obj_id']
|
54 |
+
sample_id = metas[data_idx]['sample_id']
|
55 |
+
sample_frames_id = metas[data_idx]['sample_frames_id']
|
56 |
+
sample_frame_idx = sample_frames_id.index(sample_id)
|
57 |
+
|
58 |
+
frames = metas[data_idx]['frames']
|
59 |
+
|
60 |
+
frame_name = frames[sample_id]
|
61 |
+
cat_name = metas[data_idx]['category']
|
62 |
+
|
63 |
+
bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
|
64 |
+
|
65 |
+
obj_data = {obj_id: {
|
66 |
+
"category_name" : cat_name,
|
67 |
+
"bbox": bbox
|
68 |
+
}}
|
69 |
+
|
70 |
+
|
71 |
+
annotation_data.append(obj_data)
|
72 |
+
|
73 |
+
frame_names.append(frame_name)
|
74 |
+
|
75 |
+
data_idx += 1
|
76 |
+
|
77 |
+
video_data['annotations'] = annotation_data
|
78 |
+
video_data['frame_names'] = frame_names
|
79 |
+
video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
80 |
+
|
81 |
+
entire_json[video_id] = video_data
|
82 |
+
|
83 |
+
return entire_json
|
84 |
+
|
85 |
+
|
86 |
+
if __name__ == '__main__':
|
87 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
88 |
+
args = parser.parse_args()
|
89 |
+
|
90 |
+
#==================데이터 불러오기===================
|
91 |
+
# 전체 데이터셋
|
92 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
93 |
+
|
94 |
+
# 전체 데이터셋 메타데이터
|
95 |
+
metas = train_dataset.metas
|
96 |
+
|
97 |
+
#==================json 만들기===================
|
98 |
+
entire_json_dict = createJson(train_dataset, metas)
|
99 |
+
print(type(entire_json_dict))
|
100 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
101 |
+
|
102 |
+
with open('mbench/sampled_frame.json', mode='w') as file:
|
103 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250116140938.py
ADDED
@@ -0,0 +1,103 @@
1 |
+
import sys
|
2 |
+
from os import path as osp
|
3 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from datasets import build_dataset
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
|
9 |
+
|
10 |
+
from pathlib import Path
|
11 |
+
import io
|
12 |
+
|
13 |
+
import numpy as np
|
14 |
+
import pandas as pd
|
15 |
+
import regex as re
|
16 |
+
import json
|
17 |
+
|
18 |
+
import cv2
|
19 |
+
from PIL import Image, ImageDraw
|
20 |
+
import torch
|
21 |
+
from torchvision.transforms import functional as F
|
22 |
+
|
23 |
+
from skimage import measure # (pip install scikit-image)
|
24 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
25 |
+
|
26 |
+
import matplotlib.pyplot as plt
|
27 |
+
import matplotlib.patches as patches
|
28 |
+
from matplotlib.collections import PatchCollection
|
29 |
+
from matplotlib.patches import Rectangle
|
30 |
+
|
31 |
+
|
32 |
+
import ipywidgets as widgets
|
33 |
+
from IPython.display import display, clear_output
|
34 |
+
|
35 |
+
#==================json 만들기===================
|
36 |
+
def createJson(train_dataset, metas):
|
37 |
+
entire_json = {}
|
38 |
+
|
39 |
+
#초기화
|
40 |
+
vid_idx = 0
|
41 |
+
|
42 |
+
while vid_idx < len(train_dataset):
|
43 |
+
|
44 |
+
#하나의 비디오에 대해
|
45 |
+
video_data = {}
|
46 |
+
video_train_frames, video_train_info = train_dataset[vid_idx]
|
47 |
+
video_meta = metas[vid_idx]
|
48 |
+
|
49 |
+
video_id = video_meta['video']
|
50 |
+
video_data['bins'] = video_meta['bins']
|
51 |
+
bin_nums = len(video_meta['bins'])
|
52 |
+
obj_nums = len(list(video_meta['obj_id_cat'].keys()))
|
53 |
+
|
54 |
+
annotation_data = []
|
55 |
+
frame_names = []
|
56 |
+
|
57 |
+
for i in range(bin_nums):
|
58 |
+
bin_data = {}
|
59 |
+
for j in range(obj_nums):
|
60 |
+
obj_id = str(j+1)
|
61 |
+
obj_data = {
|
62 |
+
"category_name":video_meta['obj_id_cat'][obj_id],
|
63 |
+
"bbox":video_train_info['boxes'][i*obj_nums+j, :]
|
64 |
+
}
|
65 |
+
bin_data[obj_id] = obj_data
|
66 |
+
annotation_data.append(bin_data)
|
67 |
+
|
68 |
+
video_data['annotations'] = annotation_data
|
69 |
+
|
70 |
+
|
71 |
+
sample_indx = metas[vid_idx]['sample_indx']
|
72 |
+
frames = metas[vid_idx]['frames']
|
73 |
+
for i in sample_indx:
|
74 |
+
frame_name = frames[i]
|
75 |
+
frame_names.append(frame_name)
|
76 |
+
|
77 |
+
video_data['frame_names'] = frame_names
|
78 |
+
video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
79 |
+
entire_json[video_id] = video_data
|
80 |
+
|
81 |
+
vid_idx += 1
|
82 |
+
|
83 |
+
return entire_json
|
84 |
+
|
85 |
+
|
86 |
+
if __name__ == '__main__':
|
87 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
88 |
+
args = parser.parse_args()
|
89 |
+
|
90 |
+
#==================데이터 불러오기===================
|
91 |
+
# 전체 데이터셋
|
92 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
93 |
+
|
94 |
+
# 전체 데이터셋 메타데이터
|
95 |
+
metas = train_dataset.metas
|
96 |
+
|
97 |
+
#==================json 만들기===================
|
98 |
+
entire_json_dict = createJson(train_dataset, metas)
|
99 |
+
print(type(entire_json_dict))
|
100 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
101 |
+
|
102 |
+
with open('mbench/sampled_frame.json', mode='w') as file:
|
103 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250116141629.py
ADDED
@@ -0,0 +1,104 @@
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
from os import path as osp
|
4 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
5 |
+
|
6 |
+
from datasets import build_dataset
|
7 |
+
import argparse
|
8 |
+
import opts
|
9 |
+
|
10 |
+
|
11 |
+
from pathlib import Path
|
12 |
+
import io
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import pandas as pd
|
16 |
+
import regex as re
|
17 |
+
import json
|
18 |
+
|
19 |
+
import cv2
|
20 |
+
from PIL import Image, ImageDraw
|
21 |
+
import torch
|
22 |
+
from torchvision.transforms import functional as F
|
23 |
+
|
24 |
+
from skimage import measure # (pip install scikit-image)
|
25 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
26 |
+
|
27 |
+
import matplotlib.pyplot as plt
|
28 |
+
import matplotlib.patches as patches
|
29 |
+
from matplotlib.collections import PatchCollection
|
30 |
+
from matplotlib.patches import Rectangle
|
31 |
+
|
32 |
+
|
33 |
+
import ipywidgets as widgets
|
34 |
+
from IPython.display import display, clear_output
|
35 |
+
|
36 |
+
#==================json 만들기===================
|
37 |
+
def createJson(train_dataset, metas):
|
38 |
+
entire_json = {}
|
39 |
+
|
40 |
+
#초기화
|
41 |
+
vid_idx = 0
|
42 |
+
|
43 |
+
while vid_idx < len(train_dataset):
|
44 |
+
|
45 |
+
#하나의 비디오에 대해
|
46 |
+
video_data = {}
|
47 |
+
video_train_frames, video_train_info = train_dataset[vid_idx]
|
48 |
+
video_meta = metas[vid_idx]
|
49 |
+
|
50 |
+
video_id = video_meta['video']
|
51 |
+
video_data['bins'] = video_meta['bins']
|
52 |
+
bin_nums = len(video_meta['bins'])
|
53 |
+
obj_nums = len(list(video_meta['obj_id_cat'].keys()))
|
54 |
+
|
55 |
+
annotation_data = []
|
56 |
+
frame_names = []
|
57 |
+
|
58 |
+
for i in range(bin_nums):
|
59 |
+
bin_data = {}
|
60 |
+
for j in range(obj_nums):
|
61 |
+
obj_id = str(j+1)
|
62 |
+
obj_data = {
|
63 |
+
"category_name":video_meta['obj_id_cat'][obj_id],
|
64 |
+
"bbox":video_train_info['boxes'][i*obj_nums+j, :]
|
65 |
+
}
|
66 |
+
bin_data[obj_id] = obj_data
|
67 |
+
annotation_data.append(bin_data)
|
68 |
+
|
69 |
+
video_data['annotations'] = annotation_data
|
70 |
+
|
71 |
+
|
72 |
+
sample_indx = metas[vid_idx]['sample_indx']
|
73 |
+
frames = metas[vid_idx]['frames']
|
74 |
+
for i in sample_indx:
|
75 |
+
frame_name = frames[i]
|
76 |
+
frame_names.append(frame_name)
|
77 |
+
|
78 |
+
video_data['frame_names'] = frame_names
|
79 |
+
video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
80 |
+
entire_json[video_id] = video_data
|
81 |
+
|
82 |
+
vid_idx += 1
|
83 |
+
|
84 |
+
return entire_json
|
85 |
+
|
86 |
+
|
87 |
+
if __name__ == '__main__':
|
88 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
89 |
+
args = parser.parse_args()
|
90 |
+
|
91 |
+
#==================데이터 불러오기===================
|
92 |
+
# 전체 데이터셋
|
93 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
94 |
+
|
95 |
+
# 전체 데이터셋 메타데이터
|
96 |
+
metas = train_dataset.metas
|
97 |
+
|
98 |
+
#==================json 만들기===================
|
99 |
+
entire_json_dict = createJson(train_dataset, metas)
|
100 |
+
print(type(entire_json_dict))
|
101 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
102 |
+
|
103 |
+
with open('mbench/sampled_frame.json', mode='w') as file:
|
104 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250117072647.py
ADDED
@@ -0,0 +1,107 @@
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
from os import path as osp
|
4 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
5 |
+
|
6 |
+
from datasets import build_dataset
|
7 |
+
import argparse
|
8 |
+
import opts
|
9 |
+
|
10 |
+
|
11 |
+
from pathlib import Path
|
12 |
+
import io
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import pandas as pd
|
16 |
+
import regex as re
|
17 |
+
import json
|
18 |
+
|
19 |
+
import cv2
|
20 |
+
from PIL import Image, ImageDraw
|
21 |
+
import torch
|
22 |
+
from torchvision.transforms import functional as F
|
23 |
+
|
24 |
+
from skimage import measure # (pip install scikit-image)
|
25 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
26 |
+
|
27 |
+
import matplotlib.pyplot as plt
|
28 |
+
import matplotlib.patches as patches
|
29 |
+
from matplotlib.collections import PatchCollection
|
30 |
+
from matplotlib.patches import Rectangle
|
31 |
+
|
32 |
+
|
33 |
+
import ipywidgets as widgets
|
34 |
+
from IPython.display import display, clear_output
|
35 |
+
|
36 |
+
#==================json 만들기===================
|
37 |
+
def createJson(train_dataset, metas):
|
38 |
+
entire_json = {}
|
39 |
+
|
40 |
+
#초기화
|
41 |
+
vid_idx = 0
|
42 |
+
|
43 |
+
while vid_idx < len(train_dataset):
|
44 |
+
|
45 |
+
#하나의 비디오에 대해
|
46 |
+
video_data = {}
|
47 |
+
video_train_frames, video_train_info = train_dataset[vid_idx]
|
48 |
+
video_meta = metas[vid_idx]
|
49 |
+
|
50 |
+
video_id = video_meta['video']
|
51 |
+
video_data['bins'] = video_meta['bins']
|
52 |
+
bin_nums = len(video_meta['bins'])
|
53 |
+
obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
|
54 |
+
|
55 |
+
annotation_data = []
|
56 |
+
frame_names = []
|
57 |
+
|
58 |
+
for i in range(bin_nums):
|
59 |
+
bin_data = {}
|
60 |
+
for j in range(obj_nums):
|
61 |
+
obj_id = str(j+1)
|
62 |
+
try:
|
63 |
+
obj_data = {
|
64 |
+
"category_name":video_meta['obj_id_cat'][obj_id],
|
65 |
+
"bbox":video_train_info['boxes'][i*obj_nums+j, :]
|
66 |
+
}
|
67 |
+
except:
|
68 |
+
obj_data = {}
|
69 |
+
bin_data[obj_id] = obj_data
|
70 |
+
annotation_data.append(bin_data)
|
71 |
+
|
72 |
+
video_data['annotations'] = annotation_data
|
73 |
+
|
74 |
+
|
75 |
+
sample_indx = metas[vid_idx]['sample_indx']
|
76 |
+
frames = metas[vid_idx]['frames']
|
77 |
+
for i in sample_indx:
|
78 |
+
frame_name = frames[i]
|
79 |
+
frame_names.append(frame_name)
|
80 |
+
|
81 |
+
video_data['frame_names'] = frame_names
|
82 |
+
video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
83 |
+
entire_json[video_id] = video_data
|
84 |
+
|
85 |
+
vid_idx += 1
|
86 |
+
|
87 |
+
return entire_json
|
88 |
+
|
89 |
+
|
90 |
+
if __name__ == '__main__':
|
91 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
92 |
+
args = parser.parse_args()
|
93 |
+
|
94 |
+
#==================데이터 불러오기===================
|
95 |
+
# 전체 데이터셋
|
96 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
97 |
+
|
98 |
+
# 전체 데이터셋 메타데이터
|
99 |
+
metas = train_dataset.metas
|
100 |
+
|
101 |
+
#==================json 만들기===================
|
102 |
+
entire_json_dict = createJson(train_dataset, metas)
|
103 |
+
print(type(entire_json_dict))
|
104 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
105 |
+
|
106 |
+
with open('mbench/sampled_frame2.json', mode='w') as file:
|
107 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250117074149.py
ADDED
@@ -0,0 +1,107 @@
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
from os import path as osp
|
4 |
+
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
|
5 |
+
|
6 |
+
from datasets import build_dataset
|
7 |
+
import argparse
|
8 |
+
import opts
|
9 |
+
|
10 |
+
|
11 |
+
from pathlib import Path
|
12 |
+
import io
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import pandas as pd
|
16 |
+
import regex as re
|
17 |
+
import json
|
18 |
+
|
19 |
+
import cv2
|
20 |
+
from PIL import Image, ImageDraw
|
21 |
+
import torch
|
22 |
+
from torchvision.transforms import functional as F
|
23 |
+
|
24 |
+
from skimage import measure # (pip install scikit-image)
|
25 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
26 |
+
|
27 |
+
import matplotlib.pyplot as plt
|
28 |
+
import matplotlib.patches as patches
|
29 |
+
from matplotlib.collections import PatchCollection
|
30 |
+
from matplotlib.patches import Rectangle
|
31 |
+
|
32 |
+
|
33 |
+
import ipywidgets as widgets
|
34 |
+
from IPython.display import display, clear_output
|
35 |
+
|
36 |
+
#==================json 만들기===================
|
37 |
+
def createJson(train_dataset, metas):
|
38 |
+
entire_json = {}
|
39 |
+
|
40 |
+
#초기화
|
41 |
+
vid_idx = 0
|
42 |
+
|
43 |
+
while vid_idx < len(train_dataset):
|
44 |
+
|
45 |
+
#하나의 비디오에 대해
|
46 |
+
video_data = {}
|
47 |
+
video_train_frames, video_train_info = train_dataset[vid_idx]
|
48 |
+
video_meta = metas[vid_idx]
|
49 |
+
|
50 |
+
video_id = video_meta['video']
|
51 |
+
video_data['bins'] = video_meta['bins']
|
52 |
+
bin_nums = len(video_meta['bins'])
|
53 |
+
obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
|
54 |
+
|
55 |
+
annotation_data = []
|
56 |
+
frame_names = []
|
57 |
+
|
58 |
+
for i in range(bin_nums):
|
59 |
+
bin_data = {}
|
60 |
+
for j in range(obj_nums):
|
61 |
+
obj_id = str(j+1)
|
62 |
+
try:
|
63 |
+
obj_data = {
|
64 |
+
"category_name":video_meta['obj_id_cat'][obj_id],
|
65 |
+
"bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist()
|
66 |
+
}
|
67 |
+
except:
|
68 |
+
obj_data = {}
|
69 |
+
bin_data[obj_id] = obj_data
|
70 |
+
annotation_data.append(bin_data)
|
71 |
+
|
72 |
+
video_data['annotations'] = annotation_data
|
73 |
+
|
74 |
+
|
75 |
+
sample_indx = metas[vid_idx]['sample_indx']
|
76 |
+
frames = metas[vid_idx]['frames']
|
77 |
+
for i in sample_indx:
|
78 |
+
frame_name = frames[i]
|
79 |
+
frame_names.append(frame_name)
|
80 |
+
|
81 |
+
video_data['frame_names'] = frame_names
|
82 |
+
video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
|
83 |
+
entire_json[video_id] = video_data
|
84 |
+
|
85 |
+
vid_idx += 1
|
86 |
+
|
87 |
+
return entire_json
|
88 |
+
|
89 |
+
|
90 |
+
if __name__ == '__main__':
|
91 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
92 |
+
args = parser.parse_args()
|
93 |
+
|
94 |
+
#==================데이터 불러오기===================
|
95 |
+
# 전체 데이터셋
|
96 |
+
train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
|
97 |
+
|
98 |
+
# 전체 데이터셋 메타데이터
|
99 |
+
metas = train_dataset.metas
|
100 |
+
|
101 |
+
#==================json 만들기===================
|
102 |
+
entire_json_dict = createJson(train_dataset, metas)
|
103 |
+
print(type(entire_json_dict))
|
104 |
+
entire_json = json.dumps(entire_json_dict, indent=4)
|
105 |
+
|
106 |
+
with open('mbench/sampled_frame2.json', mode='w') as file:
|
107 |
+
file.write(entire_json)
|
.history/mbench/make_ref-ytvos_json_20250118024354.py
ADDED
@@ -0,0 +1,108 @@
import sys
import os
from os import path as osp
sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts


from pathlib import Path
import io

import numpy as np
import pandas as pd
import regex as re
import json

import cv2
from PIL import Image, ImageDraw
import torch
from torchvision.transforms import functional as F

from skimage import measure                         # (pip install scikit-image)
from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle


import ipywidgets as widgets
from IPython.display import display, clear_output

#================== Build JSON ===================
def createJson(train_dataset, metas):
    entire_json = {}

    # Initialization
    vid_idx = 0

    while vid_idx < len(train_dataset):

        # For one video
        video_data = {}
        video_train_frames, video_train_info = train_dataset[vid_idx]
        video_meta = metas[vid_idx]

        video_id = video_meta['video']
        video_data['bins'] = video_meta['bins']
        bin_nums = len(video_meta['bins'])
        obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])

        annotation_data = []
        frame_names = []

        for i in range(bin_nums):
            bin_data = {}
            for j in range(obj_nums):
                obj_id = str(j+1)
                try:
                    obj_data = {
                        "category_name": video_meta['obj_id_cat'][obj_id],
                        "bbox": video_train_info['boxes'][i*obj_nums+j, :].tolist(),
                        "valid": video_train_info['valid'][i*obj_nums+j].item()
                    }
                except:
                    obj_data = {}
                bin_data[obj_id] = obj_data
            annotation_data.append(bin_data)

        video_data['annotations'] = annotation_data


        sample_indx = metas[vid_idx]['sample_indx']
        frames = metas[vid_idx]['frames']
        for i in sample_indx:
            frame_name = frames[i]
            frame_names.append(frame_name)

        video_data['frame_names'] = frame_names
        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
        entire_json[video_id] = video_data

        vid_idx += 1

    return entire_json


if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    args = parser.parse_args()

    #================== Load data ===================
    # Full dataset
    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)

    # Full dataset metadata
    metas = train_dataset.metas

    #================== Build JSON ===================
    entire_json_dict = createJson(train_dataset, metas)
    print(type(entire_json_dict))
    entire_json = json.dumps(entire_json_dict, indent=4)

    with open('mbench/sampled_frame3.json', mode='w') as file:
        file.write(entire_json)
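The `i*obj_nums + j` indexing above assumes that the `boxes` tensor returned by the dataset is laid out bin-major: all objects of the first sampled bin, then all objects of the next bin, and so on. A small sketch of that mapping, with made-up counts:

```python
# Sketch of the row layout createJson assumes for video_train_info['boxes'].
bin_nums, obj_nums = 4, 2          # illustrative counts only
for i in range(bin_nums):          # bin index
    for j in range(obj_nums):      # object index (obj_id = str(j + 1))
        row = i * obj_nums + j     # row into the [bin_nums * obj_nums, 4] box tensor
        print(f"bin {i}, obj {j + 1} -> boxes[{row}]")
```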
.history/mbench/ytvos_ref_20250121140600.py
ADDED
@@ -0,0 +1,265 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.utils.data import Dataset
import transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

# from datasets.categories import ytvos_category_dict as category_dict


category_dict = {
    'airplane': 0, 'ape': 1, 'bear': 2, 'bike': 3, 'bird': 4, 'boat': 5, 'bucket': 6, 'bus': 7, 'camel': 8, 'cat': 9,
    'cow': 10, 'crocodile': 11, 'deer': 12, 'dog': 13, 'dolphin': 14, 'duck': 15, 'eagle': 16, 'earless_seal': 17,
    'elephant': 18, 'fish': 19, 'fox': 20, 'frisbee': 21, 'frog': 22, 'giant_panda': 23, 'giraffe': 24, 'hand': 25,
    'hat': 26, 'hedgehog': 27, 'horse': 28, 'knife': 29, 'leopard': 30, 'lion': 31, 'lizard': 32, 'monkey': 33,
    'motorbike': 34, 'mouse': 35, 'others': 36, 'owl': 37, 'paddle': 38, 'parachute': 39, 'parrot': 40, 'penguin': 41,
    'person': 42, 'plant': 43, 'rabbit': 44, 'raccoon': 45, 'sedan': 46, 'shark': 47, 'sheep': 48, 'sign': 49,
    'skateboard': 50, 'snail': 51, 'snake': 52, 'snowboard': 53, 'squirrel': 54, 'surfboard': 55, 'tennis_racket': 56,
    'tiger': 57, 'toilet': 58, 'train': 59, 'truck': 60, 'turtle': 61, 'umbrella': 62, 'whale': 63, 'zebra': 64
}


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.
    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        skip_vid_count = 0

        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            if vid_len < 11:
                # print(f"Too short video: {vid} with frame length {vid_len}")
                skip_vid_count += 1
                continue

            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
            start_idx, end_idx = 2, vid_len - 2
            bin_size = (end_idx - start_idx) // 4

            bins = []
            for i in range(4):
                bin_start = start_idx + i * bin_size
                bin_end = bin_start + bin_size if i < 3 else end_idx

                bins.append((bin_start, bin_end))

            # Random sample one frame from each bin
            sample_indx = []
            for start_idx, end_idx in bins:
                sample_indx.append(random.randint(start_idx, end_idx - 1))
            sample_indx.sort()  # Ensure indices are in order

            meta = {
                'video': vid,
                'sample_indx': sample_indx,
                'bins': bins,
                'frames': vid_frames
            }
            obj_id_cat = {}
            for exp_id, exp_dict in vid_data['expressions'].items():
                obj_id = exp_dict['obj_id']
                if obj_id not in obj_id_cat:
                    obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
            meta['obj_id_cat'] = obj_id_cat
            self.metas.append(meta)

        print(f"skipped {skip_vid_count} short videos")


    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        meta = self.metas[idx]  # dict

        video, sample_indx, bins, frames, obj_id_cat = \
            meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']

        # read frames and masks
        annos = {}
        imgs, labels, boxes, masks, valid = [], [], [], [], []
        for frame_indx in sample_indx:
            frame_name = frames[frame_indx]
            img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
            mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
            img = Image.open(img_path).convert('RGB')
            imgs.append(img)

            mask = Image.open(mask_path).convert('P')
            mask = np.array(mask)

            frame_annotations = {}

            # create the target
            for obj_id in list(obj_id_cat.keys()):
                obj_mask = (mask == int(obj_id)).astype(np.float32)  # 0,1 binary
                if (obj_mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(obj_mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                    val = 1
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                    val = 0
                obj_mask = torch.from_numpy(obj_mask)

                # append
                masks.append(obj_mask)
                boxes.append(box)

                frame_annotations[obj_id] = {
                    'category_name': obj_id_cat[obj_id],
                    'bbox': box,
                    'valid': val,
                    'mask': obj_mask
                }

            annos[frame_indx] = frame_annotations


        # transform
        w, h = img.size
        boxes = torch.stack(boxes, dim=0)
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        masks = torch.stack(masks, dim=0)
        target = {
            'frames_idx': sample_indx,              # [T,]
            'boxes': boxes,                         # [T, 4], xyxy
            'masks': masks,                         # [T, H, W]
            'valid': torch.tensor(valid),           # [T,]
            'obj_ids': list(obj_id_cat.keys()),
            'orig_size': torch.as_tensor([int(h), int(w)]),
            'size': torch.as_tensor([int(h), int(w)])
        }

        # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
        # if self._transforms:
        #     imgs, target = self._transforms(imgs, target)
        #     imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
        # else:
        imgs = np.array(imgs)
        imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))


        # # FIXME: handle "valid", since some box may be removed due to random crop
        # if torch.any(target['valid'] == 1):  # at least one instance
        #     instance_check = True
        # else:
        #     idx = random.randint(0, self.__len__() - 1)

        return imgs, target, annos


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
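A minimal usage sketch for this loader, assuming a placeholder dataset root and an `argparse.Namespace` standing in for the repo's `opts`-based arguments:

```python
from argparse import Namespace

# Placeholder arguments; only the fields read by build() and YTVOSDataset are set.
args = Namespace(ytvos_path="/path/to/ref-youtube-vos",  # placeholder path
                 masks=True, num_frames=4, max_skip=3)

dataset = build("train", args)
imgs, target, annos = dataset[0]
print(imgs.shape)             # [T, 3, H, W], raw frames since transforms=None
print(target["boxes"].shape)  # [T * num_objects, 4] boxes in xyxy
print(list(annos.keys()))     # sampled frame indices, each mapping to per-object dicts
```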
.history/mbench_a2d/gpt_a2d_numbered_20250205111521.py
ADDED
File without changes
.history/mbench_a2d/gpt_a2d_numbered_20250205151640.py
ADDED
@@ -0,0 +1,197 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts
import time

import numpy as np
import matplotlib.pyplot as plt
import cv2
from io import BytesIO
import base64
from PIL import Image
import json

from openai import OpenAI

def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
    # optionally tint the masked region
    if color_mask:
        alpha = 0.1

        colored_mask = np.zeros_like(frame)
        colored_mask[mask == 1] = [255, 0, 0]
        frame[mask == 1] = (
            (1 - alpha) * frame[mask == 1] +
            alpha * colored_mask[mask == 1]
        )

    # draw the mask outline
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)

    # optionally write the instance_id label
    if label_number:
        if len(contours) > 0:
            largest_contour = max(contours, key=cv2.contourArea)
            M = cv2.moments(largest_contour)
            if M["m00"] != 0:
                center_x = int(M["m10"] / M["m00"])
                center_y = int(M["m01"] / M["m00"])
            else:
                center_x, center_y = 0, 0

            font = cv2.FONT_HERSHEY_SIMPLEX
            text = str(instance_id)
            font_scale = 0.6
            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
            text_x = center_x - text_size[0] // 1  # horizontal center of the text
            text_y = center_y
            # text_y = center_y + text_size[1] // 2  # vertical center of the text

            # coordinates of the background rectangle behind the text
            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
            rect_end = (text_x + text_size[0] + 5, text_y)

            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

    # plt.figure(figsize=(6, 10))
    # plt.imshow(frame)
    # plt.title(text_query)
    # plt.tight_layout()
    # plt.axis('off')
    # plt.show()

    buffer = BytesIO()
    frame = Image.fromarray(frame)
    frame.save(buffer, format='jpeg')
    buffer.seek(0)
    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded_frame

def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):

    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)

    captioner = OpenAI()

    # build the referring expression directly, without any filtering step
    dense_caption_prompt = f"""
    You are a visual assistant analyzing a single frame of a video.
    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
    I also give you a text query describing the marked object.
    I want to use your expression to create an **action-centric referring expression** dataset.
    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions.
    ---
    ## Guidelines:
    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
    3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
    7. Base your description on these action definitions:
       - Avoid using the terms 'minimal' or 'slightly'.
       - General body movement, body position, or pattern which is prominent (e.g., "lifting head up", "facing towards", "showing its back").
       - Details such as motion and intention, facial expression with object manipulation.
       - Movements with objects or other entities when they are prominent and observable; the expression should be specific
         (e.g., "pushing another person" (O), "engaging with someone" (X), "interacting with another person" (X)).
    --
    ## Output Format:
    - For each labeled object, output **exactly one line**. Your answer should contain details and follow this format:
      object id. action-oriented description
      (e.g., 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
    ### Example
    If the frame has 1 labeled bear, your output should look like:
    1. the bear reaching his right arm while leaning forward to capture the prey
    ---
    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
    **Do not include markdown** in the output.
    Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
    For each labeled object, output referring expressions for each object id.
    """
    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"

    MAX_RETRIES = 2
    retry_count = 0

    while retry_count < MAX_RETRIES:
        response = captioner.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_with_text_query,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        caption = response.choices[0].message.content.strip()
        caption_lower = caption.lower().lstrip()
        if caption_lower.startswith("1.") and not any(
            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
        ):
            break
        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
        retry_count += 1
        time.sleep(2)

        if retry_count == MAX_RETRIES:
            caption = None
            print("Max retries reached. Caption generation failed.")

    else:
        caption = None

    return caption

if __name__ == "__main__":
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
    args = parser.parse_args()

    train_dataset = build_dataset('a2d', image_set='train', args=args)
    text_annotations = train_dataset.text_annotations

    all_captions = {}

    os.environ['OPENAI_API_KEY'] = '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'

    for idx in range(100):
        imgs, target = train_dataset[idx]
        frames_idx = target['frames_idx'].tolist()
        text_query, vid_id, frame_id, instance_id = text_annotations[idx]

        frame_id = frame_id - 1
        frame_order = frames_idx.index(frame_id)

        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
        mask = target['masks'].numpy().astype(np.uint8).squeeze()

        caption = getCaption(frame, mask, instance_id, text_query)
        if vid_id not in all_captions:
            all_captions[vid_id] = {frame_id: caption}
        else:
            all_captions[vid_id][frame_id] = caption

    with open(args.save_caption_path, 'w') as file:
        json.dump(all_captions, file, indent=4)
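A quick way to sanity-check the overlay produced by mark_object_and_encode() is to decode the returned base64 string back into an image. The sketch below is illustrative only; the dummy frame, dummy mask, and output filename are placeholders, not part of the repository.

# Illustrative check, not part of the repo: round-trip the encoded frame to disk.
import base64
from io import BytesIO
from PIL import Image
import numpy as np

dummy_frame = np.zeros((64, 64, 3), dtype=np.uint8)   # placeholder RGB frame
dummy_mask = np.zeros((64, 64), dtype=np.uint8)
dummy_mask[16:48, 16:48] = 1                          # placeholder square mask

encoded = mark_object_and_encode(dummy_frame, dummy_mask, instance_id=1,
                                 text_query="a person walking", label_number=True)
Image.open(BytesIO(base64.b64decode(encoded))).save("overlay_check.jpg")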
.history/mbench_a2d/gpt_a2d_numbered_20250205151759.py
ADDED
@@ -0,0 +1,199 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts
import time

import numpy as np
import matplotlib.pyplot as plt
import cv2
from io import BytesIO
import base64
from PIL import Image
import json

from openai import OpenAI

def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
    # optionally tint the masked region
    if color_mask:
        alpha = 0.1

        colored_mask = np.zeros_like(frame)
        colored_mask[mask == 1] = [255, 0, 0]
        frame[mask == 1] = (
            (1 - alpha) * frame[mask == 1] +
            alpha * colored_mask[mask == 1]
        )

    # draw the mask outline
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)

    # optionally write the instance_id label
    if label_number:
        if len(contours) > 0:
            largest_contour = max(contours, key=cv2.contourArea)
            M = cv2.moments(largest_contour)
            if M["m00"] != 0:
                center_x = int(M["m10"] / M["m00"])
                center_y = int(M["m01"] / M["m00"])
            else:
                center_x, center_y = 0, 0

            font = cv2.FONT_HERSHEY_SIMPLEX
            text = str(instance_id)
            font_scale = 0.6
            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
            text_x = center_x - text_size[0] // 1  # horizontal center of the text
            text_y = center_y
            # text_y = center_y + text_size[1] // 2  # vertical center of the text

            # coordinates of the background rectangle behind the text
            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
            rect_end = (text_x + text_size[0] + 5, text_y)

            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

    # plt.figure(figsize=(6, 10))
    # plt.imshow(frame)
    # plt.title(text_query)
    # plt.tight_layout()
    # plt.axis('off')
    # plt.show()

    buffer = BytesIO()
    frame = Image.fromarray(frame)
    frame.save(buffer, format='jpeg')
    buffer.seek(0)
    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded_frame

def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):

    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)

    captioner = OpenAI()

    # build the referring expression directly, without any filtering step
    dense_caption_prompt = f"""
    You are a visual assistant analyzing a single frame of a video.
    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
    I also give you a text query describing the marked object.
    I want to use your expression to create an **action-centric referring expression** dataset.
    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions.
    ---
    ## Guidelines:
    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
    3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
    7. Base your description on these action definitions:
       - Avoid using the terms 'minimal' or 'slightly'.
       - General body movement, body position, or pattern which is prominent (e.g., "lifting head up", "facing towards", "showing its back").
       - Details such as motion and intention, facial expression with object manipulation.
       - Movements with objects or other entities when they are prominent and observable; the expression should be specific
         (e.g., "pushing another person" (O), "engaging with someone" (X), "interacting with another person" (X)).
    --
    ## Output Format:
    - For each labeled object, output **exactly one line**. Your answer should contain details and follow this format:
      object id. action-oriented description
      (e.g., 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
    ### Example
    If the frame has 1 labeled bear, your output should look like:
    1. the bear reaching his right arm while leaning forward to capture the prey
    ---
    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
    **Do not include markdown** in the output.
    Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
    For each labeled object, output referring expressions for each object id.
    """
    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"

    MAX_RETRIES = 2
    retry_count = 0

    while retry_count < MAX_RETRIES:
        response = captioner.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_with_text_query,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        caption = response.choices[0].message.content.strip()
        caption_lower = caption.lower().lstrip()
        if caption_lower.startswith("1.") and not any(
            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
        ):
            break
        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
        retry_count += 1
        time.sleep(2)

        if retry_count == MAX_RETRIES:
            caption = None
            print("Max retries reached. Caption generation failed.")

    else:
        caption = None

    return caption

if __name__ == "__main__":
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
    args = parser.parse_args()

    train_dataset = build_dataset('a2d', image_set='train', args=args)
    text_annotations = train_dataset.text_annotations

    all_captions = {}

    os.environ['OPENAI_API_KEY'] = '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'

    for idx in range(100):
        imgs, target = train_dataset[idx]
        frames_idx = target['frames_idx'].tolist()
        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
        print(f"vid id: {vid_id}", flush=True)

        frame_id = frame_id - 1
        frame_order = frames_idx.index(frame_id)

        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
        mask = target['masks'].numpy().astype(np.uint8).squeeze()

        caption = getCaption(frame, mask, instance_id, text_query)
        if vid_id not in all_captions:
            all_captions[vid_id] = {frame_id: caption}
        else:
            all_captions[vid_id][frame_id] = caption

    print("Finished!", flush=True)

    with open(args.save_caption_path, 'w') as file:
        json.dump(all_captions, file, indent=4)
.history/mbench_a2d/gpt_a2d_numbered_20250205151827.py
ADDED
@@ -0,0 +1,199 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts
import time

import numpy as np
import matplotlib.pyplot as plt
import cv2
from io import BytesIO
import base64
from PIL import Image
import json

from openai import OpenAI

def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
    # optionally tint the masked region
    if color_mask:
        alpha = 0.1

        colored_mask = np.zeros_like(frame)
        colored_mask[mask == 1] = [255, 0, 0]
        frame[mask == 1] = (
            (1 - alpha) * frame[mask == 1] +
            alpha * colored_mask[mask == 1]
        )

    # draw the mask outline
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)

    # optionally write the instance_id label
    if label_number:
        if len(contours) > 0:
            largest_contour = max(contours, key=cv2.contourArea)
            M = cv2.moments(largest_contour)
            if M["m00"] != 0:
                center_x = int(M["m10"] / M["m00"])
                center_y = int(M["m01"] / M["m00"])
            else:
                center_x, center_y = 0, 0

            font = cv2.FONT_HERSHEY_SIMPLEX
            text = str(instance_id)
            font_scale = 0.6
            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
            text_x = center_x - text_size[0] // 1  # horizontal center of the text
            text_y = center_y
            # text_y = center_y + text_size[1] // 2  # vertical center of the text

            # coordinates of the background rectangle behind the text
            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
            rect_end = (text_x + text_size[0] + 5, text_y)

            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

    # plt.figure(figsize=(6, 10))
    # plt.imshow(frame)
    # plt.title(text_query)
    # plt.tight_layout()
    # plt.axis('off')
    # plt.show()

    buffer = BytesIO()
    frame = Image.fromarray(frame)
    frame.save(buffer, format='jpeg')
    buffer.seek(0)
    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded_frame

def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):

    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)

    captioner = OpenAI()

    # build the referring expression directly, without any filtering step
    dense_caption_prompt = f"""
    You are a visual assistant analyzing a single frame of a video.
    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
    I also give you a text query describing the marked object.
    I want to use your expression to create an **action-centric referring expression** dataset.
    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions.
    ---
    ## Guidelines:
    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
    3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
    7. Base your description on these action definitions:
       - Avoid using the terms 'minimal' or 'slightly'.
       - General body movement, body position, or pattern which is prominent (e.g., "lifting head up", "facing towards", "showing its back").
       - Details such as motion and intention, facial expression with object manipulation.
       - Movements with objects or other entities when they are prominent and observable; the expression should be specific
         (e.g., "pushing another person" (O), "engaging with someone" (X), "interacting with another person" (X)).
    --
    ## Output Format:
    - For each labeled object, output **exactly one line**. Your answer should contain details and follow this format:
      object id. action-oriented description
      (e.g., 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
    ### Example
    If the frame has 1 labeled bear, your output should look like:
    1. the bear reaching his right arm while leaning forward to capture the prey
    ---
    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
    **Do not include markdown** in the output.
    Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
    For each labeled object, output referring expressions for each object id.
    """
    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"

    MAX_RETRIES = 2
    retry_count = 0

    while retry_count < MAX_RETRIES:
        response = captioner.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_with_text_query,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        caption = response.choices[0].message.content.strip()
        caption_lower = caption.lower().lstrip()
        if caption_lower.startswith("1.") and not any(
            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
        ):
            break
        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
        retry_count += 1
        time.sleep(2)

        if retry_count == MAX_RETRIES:
            caption = None
            print("Max retries reached. Caption generation failed.")

    else:
        caption = None

    return caption

if __name__ == "__main__":
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
    args = parser.parse_args()

    train_dataset = build_dataset('a2d', image_set='train', args=args)
    text_annotations = train_dataset.text_annotations

    all_captions = {}

    os.environ['OPENAI_API_KEY'] = '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'

    for idx in range(100):
        imgs, target = train_dataset[idx]
        frames_idx = target['frames_idx'].tolist()
        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
        print(f"vid id: {vid_id}, frame id: {frame_id}", flush=True)

        frame_id = frame_id - 1
        frame_order = frames_idx.index(frame_id)

        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
        mask = target['masks'].numpy().astype(np.uint8).squeeze()

        caption = getCaption(frame, mask, instance_id, text_query)
        if vid_id not in all_captions:
            all_captions[vid_id] = {frame_id: caption}
        else:
            all_captions[vid_id][frame_id] = caption

    print("Finished!", flush=True)

    with open(args.save_caption_path, 'w') as file:
        json.dump(all_captions, file, indent=4)
.history/mbench_a2d/gpt_a2d_numbered_20250205151833.py
ADDED
@@ -0,0 +1,199 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts
import time

import numpy as np
import matplotlib.pyplot as plt
import cv2
from io import BytesIO
import base64
from PIL import Image
import json

from openai import OpenAI

def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
    # optionally tint the masked region
    if color_mask:
        alpha = 0.1

        colored_mask = np.zeros_like(frame)
        colored_mask[mask == 1] = [255, 0, 0]
        frame[mask == 1] = (
            (1 - alpha) * frame[mask == 1] +
            alpha * colored_mask[mask == 1]
        )

    # draw the mask outline
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)

    # optionally write the instance_id label
    if label_number:
        if len(contours) > 0:
            largest_contour = max(contours, key=cv2.contourArea)
            M = cv2.moments(largest_contour)
            if M["m00"] != 0:
                center_x = int(M["m10"] / M["m00"])
                center_y = int(M["m01"] / M["m00"])
            else:
                center_x, center_y = 0, 0

            font = cv2.FONT_HERSHEY_SIMPLEX
            text = str(instance_id)
            font_scale = 0.6
            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
            text_x = center_x - text_size[0] // 1  # horizontal center of the text
            text_y = center_y
            # text_y = center_y + text_size[1] // 2  # vertical center of the text

            # coordinates of the background rectangle behind the text
            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
            rect_end = (text_x + text_size[0] + 5, text_y)

            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

    # plt.figure(figsize=(6, 10))
    # plt.imshow(frame)
    # plt.title(text_query)
    # plt.tight_layout()
    # plt.axis('off')
    # plt.show()

    buffer = BytesIO()
    frame = Image.fromarray(frame)
    frame.save(buffer, format='jpeg')
    buffer.seek(0)
    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded_frame

def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):

    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)

    captioner = OpenAI()

    # build the referring expression directly, without any filtering step
    dense_caption_prompt = f"""
    You are a visual assistant analyzing a single frame of a video.
    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
    I also give you a text query describing the marked object.
    I want to use your expression to create an **action-centric referring expression** dataset.
    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions.
    ---
    ## Guidelines:
    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
    3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
    7. Base your description on these action definitions:
       - Avoid using the terms 'minimal' or 'slightly'.
       - General body movement, body position, or pattern which is prominent (e.g., "lifting head up", "facing towards", "showing its back").
       - Details such as motion and intention, facial expression with object manipulation.
       - Movements with objects or other entities when they are prominent and observable; the expression should be specific
         (e.g., "pushing another person" (O), "engaging with someone" (X), "interacting with another person" (X)).
    --
    ## Output Format:
    - For each labeled object, output **exactly one line**. Your answer should contain details and follow this format:
      object id. action-oriented description
      (e.g., 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
    ### Example
    If the frame has 1 labeled bear, your output should look like:
    1. the bear reaching his right arm while leaning forward to capture the prey
    ---
    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
    **Do not include markdown** in the output.
    Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
    For each labeled object, output referring expressions for each object id.
    """
    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"

    MAX_RETRIES = 2
    retry_count = 0

    while retry_count < MAX_RETRIES:
        response = captioner.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_with_text_query,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        caption = response.choices[0].message.content.strip()
        caption_lower = caption.lower().lstrip()
        if caption_lower.startswith("1.") and not any(
            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
        ):
            break
        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
        retry_count += 1
        time.sleep(2)

        if retry_count == MAX_RETRIES:
            caption = None
            print("Max retries reached. Caption generation failed.")

    else:
        caption = None

    return caption

if __name__ == "__main__":
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
    args = parser.parse_args()

    train_dataset = build_dataset('a2d', image_set='train', args=args)
    text_annotations = train_dataset.text_annotations

    all_captions = {}

    os.environ['OPENAI_API_KEY'] = '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'

    for idx in range(100):
        imgs, target = train_dataset[idx]
        frames_idx = target['frames_idx'].tolist()
        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
        print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)

        frame_id = frame_id - 1
        frame_order = frames_idx.index(frame_id)

        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
        mask = target['masks'].numpy().astype(np.uint8).squeeze()

        caption = getCaption(frame, mask, instance_id, text_query)
        if vid_id not in all_captions:
            all_captions[vid_id] = {frame_id: caption}
        else:
            all_captions[vid_id][frame_id] = caption

    print("Finished!", flush=True)

    with open(args.save_caption_path, 'w') as file:
        json.dump(all_captions, file, indent=4)
.history/mbench_a2d/gpt_a2d_numbered_20250205152714.py
ADDED
@@ -0,0 +1,200 @@
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from datasets import build_dataset
import argparse
import opts
import time

import numpy as np
import matplotlib.pyplot as plt
import cv2
from io import BytesIO
import base64
from PIL import Image
import json

from openai import OpenAI

def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
    # optionally tint the masked region
    if color_mask:
        alpha = 0.1

        colored_mask = np.zeros_like(frame)
        colored_mask[mask == 1] = [255, 0, 0]
        frame[mask == 1] = (
            (1 - alpha) * frame[mask == 1] +
            alpha * colored_mask[mask == 1]
        )

    # draw the mask outline
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)

    # optionally write the instance_id label
    if label_number:
        if len(contours) > 0:
            largest_contour = max(contours, key=cv2.contourArea)
            M = cv2.moments(largest_contour)
            if M["m00"] != 0:
                center_x = int(M["m10"] / M["m00"])
                center_y = int(M["m01"] / M["m00"])
            else:
                center_x, center_y = 0, 0

            font = cv2.FONT_HERSHEY_SIMPLEX
            text = str(instance_id)
            font_scale = 0.6
            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
            text_x = center_x - text_size[0] // 1  # horizontal center of the text
            text_y = center_y
            # text_y = center_y + text_size[1] // 2  # vertical center of the text

            # coordinates of the background rectangle behind the text
            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
            rect_end = (text_x + text_size[0] + 5, text_y)

            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

    # plt.figure(figsize=(6, 10))
    # plt.imshow(frame)
    # plt.title(text_query)
    # plt.tight_layout()
    # plt.axis('off')
    # plt.show()

    buffer = BytesIO()
    frame = Image.fromarray(frame)
    frame.save(buffer, format='jpeg')
    buffer.seek(0)
    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")

    return encoded_frame

def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):

    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)

    captioner = OpenAI()

    # build the referring expression directly, without any filtering step
    dense_caption_prompt = f"""
    You are a visual assistant analyzing a single frame of a video.
    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
    I also give you a text query describing the marked object.
    I want to use your expression to create an **action-centric referring expression** dataset.
    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions.
    ---
    ## Guidelines:
    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
    3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
    7. Base your description on these action definitions:
       - Avoid using the terms 'minimal' or 'slightly'.
       - General body movement, body position, or pattern which is prominent (e.g., "lifting head up", "facing towards", "showing its back").
       - Details such as motion and intention, facial expression with object manipulation.
       - Movements with objects or other entities when they are prominent and observable; the expression should be specific
         (e.g., "pushing another person" (O), "engaging with someone" (X), "interacting with another person" (X)).
    --
    ## Output Format:
    - For each labeled object, output **exactly one line**. Your answer should contain details and follow this format:
      object id. action-oriented description
      (e.g., 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
    ### Example
    If the frame has 1 labeled bear, your output should look like:
    1. the bear reaching his right arm while leaning forward to capture the prey
    ---
    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
    **Do not include markdown** in the output.
    Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
    For each labeled object, output referring expressions for each object id.
    """
    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"

    MAX_RETRIES = 2
    retry_count = 0

    while retry_count < MAX_RETRIES:
        response = captioner.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_with_text_query,
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )

        caption = response.choices[0].message.content.strip()
        caption_lower = caption.lower().lstrip()
        if caption_lower.startswith("1.") and not any(
            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
        ):
            break
        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
        retry_count += 1
        time.sleep(2)

        if retry_count == MAX_RETRIES:
            caption = None
            print("Max retries reached. Caption generation failed.")

    else:
        caption = None

    return caption

if __name__ == "__main__":
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
    args = parser.parse_args()

    train_dataset = build_dataset('a2d', image_set='train', args=args)
    text_annotations = train_dataset.text_annotations

    all_captions = {}

    #os.environ['OPENAI_API_KEY'] = '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'
    os.environ['OPENAI_API_KEY'] = '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'

    for idx in range(100):
        imgs, target = train_dataset[idx]
        frames_idx = target['frames_idx'].tolist()
        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
        print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)

        frame_id = frame_id - 1
        frame_order = frames_idx.index(frame_id)

        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
        mask = target['masks'].numpy().astype(np.uint8).squeeze()

        caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
        if vid_id not in all_captions:
            all_captions[vid_id] = {frame_id: caption}
        else:
            all_captions[vid_id][frame_id] = caption

    print("Finished!", flush=True)

    with open(args.save_caption_path, 'w') as file:
        json.dump(all_captions, file, indent=4)
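Once a run finishes, the saved captions can be inspected with a short readback; this sketch is illustrative only, and the path simply mirrors the --save_caption_path default used above.

# Illustrative readback of the captions written by this script.
import json

with open('mbench_a2d/numbered_captions.json') as f:
    captions = json.load(f)

# top-level keys are video ids; nested keys are the (stringified) frame ids
for vid_id, per_frame in list(captions.items())[:3]:
    for frame_id, caption in per_frame.items():
        print(vid_id, frame_id, caption)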
.history/mbench_a2d/gpt_a2d_numbered_20250206114221.py
ADDED
@@ -0,0 +1,205 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
4 |
+
|
5 |
+
from datasets import build_dataset
|
6 |
+
import argparse
|
7 |
+
import opts
|
8 |
+
import time
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
import cv2
|
13 |
+
from io import BytesIO
|
14 |
+
import base64
|
15 |
+
from PIL import Image
|
16 |
+
import json
|
17 |
+
|
18 |
+
from openai import OpenAI
|
19 |
+
|
20 |
+
def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
|
21 |
+
#마스크 색칠할지
|
22 |
+
if color_mask == True:
|
23 |
+
alpha = 0.1
|
24 |
+
|
25 |
+
colored_mask = np.zeros_like(frame)
|
26 |
+
colored_mask[mask == 1] = [255, 0, 0]
|
27 |
+
frame[mask == 1] = (
|
28 |
+
(1 - alpha) * frame[mask == 1] +
|
29 |
+
alpha * colored_mask[mask == 1]
|
30 |
+
)
|
31 |
+
|
32 |
+
#마스크 아웃라인 그리기
|
33 |
+
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
34 |
+
cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
|
35 |
+
|
36 |
+
#instance_id 적을지
|
37 |
+
if label_number == True:
|
38 |
+
if len(contours) > 0:
|
39 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
40 |
+
M = cv2.moments(largest_contour)
|
41 |
+
if M["m00"] != 0:
|
42 |
+
center_x = int(M["m10"] / M["m00"])
|
43 |
+
center_y = int(M["m01"] / M["m00"])
|
44 |
+
else:
|
45 |
+
center_x, center_y = 0, 0
|
46 |
+
|
47 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
48 |
+
text = str(instance_id)
|
49 |
+
font_scale = 0.6
|
50 |
+
text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
|
51 |
+
text_x = center_x - text_size[0] // 1 # 텍스트의 가로 중심
|
52 |
+
text_y = center_y
|
53 |
+
# text_y = center_y + text_size[1] // 2 # 텍스트의 세로 중심
|
54 |
+
|
55 |
+
# 텍스트 배경 사각형 좌표 계산
|
56 |
+
rect_start = (text_x - 5, text_y - text_size[1] - 5) # 배경 사각형 좌상단
|
57 |
+
# rect_end = (text_x + text_size[0] + 5, text_y + 5)
|
58 |
+
rect_end = (text_x + text_size[0] + 5, text_y)
|
59 |
+
|
60 |
+
cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
|
61 |
+
cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
|
62 |
+
|
63 |
+
# plt.figure(figsize=(6, 10))
|
64 |
+
# plt.imshow(frame)
|
65 |
+
# plt.title(text_query)
|
66 |
+
# plt.tight_layout()
|
67 |
+
# plt.axis('off')
|
68 |
+
# plt.show()
|
69 |
+
|
70 |
+
buffer = BytesIO()
|
71 |
+
frame = Image.fromarray(frame)
|
72 |
+
frame.save(buffer, format='jpeg')
|
73 |
+
buffer.seek(0)
|
74 |
+
encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
|
75 |
+
|
76 |
+
return encoded_frame
|
77 |
+
|
78 |
+
def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
|
79 |
+
|
80 |
+
base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
|
81 |
+
|
82 |
+
captioner = OpenAI()
|
83 |
+
|
84 |
+
#필터링하지 않고 바로 ref exp 만들기
|
85 |
+
dense_caption_prompt = f"""
|
86 |
+
You are a visual assistant analyzing a single frame of a video.
|
87 |
+
In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
|
88 |
+
I also give you a text query describing the marked object.
|
89 |
+
I want to use your expression to create an **action-centric referring expression** dataset.
|
90 |
+
Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
|
91 |
+
---
|
92 |
+
## Guidelines:
|
93 |
+
1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
|
94 |
+
2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
|
95 |
+
3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
|
96 |
+
4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
|
97 |
+
5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
|
98 |
+
+    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
+    7. Base your description on these action definitions:
+    - Avoid using term 'minimal' or 'slightly'.
+    - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+    - details such as motion and intention, facial with object manipulation
+    - movements with object or other entities when they are prominent and observable. expression should be specific.
+    (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+    --
+    ## Output Format:
+    - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
+    object id. action-oriented description
+    (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+    ### Example
+    If the frame has 1 labeled bear, your output should look like:
+    1. the bear reaching his right arm while leaning forward to capture the prey
+    ---
+    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+    **Do not include markdown** in the output.
+    Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+    For each labeled object, output referring expressions for each object id.
+    """
+    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
+
+    MAX_RETRIES = 2
+    retry_count = 0
+
+    while retry_count < MAX_RETRIES:
+        response = captioner.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt_with_text_query,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+
+
+        caption = response.choices[0].message.content.strip()
+        caption_lower = caption.lower().lstrip()
+        if caption_lower.startswith("1.") and not any(
+            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+        ):
+            break
+        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+        retry_count += 1
+        time.sleep(2)
+
+        if retry_count == MAX_RETRIES:
+            caption = None
+            print("Max retries reached. Caption generation failed.")
+
+    else:
+        caption = None
+
+    return caption
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
+    args = parser.parse_args()
+
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations
+
+    all_captions = {}
+
+    #os.environ['OPENAI_API_KEY'] = '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'
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    first_text_query = ""
+    for idx in range(300):
+        imgs, target = train_dataset[idx]
+        frames_idx = target['frames_idx'].tolist()
+        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
+
+        if text_query == first_text_query:
+            continue
+
+        print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
+
+        frame_id = frame_id - 1
+        frame_order = frames_idx.index(frame_id)
+
+        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
+        mask = target['masks'].numpy().astype(np.uint8).squeeze()
+
+        caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
+        if vid_id not in all_captions:
+            all_captions[vid_id] = {frame_id : caption}
+        else:
+            all_captions[vid_id][frame_id] = caption
+
+    print("Finished!", flush=True)
+
+    with open(args.save_caption_path, 'w') as file:
+        json.dump(all_captions, file, indent=4)
+
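For reference, the acceptance check inside the retry loop of getCaption above can be summarized as a small predicate. This is a minimal sketch only; the helper name is_valid_caption is hypothetical and does not appear in the repository.

def is_valid_caption(caption: str) -> bool:
    # Accept only answers that start with the "1." object line and contain
    # none of the refusal phrases checked in the retry loop above.
    refusal_phrases = ["i'm sorry", "please", "can't help"]
    caption_lower = caption.lower().lstrip()
    return caption_lower.startswith("1.") and not any(
        phrase in caption_lower for phrase in refusal_phrases
    )

print(is_valid_caption("1. the person is kicking a ball across the field"))  # True
print(is_valid_caption("I'm sorry, I can't help with that."))                # False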
.history/mbench_a2d/gpt_a2d_numbered_20250206114540.py
ADDED
@@ -0,0 +1,209 @@
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from io import BytesIO
+import base64
+from PIL import Image
+import json
+
+from openai import OpenAI
+
+def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
+    # whether to color-fill the mask
+    if color_mask == True:
+        alpha = 0.1
+
+        colored_mask = np.zeros_like(frame)
+        colored_mask[mask == 1] = [255, 0, 0]
+        frame[mask == 1] = (
+            (1 - alpha) * frame[mask == 1] +
+            alpha * colored_mask[mask == 1]
+        )
+
+    # draw the mask outline
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
+
+    # whether to write the instance_id on the frame
+    if label_number == True:
+        if len(contours) > 0:
+            largest_contour = max(contours, key=cv2.contourArea)
+            M = cv2.moments(largest_contour)
+            if M["m00"] != 0:
+                center_x = int(M["m10"] / M["m00"])
+                center_y = int(M["m01"] / M["m00"])
+            else:
+                center_x, center_y = 0, 0
+
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            text = str(instance_id)
+            font_scale = 0.6
+            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+            text_x = center_x - text_size[0] // 1  # horizontal center of the text
+            text_y = center_y
+            # text_y = center_y + text_size[1] // 2  # vertical center of the text
+
+            # compute the coordinates of the text background rectangle
+            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
+            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+            rect_end = (text_x + text_size[0] + 5, text_y)
+
+            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
+
+    # plt.figure(figsize=(6, 10))
+    # plt.imshow(frame)
+    # plt.title(text_query)
+    # plt.tight_layout()
+    # plt.axis('off')
+    # plt.show()
+
+    buffer = BytesIO()
+    frame = Image.fromarray(frame)
+    frame.save(buffer, format='jpeg')
+    buffer.seek(0)
+    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
+
+    return encoded_frame
+
+def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
+
+    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
+
+    captioner = OpenAI()
+
+    # create the ref exp directly, without filtering
+    dense_caption_prompt = f"""
+    You are a visual assistant analyzing a single frame of a video.
+    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
+    I also give you a text query describing the marked object.
+    I want to use your expression to create an **action-centric referring expression** dataset.
+    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
+    ---
+    ## Guidelines:
+    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+    3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
+    7. Base your description on these action definitions:
+    - Avoid using term 'minimal' or 'slightly'.
+    - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+    - details such as motion and intention, facial with object manipulation
+    - movements with object or other entities when they are prominent and observable. expression should be specific.
+    (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+    --
+    ## Output Format:
+    - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
+    object id. action-oriented description
+    (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+    ### Example
+    If the frame has 1 labeled bear, your output should look like:
+    1. the bear reaching his right arm while leaning forward to capture the prey
+    ---
+    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+    **Do not include markdown** in the output.
+    Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+    For each labeled object, output referring expressions for each object id.
+    """
+    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
+
+    MAX_RETRIES = 2
+    retry_count = 0
+
+    while retry_count < MAX_RETRIES:
+        response = captioner.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt_with_text_query,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+
+
+        caption = response.choices[0].message.content.strip()
+        caption_lower = caption.lower().lstrip()
+        if caption_lower.startswith("1.") and not any(
+            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+        ):
+            break
+        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+        retry_count += 1
+        time.sleep(2)
+
+        if retry_count == MAX_RETRIES:
+            caption = None
+            print("Max retries reached. Caption generation failed.")
+
+    else:
+        caption = None
+
+    return caption
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
+    args = parser.parse_args()
+
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations
+
+    all_captions = {}
+
+    #os.environ['OPENAI_API_KEY'] = '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'
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    first_text_query = ""
+    for idx in range(300):
+        imgs, target = train_dataset[idx]
+        frames_idx = target['frames_idx'].tolist()
+        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
+
+        if text_query == first_text_query:
+            continue
+
+        print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
+
+        frame_id = frame_id - 1
+        frame_order = frames_idx.index(frame_id)
+
+        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
+        mask = target['masks'].numpy().astype(np.uint8).squeeze()
+
+        caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
+        if vid_id not in all_captions:
+            all_captions[vid_id] = {frame_id : caption}
+        else:
+            all_captions[vid_id][frame_id] = caption
+
+        if idx % 50 == 0:
+            with open(args.save_caption_path, 'w') as file:
+                json.dump(all_captions, file, indent=4)
+
+    print("Finished!", flush=True)
+
+    with open(args.save_caption_path, 'w') as file:
+        json.dump(all_captions, file, indent=4)
+
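Compared with the previous snapshot, this version also checkpoints all_captions every 50 samples inside the loop (and once more after it finishes), so partial results survive an interrupted run. A minimal sketch of that pattern, assuming a hypothetical save_checkpoint helper and output file name:

import json

def save_checkpoint(captions: dict, path: str) -> None:
    # Mirror the json.dump(all_captions, file, indent=4) calls in the script above.
    with open(path, "w") as f:
        json.dump(captions, f, indent=4)

all_captions = {}
for idx in range(300):
    # ... generate a caption for sample idx and store it in all_captions ...
    if idx % 50 == 0:  # same periodic condition as in the script
        save_checkpoint(all_captions, "numbered_captions.json")
save_checkpoint(all_captions, "numbered_captions.json")  # final save after the loop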
.history/mbench_a2d/gpt_a2d_numbered_20250206145656.py
ADDED
@@ -0,0 +1,209 @@
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+import time
+
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+from io import BytesIO
+import base64
+from PIL import Image
+import json
+
+from openai import OpenAI
+
+def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
+    # whether to color-fill the mask
+    if color_mask == True:
+        alpha = 0.1
+
+        colored_mask = np.zeros_like(frame)
+        colored_mask[mask == 1] = [255, 0, 0]
+        frame[mask == 1] = (
+            (1 - alpha) * frame[mask == 1] +
+            alpha * colored_mask[mask == 1]
+        )
+
+    # draw the mask outline
+    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
+
+    # whether to write the instance_id on the frame
+    if label_number == True:
+        if len(contours) > 0:
+            largest_contour = max(contours, key=cv2.contourArea)
+            M = cv2.moments(largest_contour)
+            if M["m00"] != 0:
+                center_x = int(M["m10"] / M["m00"])
+                center_y = int(M["m01"] / M["m00"])
+            else:
+                center_x, center_y = 0, 0
+
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            text = str(instance_id)
+            font_scale = 0.6
+            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+            text_x = center_x - text_size[0] // 1  # horizontal center of the text
+            text_y = center_y
+            # text_y = center_y + text_size[1] // 2  # vertical center of the text
+
+            # compute the coordinates of the text background rectangle
+            rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background rectangle
+            # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+            rect_end = (text_x + text_size[0] + 5, text_y)
+
+            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
+
+    # plt.figure(figsize=(6, 10))
+    # plt.imshow(frame)
+    # plt.title(text_query)
+    # plt.tight_layout()
+    # plt.axis('off')
+    # plt.show()
+
+    buffer = BytesIO()
+    frame = Image.fromarray(frame)
+    frame.save(buffer, format='jpeg')
+    buffer.seek(0)
+    encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
+
+    return encoded_frame
+
+def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
+
+    base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
+
+    captioner = OpenAI()
+
+    # create the ref exp directly, without filtering
+    dense_caption_prompt = f"""
+    You are a visual assistant analyzing a single frame of a video.
+    In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
+    I also give you a text query describing the marked object.
+    I want to use your expression to create an **action-centric referring expression** dataset.
+    Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
+    ---
+    ## Guidelines:
+    1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
+    2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
+    3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
+    4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+    5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+    6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
+    7. Base your description on these action definitions:
+    - Avoid using term 'minimal' or 'slightly'.
+    - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
+    - details such as motion and intention, facial with object manipulation
+    - movements with object or other entities when they are prominent and observable. expression should be specific.
+    (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
+    --
+    ## Output Format:
+    - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
+    object id. action-oriented description
+    (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
+    ### Example
+    If the frame has 1 labeled bear, your output should look like:
+    1. the bear reaching his right arm while leaning forward to capture the prey
+    ---
+    **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
+    **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
+    **Do not include markdown** in the output.
+    Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+    For each labeled object, output referring expressions for each object id.
+    """
+    prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
+
+    MAX_RETRIES = 2
+    retry_count = 0
+
+    while retry_count < MAX_RETRIES:
+        response = captioner.chat.completions.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt_with_text_query,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+
+
+        caption = response.choices[0].message.content.strip()
+        caption_lower = caption.lower().lstrip()
+        if caption_lower.startswith("1.") and not any(
+            phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+        ):
+            break
+        print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+        retry_count += 1
+        time.sleep(2)
+
+        if retry_count == MAX_RETRIES:
+            caption = None
+            print("Max retries reached. Caption generation failed.")
+
+    else:
+        caption = None
+
+    return caption
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
+    args = parser.parse_args()
+
+    train_dataset = build_dataset('a2d', image_set = 'train', args = args)
+    text_annotations = train_dataset.text_annotations
+
+    all_captions = {}
+
+    #os.environ['OPENAI_API_KEY'] = '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'
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    first_text_query = ""
+    for idx in range(300):
+        imgs, target = train_dataset[idx]
+        frames_idx = target['frames_idx'].tolist()
+        text_query, vid_id, frame_id, instance_id = text_annotations[idx]
+
+        if text_query == first_text_query:
+            continue
+
+        print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
+
+        frame_id = frame_id - 1
+        frame_order = frames_idx.index(frame_id)
+
+        frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
+        mask = target['masks'].numpy().astype(np.uint8).squeeze()
+
+        caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
+        if vid_id not in all_captions:
+            all_captions[vid_id] = {idx : caption}
+        else:
+            all_captions[vid_id][idx] = caption
+
+        if idx % 50 == 0:
+            with open(args.save_caption_path, 'w') as file:
+                json.dump(all_captions, file, indent=4)
+
+    print("Finished!", flush=True)
+
+    with open(args.save_caption_path, 'w') as file:
+        json.dump(all_captions, file, indent=4)
+
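This snapshot differs from the previous one mainly in keying captions by the loop index idx rather than frame_id. For context, mark_object_and_encode in these files outlines the mask, writes the numeric ID near the centroid of the largest contour, and returns the frame as a base64-encoded JPEG. The sketch below reproduces that pipeline on a synthetic frame and mask, assuming numpy, opencv-python, and Pillow are available; it is illustrative only, not the repository's code.

import base64
from io import BytesIO

import cv2
import numpy as np
from PIL import Image

# Synthetic frame and binary mask standing in for a dataset sample.
frame = np.zeros((128, 128, 3), dtype=np.uint8)
mask = np.zeros((128, 128), dtype=np.uint8)
mask[40:90, 40:90] = 1

# Outline the mask and place the instance id near the largest contour's centroid.
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cv2.drawContours(frame, contours, -1, (255, 0, 0), 2)
largest = max(contours, key=cv2.contourArea)
M = cv2.moments(largest)
cx = int(M["m10"] / M["m00"]) if M["m00"] != 0 else 0
cy = int(M["m01"] / M["m00"]) if M["m00"] != 0 else 0
cv2.putText(frame, "1", (cx, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

# JPEG-encode the marked frame as base64, as expected by the image_url payload.
buffer = BytesIO()
Image.fromarray(frame).save(buffer, format="jpeg")
encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
print(encoded[:32], "...")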
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185215.sh
ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/gpt_ref-ytvos_numbered_cy.py
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207173418.sh
ADDED
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered_final
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered_final.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o_final.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b.lock
ADDED
File without changes
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cf1c08b23cfa58fa714ab5a4a233b9b42ee9bb9b.lock
ADDED
File without changes