Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See raw diff.
- .history/datasets/a2d_20241227174300.py +241 -0
- .history/datasets/a2d_20250203155857.py +243 -0
- .history/datasets/a2d_20250203160149.py +247 -0
- .history/datasets/a2d_20250203174309.py +247 -0
- .history/datasets/ytvos_ref_20250113163537.py +250 -0
- .history/datasets/ytvos_ref_20250116071955.py +240 -0
- .history/datasets/ytvos_ref_20250116072439.py +240 -0
- .history/datasets/ytvos_ref_20250116073540.py +239 -0
- .history/datasets/ytvos_ref_20250116073706.py +240 -0
- .history/datasets/ytvos_ref_20250116073858.py +239 -0
- .history/mbench/gpt_ref-ytvos-cy_20250121143328.py +0 -0
- .history/mbench/gpt_ref-ytvos-cy_20250121155631.py +428 -0
- .history/mbench/gpt_ref-ytvos_20250119071933.py +292 -0
- .history/mbench/gpt_ref-ytvos_20250119072546.py +292 -0
- .history/mbench/make_ref-ytvos_json_20250113181932.py +0 -0
- .history/mbench/make_ref-ytvos_json_20250113182455.py +100 -0
- .history/mbench/make_ref-ytvos_json_20250113182916.py +102 -0
- .history/mbench/make_ref-ytvos_json_20250113182917.py +102 -0
- .history/mbench/make_ref-ytvos_json_20250113183527.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250113195258.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250113195443.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250116140957.py +103 -0
- .history/mbench/make_ref-ytvos_json_20250117032934.py +105 -0
- .history/mbench/make_ref-ytvos_json_20250117074200.py +107 -0
- .history/mbench/make_ref-ytvos_json_20250117074329.py +107 -0
- .history/slurm_script/jupyter_20250106230703.sh +16 -0
- .history/slurm_script/jupyter_20250113135212.sh +16 -0
- .history/slurm_script/jupyter_20250117012746.sh +16 -0
- .history/slurm_script/jupyter_20250117012750.sh +16 -0
- .history/slurm_script/jupyter_20250117143527.sh +16 -0
- .history/slurm_script/mbench_gpt_a2d_20250205122407.sh +0 -0
- .history/slurm_script/mbench_gpt_a2d_20250205151525.sh +19 -0
- .history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155759.sh +0 -0
- .history/slurm_script/mbench_gpt_ref-ytvos_20250119070901.sh +0 -0
- .history/slurm_script/mbench_gpt_ref-ytvos_20250119070932.sh +18 -0
- .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185113.sh +0 -0
- .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220432.sh +20 -0
- .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220435.sh +20 -0
- .history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171522.sh +20 -0
- .history/slurm_script/mbench_ref-ytvos_json_20250113182619.sh +18 -0
- .history/slurm_script/mbench_ref-ytvos_json_20250113182952.sh +18 -0
- .history/slurm_script/mbench_ref-ytvos_json_20250116141255.sh +18 -0
- .history/slurm_script/mbench_ref-ytvos_json_20250117072826.sh +18 -0
- davis2017/results.py +31 -0
- hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/7528dbb1b6ce860d242aff71294a5fef12a41572.lock +0 -0
- hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cc6c13cb9acd48b061e2d2664a50963c338b4998.lock +0 -0
- hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e7dbc990f8ede75b1ad2fd17028fbd89a950286a.lock +0 -0
- hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors +0 -0
- hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors.index.json +0 -0
- hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/7528dbb1b6ce860d242aff71294a5fef12a41572 +7 -0
.history/datasets/a2d_20241227174300.py
ADDED
@@ -0,0 +1,241 @@
"""
A2D-Sentences data loader
modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
"""
from pathlib import Path

import torch
from torchvision.io import read_video
import torchvision.transforms.functional as F

from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

import h5py
from pycocotools.mask import encode, area


def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
    return image_id


class A2DSentencesDataset(Dataset):
    """
    A Torch dataset for A2D-Sentences.
    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
    https://arxiv.org/abs/1803.07485
    """
    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int, subset):
        super(A2DSentencesDataset, self).__init__()
        dataset_path = str(image_folder)
        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
        self.ann_file = ann_file
        self.text_annotations = self.get_text_annotations()

        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        self.subset = subset

        print(f'\n {subset} sample num: ', len(self.text_annotations))
        print('\n')

    def get_text_annotations(self):
        with open(str(self.ann_file), 'r') as f:
            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
            return text_annotations_by_frame

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.text_annotations)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]

            text_query = " ".join(text_query.lower().split())  # clean up the text query

            # read the source window frames:
            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
            vid_len = len(video_frames)
            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
            frame_id = frame_idx - 1

            if self.subset == 'train':
                # get a window of window_size frames with frame frame_id in the middle.
                num_frames = self.num_frames
                # random sparse sample
                sample_indx = [frame_id]
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            elif self.subset == 'val':
                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
                sample_indx = []
                for i in range(start_idx, end_idx):
                    i = min(max(i, 0), len(video_frames) - 1)  # pad out of range indices with edge frames
                    sample_indx.append(i)
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            # read frames
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
                imgs.append(img)

            # read the instance mask
            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
            f = h5py.File(frame_annot_path)
            instances = list(f['instance'])
            instance_idx = instances.index(instance_id)  # existence was already validated during init

            instance_masks = np.array(f['reMask'])
            if len(instances) == 1:
                instance_masks = instance_masks[np.newaxis, ...]
            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
            mask_areas = area(mask_rles).astype(np.float)
            f.close()

            # select the referred mask
            label = torch.tensor(0, dtype=torch.long)
            mask = instance_masks[instance_idx].numpy()
            if (mask > 0).any():
                y1, y2, x1, x2 = self.bounding_box(mask)
                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                valid.append(1)
            else:  # some frame didn't contain the instance
                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                valid.append(0)
            mask = torch.from_numpy(mask)
            labels.append(label)
            boxes.append(box)
            masks.append(mask)

            # transform
            h, w = instance_masks.shape[-2:]
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            # there is only one valid frame
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'valid_indices': torch.tensor([valid_indices]),
                'labels': labels,  # [1,]
                'boxes': boxes,  # [1, 4], xyxy
                'masks': masks,  # [1, H, W]
                'valid': torch.tensor(valid),  # [1,]
                'caption': text_query,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)]),
                'image_id': get_image_id(video_id, frame_idx, instance_id)
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            imgs, target = self._transforms(imgs, target)
            imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.a2d_path)
    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
    PATHS = {
        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
    }
    img_folder, ann_file = PATHS[image_set]
    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    return dataset
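The build() entry point above only reads a handful of attributes from args. Below is a minimal, hypothetical usage sketch, assuming this snapshot matches the repo's datasets/a2d.py, that the A2D-Sentences release (Release/clips320H plus text_annotations) is unpacked under args.a2d_path, and that the argparse namespace shown here is illustrative rather than the project's actual config.

from argparse import Namespace

from datasets.a2d import build  # assumed import path for the loader shown above

# Hypothetical arguments; only the fields read by build()/A2DSentencesDataset are set.
args = Namespace(
    a2d_path='data/a2d_sentences',  # assumed dataset root
    masks=True,
    num_frames=5,
    max_skip=3,
    max_size=640,
)

train_dataset = build('train', args)
imgs, target = train_dataset[0]  # imgs: [T, 3, H, W] after make_coco_transforms
print(imgs.shape, target['caption'], target['valid'])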
.history/datasets/a2d_20250203155857.py
ADDED
@@ -0,0 +1,243 @@
"""
A2D-Sentences data loader
modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
"""
from pathlib import Path

import torch
from torchvision.io import read_video
import torchvision.transforms.functional as F

from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

import h5py
from pycocotools.mask import encode, area


def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
    return image_id


class A2DSentencesDataset(Dataset):
    """
    A Torch dataset for A2D-Sentences.
    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
    https://arxiv.org/abs/1803.07485
    """
    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int, subset):
        super(A2DSentencesDataset, self).__init__()
        dataset_path = str(image_folder)
        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
        self.ann_file = ann_file
        self.text_annotations = self.get_text_annotations()

        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        self.subset = subset

        print(f'\n {subset} sample num: ', len(self.text_annotations))
        print('\n')

    def get_text_annotations(self):
        with open(str(self.ann_file), 'r') as f:
            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
            return text_annotations_by_frame

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.text_annotations)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]

            text_query = " ".join(text_query.lower().split())  # clean up the text query

            # read the source window frames:
            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
            vid_len = len(video_frames)
            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
            frame_id = frame_idx - 1

            if self.subset == 'train':
                # get a window of window_size frames with frame frame_id in the middle.
                num_frames = self.num_frames
                # random sparse sample
                sample_indx = [frame_id]
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            elif self.subset == 'val':
                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
                sample_indx = []
                for i in range(start_idx, end_idx):
                    i = min(max(i, 0), len(video_frames) - 1)  # pad out of range indices with edge frames
                    sample_indx.append(i)
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            # read frames
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
                imgs.append(img)

            # read the instance mask
            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
            f = h5py.File(frame_annot_path)
            instances = list(f['instance'])
            instance_idx = instances.index(instance_id)  # existence was already validated during init

            instance_masks = np.array(f['reMask'])
            if len(instances) == 1:
                instance_masks = instance_masks[np.newaxis, ...]
            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
            mask_areas = area(mask_rles).astype(np.float)
            f.close()

            # select the referred mask
            label = torch.tensor(0, dtype=torch.long)
            mask = instance_masks[instance_idx].numpy()
            if (mask > 0).any():
                y1, y2, x1, x2 = self.bounding_box(mask)
                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                valid.append(1)
            else:  # some frame didn't contain the instance
                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                valid.append(0)
            mask = torch.from_numpy(mask)
            labels.append(label)
            boxes.append(box)
            masks.append(mask)

            # transform
            h, w = instance_masks.shape[-2:]
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            # there is only one valid frame
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'valid_indices': torch.tensor([valid_indices]),
                'labels': labels,  # [1,]
                'boxes': boxes,  # [1, 4], xyxy
                'masks': masks,  # [1, H, W]
                'valid': torch.tensor(valid),  # [1,]
                'caption': text_query,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)]),
                'image_id': get_image_id(video_id, frame_idx, instance_id)
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            imgs, target = self._transforms(imgs, target)
            imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.a2d_path)
    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
    PATHS = {
        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
    #                               return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    return dataset
.history/datasets/a2d_20250203160149.py
ADDED
@@ -0,0 +1,247 @@
"""
A2D-Sentences data loader
modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
"""
from pathlib import Path

import torch
from torchvision.io import read_video
import torchvision.transforms.functional as F

from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

import h5py
from pycocotools.mask import encode, area


def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
    return image_id


class A2DSentencesDataset(Dataset):
    """
    A Torch dataset for A2D-Sentences.
    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
    https://arxiv.org/abs/1803.07485
    """
    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int, subset):
        super(A2DSentencesDataset, self).__init__()
        dataset_path = str(image_folder)
        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
        self.ann_file = ann_file
        self.text_annotations = self.get_text_annotations()

        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        self.subset = subset

        print(f'\n {subset} sample num: ', len(self.text_annotations))
        print('\n')

    def get_text_annotations(self):
        with open(str(self.ann_file), 'r') as f:
            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
            return text_annotations_by_frame

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.text_annotations)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]

            text_query = " ".join(text_query.lower().split())  # clean up the text query

            # read the source window frames:
            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
            vid_len = len(video_frames)
            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
            frame_id = frame_idx - 1

            if self.subset == 'train':
                # get a window of window_size frames with frame frame_id in the middle.
                num_frames = self.num_frames
                # random sparse sample
                sample_indx = [frame_id]
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            elif self.subset == 'val':
                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
                sample_indx = []
                for i in range(start_idx, end_idx):
                    i = min(max(i, 0), len(video_frames) - 1)  # pad out of range indices with edge frames
                    sample_indx.append(i)
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            # read frames
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
                imgs.append(img)

            # read the instance mask
            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
            f = h5py.File(frame_annot_path)
            instances = list(f['instance'])
            instance_idx = instances.index(instance_id)  # existence was already validated during init

            instance_masks = np.array(f['reMask'])
            if len(instances) == 1:
                instance_masks = instance_masks[np.newaxis, ...]
            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
            mask_areas = area(mask_rles).astype(np.float)
            f.close()

            # select the referred mask
            label = torch.tensor(0, dtype=torch.long)
            mask = instance_masks[instance_idx].numpy()
            if (mask > 0).any():
                y1, y2, x1, x2 = self.bounding_box(mask)
                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                valid.append(1)
            else:  # some frame didn't contain the instance
                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                valid.append(0)
            mask = torch.from_numpy(mask)
            labels.append(label)
            boxes.append(box)
            masks.append(mask)

            # transform
            h, w = instance_masks.shape[-2:]
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            # there is only one valid frame
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'valid_indices': torch.tensor([valid_indices]),
                'labels': labels,  # [1,]
                'boxes': boxes,  # [1, 4], xyxy
                'masks': masks,  # [1, H, W]
                'valid': torch.tensor(valid),  # [1,]
                'caption': text_query,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)]),
                'image_id': get_image_id(video_id, frame_idx, instance_id)
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.a2d_path)
    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
    PATHS = {
        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
    #                               return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    return dataset
.history/datasets/a2d_20250203174309.py
ADDED
@@ -0,0 +1,247 @@
"""
A2D-Sentences data loader
modified from https://github.com/mttr2021/MTTR/blob/main/datasets/a2d_sentences/a2d_sentences_dataset.py
"""
from pathlib import Path

import torch
from torchvision.io import read_video
import torchvision.transforms.functional as F

from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

import h5py
from pycocotools.mask import encode, area


def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
    image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
    return image_id


class A2DSentencesDataset(Dataset):
    """
    A Torch dataset for A2D-Sentences.
    For more information check out: https://kgavrilyuk.github.io/publication/actor_action/ or the original paper at:
    https://arxiv.org/abs/1803.07485
    """
    def __init__(self, image_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int, subset):
        super(A2DSentencesDataset, self).__init__()
        dataset_path = str(image_folder)
        self.mask_annotations_dir = os.path.join(dataset_path, 'text_annotations/a2d_annotation_with_instances')
        self.videos_dir = os.path.join(dataset_path, 'Release/clips320H')
        self.ann_file = ann_file
        self.text_annotations = self.get_text_annotations()

        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        self.subset = subset

        print(f'\n {subset} sample num: ', len(self.text_annotations))
        print('\n')

    def get_text_annotations(self):
        with open(str(self.ann_file), 'r') as f:
            text_annotations_by_frame = [tuple(a) for a in json.load(f)]
            return text_annotations_by_frame

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.text_annotations)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            text_query, video_id, frame_idx, instance_id = self.text_annotations[idx]

            text_query = " ".join(text_query.lower().split())  # clean up the text query

            # read the source window frames:
            video_frames, _, _ = read_video(os.path.join(self.videos_dir, f'{video_id}.mp4'), pts_unit='sec')  # (T, H, W, C)
            vid_len = len(video_frames)
            # note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
            frame_id = frame_idx - 1

            if self.subset == 'train':
                # get a window of window_size frames with frame frame_id in the middle.
                num_frames = self.num_frames
                # random sparse sample
                sample_indx = [frame_id]
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            elif self.subset == 'val':
                start_idx, end_idx = frame_id - self.num_frames // 2, frame_id + (self.num_frames + 1) // 2
                sample_indx = []
                for i in range(start_idx, end_idx):
                    i = min(max(i, 0), len(video_frames) - 1)  # pad out of range indices with edge frames
                    sample_indx.append(i)
                sample_indx.sort()
                # find the valid frame index in sampled frame list, there is only one valid frame
                valid_indices = sample_indx.index(frame_id)

            # read frames
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                img = F.to_pil_image(video_frames[frame_indx].permute(2, 0, 1))
                imgs.append(img)

            # read the instance mask
            frame_annot_path = os.path.join(self.mask_annotations_dir, video_id, f'{frame_idx:05d}.h5')
            f = h5py.File(frame_annot_path)
            instances = list(f['instance'])
            instance_idx = instances.index(instance_id)  # existence was already validated during init

            instance_masks = np.array(f['reMask'])
            if len(instances) == 1:
                instance_masks = instance_masks[np.newaxis, ...]
            instance_masks = torch.tensor(instance_masks).transpose(1, 2)
            mask_rles = [encode(mask) for mask in instance_masks.numpy()]
            mask_areas = area(mask_rles).astype(float)
            f.close()

            # select the referred mask
            label = torch.tensor(0, dtype=torch.long)
            mask = instance_masks[instance_idx].numpy()
            if (mask > 0).any():
                y1, y2, x1, x2 = self.bounding_box(mask)
                box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                valid.append(1)
            else:  # some frame didn't contain the instance
                box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                valid.append(0)
            mask = torch.from_numpy(mask)
            labels.append(label)
            boxes.append(box)
            masks.append(mask)

            # transform
            h, w = instance_masks.shape[-2:]
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            # there is only one valid frame
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'valid_indices': torch.tensor([valid_indices]),
                'labels': labels,  # [1,]
                'boxes': boxes,  # [1, 4], xyxy
                'masks': masks,  # [1, H, W]
                'valid': torch.tensor(valid),  # [1,]
                'caption': text_query,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)]),
                'image_id': get_image_id(video_id, frame_idx, instance_id)
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.a2d_path)
    assert root.exists(), f'provided A2D-Sentences path {root} does not exist'
    PATHS = {
        "train": (root, root / "a2d_sentences_single_frame_train_annotations.json"),
        "val": (root, root / "a2d_sentences_single_frame_test_annotations.json"),
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = A2DSentencesDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size),
    #                               return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    dataset = A2DSentencesDataset(img_folder, ann_file, transforms=None,
                                  return_masks=args.masks, num_frames=args.num_frames, max_skip=args.max_skip, subset=image_set)
    return dataset
.history/datasets/ytvos_ref_20250113163537.py
ADDED
@@ -0,0 +1,250 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.
    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        skip_vid_count = 0

        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            if vid_len < 11:
                # print(f"Too short video: {vid} with frame length {vid_len}")
                skip_vid_count += 1
                continue

            for exp_id, exp_dict in vid_data['expressions'].items():
                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
                start_idx, end_idx = 2, vid_len - 2
                bin_size = (end_idx - start_idx) // 4

                bins = []
                for i in range(4):
                    bin_start = start_idx + i * bin_size
                    bin_end = bin_start + bin_size if i < 3 else end_idx

                    bins.append((bin_start, bin_end))

                # Random sample one frame from each bin
                sample_indx = []
                for start_idx, end_idx in bins:
                    sample_indx.append(random.randint(start_idx, end_idx - 1))
                sample_indx.sort()  # Ensure indices are in order

                for frame_id in sample_indx:
                    meta = {
                        'video': vid,
                        'exp': exp_dict['exp'],
                        'obj_id': int(exp_dict['obj_id']),
                        'frames': vid_frames,
                        'frame_id': frame_id,
                        'sample_frames_id': sample_indx,
                        'bins': bins,
                        'category': vid_meta['objects'][exp_dict['obj_id']]['category']
                    }
                    self.metas.append(meta)

        print(f"skipped {skip_vid_count} short videos")

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']

            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            # num_frames = self.num_frames

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for frame_indx in sample_frames_id:
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_frames_id),  # [T,]
                'labels': labels,  # [T,]
                'boxes': boxes,  # [T, 4], xyxy
                'masks': masks,  # [T, H, W]
                'valid': torch.tensor(valid),  # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_ref_20250116071955.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
print(f"img size: {img.shape}")
|
136 |
+
mask = Image.open(mask_path).convert('P')
|
137 |
+
mask = np.array(mask)
|
138 |
+
|
139 |
+
# create the target
|
140 |
+
for obj_id in list(obj_id_cat.keys()):
|
141 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
142 |
+
if (obj_mask > 0).any():
|
143 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
144 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
145 |
+
valid.append(1)
|
146 |
+
else: # some frame didn't contain the instance
|
147 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
148 |
+
valid.append(0)
|
149 |
+
obj_mask = torch.from_numpy(obj_mask)
|
150 |
+
|
151 |
+
# append
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
if torch.any(target['valid'] == 1): # at leatst one instance
|
183 |
+
instance_check = True
|
184 |
+
else:
|
185 |
+
idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
.history/datasets/ytvos_ref_20250116072439.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
imgs.append(img)
|
136 |
+
mask = Image.open(mask_path).convert('P')
|
137 |
+
mask = np.array(mask)
|
138 |
+
|
139 |
+
# create the target
|
140 |
+
for obj_id in list(obj_id_cat.keys()):
|
141 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
142 |
+
if (obj_mask > 0).any():
|
143 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
144 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
145 |
+
valid.append(1)
|
146 |
+
else: # some frame didn't contain the instance
|
147 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
148 |
+
valid.append(0)
|
149 |
+
obj_mask = torch.from_numpy(obj_mask)
|
150 |
+
|
151 |
+
# append
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
if torch.any(target['valid'] == 1): # at leatst one instance
|
183 |
+
instance_check = True
|
184 |
+
else:
|
185 |
+
idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
.history/datasets/ytvos_ref_20250116073540.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
meta = self.metas[idx] # dict
|
122 |
+
|
123 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
124 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
125 |
+
|
126 |
+
# read frames and masks
|
127 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
128 |
+
for frame_indx in sample_indx:
|
129 |
+
frame_name = frames[frame_indx]
|
130 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
131 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
132 |
+
img = Image.open(img_path).convert('RGB')
|
133 |
+
imgs.append(img)
|
134 |
+
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
masks.append(obj_mask)
|
152 |
+
boxes.append(box)
|
153 |
+
|
154 |
+
|
155 |
+
# transform
|
156 |
+
w, h = img.size
|
157 |
+
boxes = torch.stack(boxes, dim=0)
|
158 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
159 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
160 |
+
masks = torch.stack(masks, dim=0)
|
161 |
+
target = {
|
162 |
+
'frames_idx': sample_indx, # [T,]
|
163 |
+
'boxes': boxes, # [T, 4], xyxy
|
164 |
+
'masks': masks, # [T, H, W]
|
165 |
+
'valid': torch.tensor(valid), # [T,]
|
166 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
167 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
168 |
+
'size': torch.as_tensor([int(h), int(w)])
|
169 |
+
}
|
170 |
+
|
171 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
172 |
+
if self._transforms:
|
173 |
+
imgs, target = self._transforms(imgs, target)
|
174 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
175 |
+
else:
|
176 |
+
imgs = np.array(imgs)
|
177 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
178 |
+
|
179 |
+
|
180 |
+
# # FIXME: handle "valid", since some box may be removed due to random crop
|
181 |
+
# if torch.any(target['valid'] == 1): # at leatst one instance
|
182 |
+
# instance_check = True
|
183 |
+
# else:
|
184 |
+
# idx = random.randint(0, self.__len__() - 1)
|
185 |
+
|
186 |
+
return imgs, target
|
187 |
+
|
188 |
+
|
189 |
+
def make_coco_transforms(image_set, max_size=640):
|
190 |
+
normalize = T.Compose([
|
191 |
+
T.ToTensor(),
|
192 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
193 |
+
])
|
194 |
+
|
195 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
196 |
+
|
197 |
+
if image_set == 'train':
|
198 |
+
return T.Compose([
|
199 |
+
T.RandomHorizontalFlip(),
|
200 |
+
T.PhotometricDistort(),
|
201 |
+
T.RandomSelect(
|
202 |
+
T.Compose([
|
203 |
+
T.RandomResize(scales, max_size=max_size),
|
204 |
+
T.Check(),
|
205 |
+
]),
|
206 |
+
T.Compose([
|
207 |
+
T.RandomResize([400, 500, 600]),
|
208 |
+
T.RandomSizeCrop(384, 600),
|
209 |
+
T.RandomResize(scales, max_size=max_size),
|
210 |
+
T.Check(),
|
211 |
+
])
|
212 |
+
),
|
213 |
+
normalize,
|
214 |
+
])
|
215 |
+
|
216 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
217 |
+
if image_set == 'val':
|
218 |
+
return T.Compose([
|
219 |
+
T.RandomResize([360], max_size=640),
|
220 |
+
normalize,
|
221 |
+
])
|
222 |
+
|
223 |
+
raise ValueError(f'unknown {image_set}')
|
224 |
+
|
225 |
+
|
226 |
+
def build(image_set, args):
|
227 |
+
root = Path(args.ytvos_path)
|
228 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
229 |
+
PATHS = {
|
230 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
231 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
232 |
+
}
|
233 |
+
img_folder, ann_file = PATHS[image_set]
|
234 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
235 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
236 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
237 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
238 |
+
return dataset
|
239 |
+
|
.history/datasets/ytvos_ref_20250116073706.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
meta = self.metas[idx] # dict
|
122 |
+
|
123 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
124 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
125 |
+
|
126 |
+
# read frames and masks
|
127 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
128 |
+
for frame_indx in sample_indx:
|
129 |
+
frame_name = frames[frame_indx]
|
130 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
131 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
132 |
+
img = Image.open(img_path).convert('RGB')
|
133 |
+
imgs.append(img)
|
134 |
+
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
print(np.unique(mask))
|
138 |
+
|
139 |
+
# create the target
|
140 |
+
for obj_id in list(obj_id_cat.keys()):
|
141 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
142 |
+
if (obj_mask > 0).any():
|
143 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
144 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
145 |
+
valid.append(1)
|
146 |
+
else: # some frame didn't contain the instance
|
147 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
148 |
+
valid.append(0)
|
149 |
+
obj_mask = torch.from_numpy(obj_mask)
|
150 |
+
|
151 |
+
# append
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# # FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
# if torch.any(target['valid'] == 1): # at leatst one instance
|
183 |
+
# instance_check = True
|
184 |
+
# else:
|
185 |
+
# idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
.history/datasets/ytvos_ref_20250116073858.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
meta = self.metas[idx] # dict
|
122 |
+
|
123 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
124 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
125 |
+
|
126 |
+
# read frames and masks
|
127 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
128 |
+
for frame_indx in sample_indx:
|
129 |
+
frame_name = frames[frame_indx]
|
130 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
131 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
132 |
+
img = Image.open(img_path).convert('RGB')
|
133 |
+
imgs.append(img)
|
134 |
+
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
masks.append(obj_mask)
|
152 |
+
boxes.append(box)
|
153 |
+
|
154 |
+
|
155 |
+
# transform
|
156 |
+
w, h = img.size
|
157 |
+
boxes = torch.stack(boxes, dim=0)
|
158 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
159 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
160 |
+
masks = torch.stack(masks, dim=0)
|
161 |
+
target = {
|
162 |
+
'frames_idx': sample_indx, # [T,]
|
163 |
+
'boxes': boxes, # [T, 4], xyxy
|
164 |
+
'masks': masks, # [T, H, W]
|
165 |
+
'valid': torch.tensor(valid), # [T,]
|
166 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
167 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
168 |
+
'size': torch.as_tensor([int(h), int(w)])
|
169 |
+
}
|
170 |
+
|
171 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
172 |
+
if self._transforms:
|
173 |
+
imgs, target = self._transforms(imgs, target)
|
174 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
175 |
+
else:
|
176 |
+
imgs = np.array(imgs)
|
177 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
178 |
+
|
179 |
+
|
180 |
+
# # FIXME: handle "valid", since some box may be removed due to random crop
|
181 |
+
# if torch.any(target['valid'] == 1): # at leatst one instance
|
182 |
+
# instance_check = True
|
183 |
+
# else:
|
184 |
+
# idx = random.randint(0, self.__len__() - 1)
|
185 |
+
|
186 |
+
return imgs, target
|
187 |
+
|
188 |
+
|
189 |
+
def make_coco_transforms(image_set, max_size=640):
|
190 |
+
normalize = T.Compose([
|
191 |
+
T.ToTensor(),
|
192 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
193 |
+
])
|
194 |
+
|
195 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
196 |
+
|
197 |
+
if image_set == 'train':
|
198 |
+
return T.Compose([
|
199 |
+
T.RandomHorizontalFlip(),
|
200 |
+
T.PhotometricDistort(),
|
201 |
+
T.RandomSelect(
|
202 |
+
T.Compose([
|
203 |
+
T.RandomResize(scales, max_size=max_size),
|
204 |
+
T.Check(),
|
205 |
+
]),
|
206 |
+
T.Compose([
|
207 |
+
T.RandomResize([400, 500, 600]),
|
208 |
+
T.RandomSizeCrop(384, 600),
|
209 |
+
T.RandomResize(scales, max_size=max_size),
|
210 |
+
T.Check(),
|
211 |
+
])
|
212 |
+
),
|
213 |
+
normalize,
|
214 |
+
])
|
215 |
+
|
216 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
217 |
+
if image_set == 'val':
|
218 |
+
return T.Compose([
|
219 |
+
T.RandomResize([360], max_size=640),
|
220 |
+
normalize,
|
221 |
+
])
|
222 |
+
|
223 |
+
raise ValueError(f'unknown {image_set}')
|
224 |
+
|
225 |
+
|
226 |
+
def build(image_set, args):
|
227 |
+
root = Path(args.ytvos_path)
|
228 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
229 |
+
PATHS = {
|
230 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
231 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
232 |
+
}
|
233 |
+
img_folder, ann_file = PATHS[image_set]
|
234 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
235 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
236 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
237 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
238 |
+
return dataset
|
239 |
+
|
.history/mbench/gpt_ref-ytvos-cy_20250121143328.py
ADDED
File without changes
|
.history/mbench/gpt_ref-ytvos-cy_20250121155631.py
ADDED
@@ -0,0 +1,428 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+# Captioner
+ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+]
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    all_captions = dict()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    # cat_names : person, snowboard
+    # 1. ask GPT directly whether the category can be the subject of an action
+    # 2. keep only the category names we want to handle from the category list provided by ref-youtube-vos
+
+    for cat_name in list(cat_names):
+        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+        image_captions = {}
+
+        captioner = OpenAI()
+
+        # Step 0: can it be the subject of an action?
+        is_movable = False
+        if cat_name in ytvos_category_valid_list:
+            is_movable = True
+
+        # response_check = captioner.chat.completions.create(
+        #     model="gpt-4o",
+        #     messages=[
+        #         {
+        #             "role": "user",
+        #             "content": f"""
+        #             Can a {cat_name} be a subject of distinct actions or movements?
+        #             For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
+        #             However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
+        #             Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
+        #             Answer only YES or NONE.
+        #             """
+        #         }
+        #     ],
+        # )
+        # response_check_content = response_check.choices[0].message.content.strip().lower()
+        # print(f"Movable Check for {cat_name}: {response_check_content}")
+
+        # if response_check_content == "yes": is_movable = True
+
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.")
+            continue
+
+        for i in range(len(image_paths)):
+            image_path = image_paths[i]
+            frame_name = frame_names[i]
+            base64_image = encode_image(image_path)
+
+            # Step 1: filtering
+            # print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+            response1 = captioner.chat.completions.create(
+                model="chatgpt-4o-latest",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
+Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
+Each action should be unique and clearly associated with a specific object.
+
+Respond with YES if:
+- The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
+- The {cat_name}s involve clear, distinguishable actions performed independently.
+
+Respond with NONE if:
+- The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
+- Actions are ambiguous, minor, or not clearly visible.
+
+If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
+If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
+
+Answer only YES or NONE."""
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+            response_content = response1.choices[0].message.content
+            should_caption = True if "yes" in response_content.lower() else False
+            # print(f"are {cat_name}s distinguished by action: {response_content}")
+
+            # Step 2: build the dense caption
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": f"""
+Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
+1. Focus only on clear, unique, and prominent actions that distinguish each object.
+2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+8. Include interactions with objects or other entities when they are prominent and observable.
+9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+Output only the caption.""",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+
+                caption = response2.choices[0].message.content
+                # print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+
+    # final : also prepare valid object ids
+    valid_obj_ids = []
+    valid_cat_names = list(all_captions.keys())
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat = video_data['annotations'][0][obj_id]['category_name']
+        if cat in valid_cat_names: valid_obj_ids.append(obj_id)
+
+    return all_captions, valid_obj_ids
+
+# Referring expression generator and QA filter
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+
+    # draw the bounding box of the target object on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+
+    # cropped object for visibility check
+    cropped_I = I[y_min:y_max, x_min:x_max]
+    pil_cropped_I = Image.fromarray(cropped_I)
+    buff_crop = BytesIO()
+    pil_cropped_I.save(buff_crop, format='JPEG')
+    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
+
+    # entire image for referring expression generation
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # check whether the object is identifiable in the crop
+    generator = OpenAI()
+    response_check = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
+Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
+
+Guidelines:
+- If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
+- If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
+- If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
+
+Output only either YES or NONE.
+"""
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    }
+                ]
+            },
+        ]
+    )
+
+    response_check_content = response_check.choices[0].message.content.strip().lower()
+    # print(f"is object {obj_id} visible: {response_check_content}")
+
+    if "yes" not in response_check_content:
+        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
+        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": False}
+
+    # build the referring expression
+    # generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
+Guidelines for creating the referring expression:
+1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
+2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
+3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
+4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
+5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
+6. Use '{cat_name}' as the noun for the referring expressions.
+Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
+
+{caption}
+"""
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                    # {
+                    #     "type": "image_url",
+                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    # }
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content.strip()
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing any non-target object?
+    response2 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
+
+    isValid = True if describesHighlighted and notDescribesNotHighlighted else False
+
+    # print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
+    # print(f"ref exp: {ref_exp}")
+    # print("")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    vid_ids = list(data.keys())
+    all_ref_exps = {}
+
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    # for every vid_id in the whole dataset
+    for i in range(1):
+        vid_id = vid_ids[i]
+
+        # ==== build captions ====
+        # print("=====================captioner========================")
+        captions, valid_obj_ids = getCaption(vid_id, data)
+        cats_in_vid = list(captions.keys())
+        # print()
+
+        # ==== generate referring expressions and run QA filtering ====
+        # print("=====================referring expression generator & QA filter========================")
+        ref_expressions = {}
+
+        # for each category
+        for cat_name in cats_in_vid:
+            if cat_name not in ref_expressions:
+                ref_expressions[cat_name] = {}
+            # for each video frame
+            for frame_name in data[vid_id]['frame_names']:
+                # print(f'--------category: {cat_name}, frame_name: {frame_name}')
+
+                if frame_name not in ref_expressions[cat_name]:
+                    ref_expressions[cat_name][frame_name] = {}  # Create frame-level dictionary
+                caption = captions[cat_name][frame_name]
+                if not caption: continue
+                else:
+                    # for each obj id
+                    for obj_id in valid_obj_ids:
+                        ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
+                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp  # Store ref_exp
+
+        all_ref_exps[vid_id] = ref_expressions
+
+    with open('mbench/result_revised.json', 'w') as file:
+        json.dump(all_ref_exps, file)
+
.history/mbench/gpt_ref-ytvos_20250119071933.py
ADDED
@@ -0,0 +1,292 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+from pathlib import Path
+import os
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+os.environ['OPENAI_API_KEY'] = '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'
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than 2 categories")
+        return -1
+
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+
+        # Step 1: filtering
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+
+        # Step 2: build the dense caption
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+Describe the image in detail focusing on the {cat_name}s' actions.
+1. Each action should be prominent, clear and unique, describing the corresponding object only.
+2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+4. Do not include actions that needs to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+
+        image_captions[frame_name] = caption
+    return image_captions
+
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # draw the bounding box of the target object on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # build the referring expression
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+4. The referring expression should only describe the highlighted {cat_name} and not any other.
+5. Use '{cat_name}' as the noun for the referring expressions.
+Output only the referring expression.
+{caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing any non-target object?
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+
+    captions_per_frame = getCaption(video_id, json_data)
+
+    if captions_per_frame == -1:
+        print("There are more than 2 categories")
+        return
+
+    video_ref_exps = {}
+
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+
+    return video_ref_exps
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    videos = set()
+    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+        manual_select = list(file)
+    for frame in manual_select:
+        result = json.loads(frame)
+        videos.add(result['video'])
+    videos = list(videos)
+
+
+    all_video_refs = {}
+    for i in range(1):
+        video_id = videos[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref
+
+    json_obj = json.dumps(all_video_refs, indent=4)
+    with open('mbench/result.json', 'w') as file:
+        file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119072546.py
ADDED
@@ -0,0 +1,292 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+from pathlib import Path
+import os
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+os.environ['OPENAI_API_KEY'] = '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'
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than 2 categories")
+        return -1
+
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+
+        # Step 1: filtering
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+
+        # Step 2: build the dense caption
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+Describe the image in detail focusing on the {cat_name}s' actions.
+1. Each action should be prominent, clear and unique, describing the corresponding object only.
+2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+4. Do not include actions that needs to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+
+        image_captions[frame_name] = caption
+    return image_captions
+
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # draw the bounding box of the target object on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # build the referring expression
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+4. The referring expression should only describe the highlighted {cat_name} and not any other.
+5. Use '{cat_name}' as the noun for the referring expressions.
+Output only the referring expression.
+{caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing any non-target object?
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+{ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+
+    captions_per_frame = getCaption(video_id, json_data)
+
+    if captions_per_frame == -1:
+        print("There are more than 2 categories")
+        return None
+
+    video_ref_exps = {}
+
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+
+    return video_ref_exps
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    videos = set()
+    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+        manual_select = list(file)
+    for frame in manual_select:
+        result = json.loads(frame)
+        videos.add(result['video'])
+    videos = list(videos)
+
+
+    all_video_refs = {}
+    for i in range(1):
+        video_id = videos[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref
+
+    json_obj = json.dumps(all_video_refs, indent=4)
+    with open('mbench/result.json', 'w') as file:
+        file.write(json_obj)
.history/mbench/make_ref-ytvos_json_20250113181932.py
ADDED
File without changes
.history/mbench/make_ref-ytvos_json_20250113182455.py
ADDED
@@ -0,0 +1,100 @@
+from datasets import build_dataset
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build the json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialization
+    data_idx = 0
+
+    while data_idx < 10:
+
+        # for one video
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+
+        while metas[data_idx]['video'] == video_id:
+
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+
+            frames = metas[data_idx]['frames']
+
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+
+            obj_data = {obj_id: {
+                "category_name": cat_name,
+                "bbox": bbox
+            }}
+
+
+            annotation_data.append(obj_data)
+
+            frame_names.append(frame_name)
+
+            data_idx += 1
+
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+
+        entire_json[video_id] = video_data
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load the data ===================
+    # whole dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # metadata for the whole dataset
+    metas = train_dataset.metas
+
+    #================== build the json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113182916.py
ADDED
@@ -0,0 +1,102 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build the json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialization
+    data_idx = 0
+
+    while data_idx < 10:
+
+        # for one video
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+
+        while metas[data_idx]['video'] == video_id:
+
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+
+            frames = metas[data_idx]['frames']
+
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+
+            obj_data = {obj_id: {
+                "category_name": cat_name,
+                "bbox": bbox
+            }}
+
+
+            annotation_data.append(obj_data)
+
+            frame_names.append(frame_name)
+
+            data_idx += 1
+
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+
+        entire_json[video_id] = video_data
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load the data ===================
+    # whole dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # metadata for the whole dataset
+    metas = train_dataset.metas
+
+    #================== build the json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113182917.py
ADDED
@@ -0,0 +1,102 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build the json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialization
+    data_idx = 0
+
+    while data_idx < 10:
+
+        # for one video
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+
+        while metas[data_idx]['video'] == video_id:
+
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+
+            frames = metas[data_idx]['frames']
+
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :]
+
+            obj_data = {obj_id: {
+                "category_name": cat_name,
+                "bbox": bbox
+            }}
+
+
+            annotation_data.append(obj_data)
+
+            frame_names.append(frame_name)
+
+            data_idx += 1
+
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+
+        entire_json[video_id] = video_data
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load the data ===================
+    # whole dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # metadata for the whole dataset
+    metas = train_dataset.metas
+
+    #================== build the json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113183527.py
ADDED
@@ -0,0 +1,103 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build the json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialization
+    data_idx = 0
+
+    while data_idx < len(train_dataset):
+
+        # for one video
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+
+        while metas[data_idx]['video'] == video_id:
+
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+
+            frames = metas[data_idx]['frames']
+
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
+
+            obj_data = {obj_id: {
+                "category_name": cat_name,
+                "bbox": bbox
+            }}
+
+
+            annotation_data.append(obj_data)
+
+            frame_names.append(frame_name)
+
+            data_idx += 1
+
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+
+        entire_json[video_id] = video_data
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load the data ===================
+    # whole dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # metadata for the whole dataset
+    metas = train_dataset.metas
+
+    #================== build the json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113195258.py
ADDED
@@ -0,0 +1,103 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build the json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialization
+    data_idx = 0
+    print(len(train_dataset), len(metas), flush = True)
+    while data_idx < len(train_dataset):
+
+        # for one video
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+
+        while metas[data_idx]['video'] == video_id:
+
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+
+            frames = metas[data_idx]['frames']
+
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
+
+            obj_data = {obj_id: {
+                "category_name": cat_name,
+                "bbox": bbox
+            }}
+
+
+            annotation_data.append(obj_data)
+
+            frame_names.append(frame_name)
+
+            data_idx += 1
+
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+
+        entire_json[video_id] = video_data
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load the data ===================
+    # whole dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # metadata for the whole dataset
+    metas = train_dataset.metas
+
+    #================== build the json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250113195443.py
ADDED
@@ -0,0 +1,103 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialize
+    data_idx = 0
+
+    while data_idx < len(train_dataset):
+
+        # for one video
+        video_data = {}
+        video_id = metas[data_idx]['video']
+        video_data['bins'] = metas[data_idx]['bins']
+        annotation_data = []
+        frame_names = []
+
+        while data_idx < len(train_dataset) and metas[data_idx]['video'] == video_id:
+
+            obj_id = metas[data_idx]['obj_id']
+            sample_id = metas[data_idx]['sample_id']
+            sample_frames_id = metas[data_idx]['sample_frames_id']
+            sample_frame_idx = sample_frames_id.index(sample_id)
+
+            frames = metas[data_idx]['frames']
+
+            frame_name = frames[sample_id]
+            cat_name = metas[data_idx]['category']
+
+            bbox = train_dataset[data_idx][1]['boxes'][sample_frame_idx, :].tolist()
+
+            obj_data = {obj_id: {
+                "category_name" : cat_name,
+                "bbox": bbox
+            }}
+
+
+            annotation_data.append(obj_data)
+
+            frame_names.append(frame_name)
+
+            data_idx += 1
+
+        video_data['annotations'] = annotation_data
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = osp.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+
+        entire_json[video_id] = video_data
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load data ===================
+    # full dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # full dataset metadata
+    metas = train_dataset.metas
+
+    #================== build json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250116140957.py
ADDED
@@ -0,0 +1,103 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialize
+    vid_idx = 0
+
+    while vid_idx < 5:
+
+        # for one video
+        video_data = {}
+        video_train_frames, video_train_info = train_dataset[vid_idx]
+        video_meta = metas[vid_idx]
+
+        video_id = video_meta['video']
+        video_data['bins'] = video_meta['bins']
+        bin_nums = len(video_meta['bins'])
+        obj_nums = len(list(video_meta['obj_id_cat'].keys()))
+
+        annotation_data = []
+        frame_names = []
+
+        for i in range(bin_nums):
+            bin_data = {}
+            for j in range(obj_nums):
+                obj_id = str(j+1)
+                obj_data = {
+                    "category_name":video_meta['obj_id_cat'][obj_id],
+                    "bbox":video_train_info['boxes'][i*obj_nums+j, :]
+                }
+                bin_data[obj_id] = obj_data
+            annotation_data.append(bin_data)
+
+        video_data['annotations'] = annotation_data
+
+
+        sample_indx = metas[vid_idx]['sample_indx']
+        frames = metas[vid_idx]['frames']
+        for i in sample_indx:
+            frame_name = frames[i]
+            frame_names.append(frame_name)
+
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+
+        vid_idx += 1
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load data ===================
+    # full dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # full dataset metadata
+    metas = train_dataset.metas
+
+    #================== build json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117032934.py
ADDED
@@ -0,0 +1,105 @@
+import sys
+import os
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialize
+    vid_idx = 0
+
+    while vid_idx < len(train_dataset):
+
+        # for one video
+        video_data = {}
+        video_train_frames, video_train_info = train_dataset[vid_idx]
+        video_meta = metas[vid_idx]
+
+        video_id = video_meta['video']
+        video_data['bins'] = video_meta['bins']
+        bin_nums = len(video_meta['bins'])
+        obj_nums = len(list(video_meta['obj_id_cat'].keys()))
+
+        annotation_data = []
+        frame_names = []
+
+        for i in range(bin_nums):
+            bin_data = {}
+            for j in range(obj_nums):
+                obj_id = str(j+1)
+                print(video_meta['obj_id_cat'].keys())
+                obj_data = {
+                    "category_name":video_meta['obj_id_cat'][obj_id],
+                    "bbox":video_train_info['boxes'][i*obj_nums+j, :]
+                }
+                bin_data[obj_id] = obj_data
+            annotation_data.append(bin_data)
+
+        video_data['annotations'] = annotation_data
+
+
+        sample_indx = metas[vid_idx]['sample_indx']
+        frames = metas[vid_idx]['frames']
+        for i in sample_indx:
+            frame_name = frames[i]
+            frame_names.append(frame_name)
+
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+
+        vid_idx += 1
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load data ===================
+    # full dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # full dataset metadata
+    metas = train_dataset.metas
+
+    #================== build json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame2.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117074200.py
ADDED
@@ -0,0 +1,107 @@
+import sys
+import os
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialize
+    vid_idx = 0
+
+    while vid_idx < 10:
+
+        # for one video
+        video_data = {}
+        video_train_frames, video_train_info = train_dataset[vid_idx]
+        video_meta = metas[vid_idx]
+
+        video_id = video_meta['video']
+        video_data['bins'] = video_meta['bins']
+        bin_nums = len(video_meta['bins'])
+        obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
+
+        annotation_data = []
+        frame_names = []
+
+        for i in range(bin_nums):
+            bin_data = {}
+            for j in range(obj_nums):
+                obj_id = str(j+1)
+                try:
+                    obj_data = {
+                        "category_name":video_meta['obj_id_cat'][obj_id],
+                        "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist()
+                    }
+                except:
+                    obj_data = {}
+                bin_data[obj_id] = obj_data
+            annotation_data.append(bin_data)
+
+        video_data['annotations'] = annotation_data
+
+
+        sample_indx = metas[vid_idx]['sample_indx']
+        frames = metas[vid_idx]['frames']
+        for i in sample_indx:
+            frame_name = frames[i]
+            frame_names.append(frame_name)
+
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+
+        vid_idx += 1
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load data ===================
+    # full dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # full dataset metadata
+    metas = train_dataset.metas
+
+    #================== build json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame2.json', mode='w') as file:
+        file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117074329.py
ADDED
@@ -0,0 +1,107 @@
+import sys
+import os
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from datasets import build_dataset
+import argparse
+import opts
+
+
+from pathlib import Path
+import io
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+#================== build json ===================
+def createJson(train_dataset, metas):
+    entire_json = {}
+
+    # initialize
+    vid_idx = 0
+
+    while vid_idx < len(train_dataset):
+
+        # for one video
+        video_data = {}
+        video_train_frames, video_train_info = train_dataset[vid_idx]
+        video_meta = metas[vid_idx]
+
+        video_id = video_meta['video']
+        video_data['bins'] = video_meta['bins']
+        bin_nums = len(video_meta['bins'])
+        obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
+
+        annotation_data = []
+        frame_names = []
+
+        for i in range(bin_nums):
+            bin_data = {}
+            for j in range(obj_nums):
+                obj_id = str(j+1)
+                try:
+                    obj_data = {
+                        "category_name":video_meta['obj_id_cat'][obj_id],
+                        "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist()
+                    }
+                except:
+                    obj_data = {}
+                bin_data[obj_id] = obj_data
+            annotation_data.append(bin_data)
+
+        video_data['annotations'] = annotation_data
+
+
+        sample_indx = metas[vid_idx]['sample_indx']
+        frames = metas[vid_idx]['frames']
+        for i in sample_indx:
+            frame_name = frames[i]
+            frame_names.append(frame_name)
+
+        video_data['frame_names'] = frame_names
+        video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
+        entire_json[video_id] = video_data
+
+        vid_idx += 1
+
+    return entire_json
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+    args = parser.parse_args()
+
+    #================== load data ===================
+    # full dataset
+    train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
+
+    # full dataset metadata
+    metas = train_dataset.metas
+
+    #================== build json ===================
+    entire_json_dict = createJson(train_dataset, metas)
+    print(type(entire_json_dict))
+    entire_json = json.dumps(entire_json_dict, indent=4)
+
+    with open('mbench/sampled_frame2.json', mode='w') as file:
+        file.write(entire_json)
.history/slurm_script/jupyter_20250106230703.sh
ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=0-06:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250113135212.sh
ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=0-06:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250117012746.sh
ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=0-06:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250117012750.sh
ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+srun jupyter notebook --no-browser --port=7890
.history/slurm_script/jupyter_20250117143527.sh
ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --job-name=jupyter
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/jupyter.out
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+srun jupyter notebook --no-browser --port=7890
.history/slurm_script/mbench_gpt_a2d_20250205122407.sh
ADDED
File without changes
.history/slurm_script/mbench_gpt_a2d_20250205151525.sh
ADDED
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_a2d
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_a2d.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench_a2d/gpt_a2d_numbered.py \
+    --save_caption_path mbench_a2d/numbered_captions.json
.history/slurm_script/mbench_gpt_ref-ytvos-revised_20250121155759.sh
ADDED
File without changes
.history/slurm_script/mbench_gpt_ref-ytvos_20250119070901.sh
ADDED
File without changes
.history/slurm_script/mbench_gpt_ref-ytvos_20250119070932.sh
ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_ref-ytvos
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130185113.sh
ADDED
File without changes
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220432.sh
ADDED
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o-mini.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_idsgpt-4o-mini.json
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250130220435.sh
ADDED
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/gpt_ref-ytvos_numbered_cy.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o-mini.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o-mini.json
.history/slurm_script/mbench_gtp_ref-ytvos_numbered_20250207171522.sh
ADDED
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_gpt_ref-ytvos_numbered
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_gpt_ref-ytvos_numbered.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py \
+    --save_caption_path mbench/numbered_captions_gpt-4o_final.json \
+    --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_final.json
.history/slurm_script/mbench_ref-ytvos_json_20250113182619.sh
ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_ref-ytvos_json
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=0-06:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_ref-ytvos_json_20250113182952.sh
ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_ref-ytvos_json
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_ref-ytvos_json_20250116141255.sh
ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_ref-ytvos_json
+#SBATCH --partition=a5000
+#SBATCH --nodelist=node04
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/make_ref-ytvos_json.py
.history/slurm_script/mbench_ref-ytvos_json_20250117072826.sh
ADDED
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#SBATCH --job-name=mbench_ref-ytvos_json
+#SBATCH --partition=a4000
+#SBATCH --nodelist=node05
+#SBATCH --gres=gpu:1
+#SBATCH --time=14-00:00:00
+#SBATCH --mem=5G
+#SBATCH --cpus-per-task=4
+#SBATCH --output=/home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer/slurm_log/mbench_ref-ytvos_json.out
+cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer
+
+ml purge
+ml load cuda/12.1
+eval "$(conda shell.bash hook)"
+conda activate referformer
+
+python3 mbench/make_ref-ytvos_json.py
davis2017/results.py
ADDED
@@ -0,0 +1,31 @@
+import os
+import numpy as np
+from PIL import Image
+import sys
+
+
+class Results(object):
+    def __init__(self, root_dir):
+        self.root_dir = root_dir
+
+    def _read_mask(self, sequence, frame_id):
+        try:
+            mask_path = os.path.join(self.root_dir, sequence, f'{frame_id}.png')
+            return np.array(Image.open(mask_path))
+        except IOError as err:
+            sys.stdout.write(sequence + " frame %s not found!\n" % frame_id)
+            sys.stdout.write("The frames have to be indexed PNG files placed inside the corespondent sequence "
+                             "folder.\nThe indexes have to match with the initial frame.\n")
+            sys.stderr.write("IOError: " + err.strerror + "\n")
+            sys.exit()
+
+    def read_masks(self, sequence, masks_id):
+        mask_0 = self._read_mask(sequence, masks_id[0])
+        masks = np.zeros((len(masks_id), *mask_0.shape))
+        for ii, m in enumerate(masks_id):
+            masks[ii, ...] = self._read_mask(sequence, m)
+        num_objects = int(np.max(masks))
+        tmp = np.ones((num_objects, *masks.shape))
+        tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None]
+        masks = (tmp == masks[None, ...]) > 0
+        return masks
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/7528dbb1b6ce860d242aff71294a5fef12a41572.lock
ADDED
File without changes
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/cc6c13cb9acd48b061e2d2664a50963c338b4998.lock
ADDED
File without changes
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e7dbc990f8ede75b1ad2fd17028fbd89a950286a.lock
ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors
ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/.no_exist/89bad6fffe1126b24d4360c1e1f69145eb6103aa/model.safetensors.index.json
ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/7528dbb1b6ce860d242aff71294a5fef12a41572
ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.31.0"
+}