Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .history/datasets/__init___20250113130146.py +38 -0
- .history/datasets/ytvos_ref_20250113161625.py +243 -0
- .history/datasets/ytvos_ref_20250113161634.py +242 -0
- .history/datasets/ytvos_ref_20250113162627.py +242 -0
- .history/datasets/ytvos_ref_20250113163106.py +244 -0
- .history/datasets/ytvos_ref_20250113163121.py +245 -0
- .history/datasets/ytvos_ref_20250113163340.py +249 -0
- .history/datasets/ytvos_ref_20250113163347.py +249 -0
- .history/datasets/ytvos_ref_20250114202456.py +251 -0
- .history/datasets/ytvos_ref_20250114205130.py +250 -0
- .history/datasets/ytvos_ref_20250114211235.py +252 -0
- .history/datasets/ytvos_ref_20250114211331.py +250 -0
- .history/datasets/ytvos_ref_20250114211640.py +242 -0
- .history/datasets/ytvos_ref_20250114211841.py +242 -0
- .history/datasets/ytvos_ref_20250114212623.py +242 -0
- .history/datasets/ytvos_ref_20250116071135.py +240 -0
- .history/datasets/ytvos_ref_20250116071255.py +239 -0
- .history/datasets/ytvos_ref_20250116071502.py +240 -0
- .history/datasets/ytvos_ref_20250116071546.py +240 -0
- .history/datasets/ytvos_ref_20250116071553.py +240 -0
- .history/datasets/ytvos_ref_20250116071841.py +239 -0
- .history/datasets/ytvos_ref_20250116072442.py +241 -0
- .history/slurm_script/mbench_ref-ytvos_json_20250113182526.sh +0 -0
- LICENSE +201 -0
- README.md +214 -0
- davis2017/__init__.py +3 -0
- davis2017/evaluation.py +110 -0
- davis2017/metrics.py +197 -0
- docs/A2D-Sentences.md +55 -0
- docs/JHMDB-Sentences.md +27 -0
- docs/Ref-DAVIS17.md +24 -0
- docs/Ref-Youtube-VOS.md +83 -0
- docs/data.md +127 -0
- engine.py +253 -0
- eval_davis.py +68 -0
- jptr_chaeyun.txt +179 -0
- make_ref-ytvos/annotate_ref_ytvos.py +288 -0
- make_ref-ytvos/folder2lmdb.py +109 -0
- make_ref-ytvos/manual_selected_frames.jsonl +101 -0
- make_ref-ytvos/review_images.ipynb +0 -0
- make_ref-ytvos/revised_frames.jsonl +0 -0
- make_ref-ytvos/selected_frames.jsonl +0 -0
- mbench/result.json +465 -0
- models/__init__.py +5 -0
- models/backbone.py +132 -0
- models/criterion.py +208 -0
- models/deformable_transformer.py +444 -0
- models/matcher.py +206 -0
- models/ops/make.sh +10 -0
- models/ops/modules/__init__.py +9 -0
.history/datasets/__init___20250113130146.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.utils.data
|
2 |
+
import torchvision
|
3 |
+
|
4 |
+
from .ytvos import build as build_ytvos
|
5 |
+
from .ytvos_ref import build as build_ytvos_ref
|
6 |
+
from .davis import build as build_davis
|
7 |
+
from .a2d import build as build_a2d
|
8 |
+
from .jhmdb import build as build_jhmdb
|
9 |
+
from .refexp import build as build_refexp
|
10 |
+
from .concat_dataset import build as build_joint
|
11 |
+
|
12 |
+
|
13 |
+
def get_coco_api_from_dataset(dataset):
|
14 |
+
for _ in range(10):
|
15 |
+
# if isinstance(dataset, torchvision.datasets.CocoDetection):
|
16 |
+
# break
|
17 |
+
if isinstance(dataset, torch.utils.data.Subset):
|
18 |
+
dataset = dataset.dataset
|
19 |
+
if isinstance(dataset, torchvision.datasets.CocoDetection):
|
20 |
+
return dataset.coco
|
21 |
+
|
22 |
+
|
23 |
+
def build_dataset(dataset_file: str, image_set: str, args):
|
24 |
+
if dataset_file == 'ytvos':
|
25 |
+
return build_ytvos(image_set, args)
|
26 |
+
if dataset_file == 'davis':
|
27 |
+
return build_davis(image_set, args)
|
28 |
+
if dataset_file == 'a2d':
|
29 |
+
return build_a2d(image_set, args)
|
30 |
+
if dataset_file == 'jhmdb':
|
31 |
+
return build_jhmdb(image_set, args)
|
32 |
+
# for pretraining
|
33 |
+
if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog":
|
34 |
+
return build_refexp(dataset_file, image_set, args)
|
35 |
+
# for joint training of refcoco and ytvos
|
36 |
+
if dataset_file == 'joint':
|
37 |
+
return build_joint(image_set, args)
|
38 |
+
raise ValueError(f'dataset {dataset_file} not supported')
|
.history/datasets/ytvos_ref_20250113161625.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
for vid in self.videos:
|
61 |
+
vid_meta = subset_metas_by_video[vid]
|
62 |
+
vid_data = subset_expressions_by_video[vid]
|
63 |
+
vid_frames = sorted(vid_data['frames'])
|
64 |
+
vid_len = len(vid_frames)
|
65 |
+
|
66 |
+
|
67 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
68 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
69 |
+
start_idx , end_idx = 2, vid_len-2
|
70 |
+
bin_size = (end_idx - start_idx) // 4
|
71 |
+
|
72 |
+
bins = []
|
73 |
+
for i in range(4):
|
74 |
+
bin_start = start_idx + i * bin_size
|
75 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
76 |
+
|
77 |
+
bins.append((bin_start, bin_end))
|
78 |
+
|
79 |
+
for bin_id in range(len(bins)):
|
80 |
+
start_idx, end_idx = bins[bin_id]
|
81 |
+
frame_id = random.randint(start_idx, end_idx - 1)
|
82 |
+
|
83 |
+
meta = {
|
84 |
+
'video': vid,
|
85 |
+
'exp': exp_dict['exp'],
|
86 |
+
'obj_id': int(exp_dict['obj_id']),
|
87 |
+
'frames': vid_frames,
|
88 |
+
'frame_id' : frame_id,
|
89 |
+
'bins': bins,
|
90 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
91 |
+
}
|
92 |
+
self.metas.append(meta)
|
93 |
+
|
94 |
+
|
95 |
+
@staticmethod
|
96 |
+
def bounding_box(img):
|
97 |
+
rows = np.any(img, axis=1)
|
98 |
+
cols = np.any(img, axis=0)
|
99 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
100 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
101 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
102 |
+
|
103 |
+
def __len__(self):
|
104 |
+
return len(self.metas)
|
105 |
+
|
106 |
+
def __getitem__(self, idx):
|
107 |
+
instance_check = False
|
108 |
+
while not instance_check:
|
109 |
+
meta = self.metas[idx] # dict
|
110 |
+
|
111 |
+
|
112 |
+
video, exp, obj_id, category, frames, bins = \
|
113 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
|
114 |
+
|
115 |
+
|
116 |
+
# clean up the caption
|
117 |
+
exp = " ".join(exp.lower().split())
|
118 |
+
category_id = category_dict[category]
|
119 |
+
vid_len = len(frames)
|
120 |
+
|
121 |
+
# num_frames = self.num_frames
|
122 |
+
|
123 |
+
# Random sample one frame from each bin
|
124 |
+
sample_indx = []
|
125 |
+
for start_idx, end_idx in bins:
|
126 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
127 |
+
sample_indx.sort() # Ensure indices are in order
|
128 |
+
|
129 |
+
# read frames and masks
|
130 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
131 |
+
for frame_indx in sample_indx:
|
132 |
+
frame_name = frames[frame_indx]
|
133 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
134 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
135 |
+
img = Image.open(img_path).convert('RGB')
|
136 |
+
mask = Image.open(mask_path).convert('P')
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
label = torch.tensor(category_id)
|
140 |
+
mask = np.array(mask)
|
141 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
142 |
+
if (mask > 0).any():
|
143 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
144 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
145 |
+
valid.append(1)
|
146 |
+
else: # some frame didn't contain the instance
|
147 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
148 |
+
valid.append(0)
|
149 |
+
mask = torch.from_numpy(mask)
|
150 |
+
|
151 |
+
# append
|
152 |
+
imgs.append(img)
|
153 |
+
labels.append(label)
|
154 |
+
masks.append(mask)
|
155 |
+
boxes.append(box)
|
156 |
+
|
157 |
+
# transform
|
158 |
+
w, h = img.size
|
159 |
+
labels = torch.stack(labels, dim=0)
|
160 |
+
boxes = torch.stack(boxes, dim=0)
|
161 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
162 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
163 |
+
masks = torch.stack(masks, dim=0)
|
164 |
+
target = {
|
165 |
+
'frames_idx': torch.tensor(sample_indx), # [T,]
|
166 |
+
'labels': labels, # [T,]
|
167 |
+
'boxes': boxes, # [T, 4], xyxy
|
168 |
+
'masks': masks, # [T, H, W]
|
169 |
+
'valid': torch.tensor(valid), # [T,]
|
170 |
+
'caption': exp,
|
171 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
172 |
+
'size': torch.as_tensor([int(h), int(w)])
|
173 |
+
}
|
174 |
+
|
175 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
176 |
+
if self._transforms:
|
177 |
+
imgs, target = self._transforms(imgs, target)
|
178 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
179 |
+
else:
|
180 |
+
imgs = np.array(imgs)
|
181 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
182 |
+
|
183 |
+
|
184 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
185 |
+
if torch.any(target['valid'] == 1): # at leatst one instance
|
186 |
+
instance_check = True
|
187 |
+
else:
|
188 |
+
idx = random.randint(0, self.__len__() - 1)
|
189 |
+
|
190 |
+
return imgs, target
|
191 |
+
|
192 |
+
|
193 |
+
def make_coco_transforms(image_set, max_size=640):
|
194 |
+
normalize = T.Compose([
|
195 |
+
T.ToTensor(),
|
196 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
197 |
+
])
|
198 |
+
|
199 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
200 |
+
|
201 |
+
if image_set == 'train':
|
202 |
+
return T.Compose([
|
203 |
+
T.RandomHorizontalFlip(),
|
204 |
+
T.PhotometricDistort(),
|
205 |
+
T.RandomSelect(
|
206 |
+
T.Compose([
|
207 |
+
T.RandomResize(scales, max_size=max_size),
|
208 |
+
T.Check(),
|
209 |
+
]),
|
210 |
+
T.Compose([
|
211 |
+
T.RandomResize([400, 500, 600]),
|
212 |
+
T.RandomSizeCrop(384, 600),
|
213 |
+
T.RandomResize(scales, max_size=max_size),
|
214 |
+
T.Check(),
|
215 |
+
])
|
216 |
+
),
|
217 |
+
normalize,
|
218 |
+
])
|
219 |
+
|
220 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
221 |
+
if image_set == 'val':
|
222 |
+
return T.Compose([
|
223 |
+
T.RandomResize([360], max_size=640),
|
224 |
+
normalize,
|
225 |
+
])
|
226 |
+
|
227 |
+
raise ValueError(f'unknown {image_set}')
|
228 |
+
|
229 |
+
|
230 |
+
def build(image_set, args):
|
231 |
+
root = Path(args.ytvos_path)
|
232 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
233 |
+
PATHS = {
|
234 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
235 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
236 |
+
}
|
237 |
+
img_folder, ann_file = PATHS[image_set]
|
238 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
239 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
240 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
241 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
242 |
+
return dataset
|
243 |
+
|
.history/datasets/ytvos_ref_20250113161634.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
for vid in self.videos:
|
61 |
+
vid_meta = subset_metas_by_video[vid]
|
62 |
+
vid_data = subset_expressions_by_video[vid]
|
63 |
+
vid_frames = sorted(vid_data['frames'])
|
64 |
+
vid_len = len(vid_frames)
|
65 |
+
|
66 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
67 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
68 |
+
start_idx , end_idx = 2, vid_len-2
|
69 |
+
bin_size = (end_idx - start_idx) // 4
|
70 |
+
|
71 |
+
bins = []
|
72 |
+
for i in range(4):
|
73 |
+
bin_start = start_idx + i * bin_size
|
74 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
75 |
+
|
76 |
+
bins.append((bin_start, bin_end))
|
77 |
+
|
78 |
+
for bin_id in range(len(bins)):
|
79 |
+
start_idx, end_idx = bins[bin_id]
|
80 |
+
frame_id = random.randint(start_idx, end_idx - 1)
|
81 |
+
|
82 |
+
meta = {
|
83 |
+
'video': vid,
|
84 |
+
'exp': exp_dict['exp'],
|
85 |
+
'obj_id': int(exp_dict['obj_id']),
|
86 |
+
'frames': vid_frames,
|
87 |
+
'frame_id' : frame_id,
|
88 |
+
'bins': bins,
|
89 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
90 |
+
}
|
91 |
+
self.metas.append(meta)
|
92 |
+
|
93 |
+
|
94 |
+
@staticmethod
|
95 |
+
def bounding_box(img):
|
96 |
+
rows = np.any(img, axis=1)
|
97 |
+
cols = np.any(img, axis=0)
|
98 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
99 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
100 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
101 |
+
|
102 |
+
def __len__(self):
|
103 |
+
return len(self.metas)
|
104 |
+
|
105 |
+
def __getitem__(self, idx):
|
106 |
+
instance_check = False
|
107 |
+
while not instance_check:
|
108 |
+
meta = self.metas[idx] # dict
|
109 |
+
|
110 |
+
|
111 |
+
video, exp, obj_id, category, frames, bins = \
|
112 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']
|
113 |
+
|
114 |
+
|
115 |
+
# clean up the caption
|
116 |
+
exp = " ".join(exp.lower().split())
|
117 |
+
category_id = category_dict[category]
|
118 |
+
vid_len = len(frames)
|
119 |
+
|
120 |
+
# num_frames = self.num_frames
|
121 |
+
|
122 |
+
# Random sample one frame from each bin
|
123 |
+
sample_indx = []
|
124 |
+
for start_idx, end_idx in bins:
|
125 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
126 |
+
sample_indx.sort() # Ensure indices are in order
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
|
137 |
+
# create the target
|
138 |
+
label = torch.tensor(category_id)
|
139 |
+
mask = np.array(mask)
|
140 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
mask = torch.from_numpy(mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
labels.append(label)
|
153 |
+
masks.append(mask)
|
154 |
+
boxes.append(box)
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
labels = torch.stack(labels, dim=0)
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': torch.tensor(sample_indx), # [T,]
|
165 |
+
'labels': labels, # [T,]
|
166 |
+
'boxes': boxes, # [T, 4], xyxy
|
167 |
+
'masks': masks, # [T, H, W]
|
168 |
+
'valid': torch.tensor(valid), # [T,]
|
169 |
+
'caption': exp,
|
170 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
171 |
+
'size': torch.as_tensor([int(h), int(w)])
|
172 |
+
}
|
173 |
+
|
174 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
175 |
+
if self._transforms:
|
176 |
+
imgs, target = self._transforms(imgs, target)
|
177 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
178 |
+
else:
|
179 |
+
imgs = np.array(imgs)
|
180 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
181 |
+
|
182 |
+
|
183 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
184 |
+
if torch.any(target['valid'] == 1): # at leatst one instance
|
185 |
+
instance_check = True
|
186 |
+
else:
|
187 |
+
idx = random.randint(0, self.__len__() - 1)
|
188 |
+
|
189 |
+
return imgs, target
|
190 |
+
|
191 |
+
|
192 |
+
def make_coco_transforms(image_set, max_size=640):
|
193 |
+
normalize = T.Compose([
|
194 |
+
T.ToTensor(),
|
195 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
196 |
+
])
|
197 |
+
|
198 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
199 |
+
|
200 |
+
if image_set == 'train':
|
201 |
+
return T.Compose([
|
202 |
+
T.RandomHorizontalFlip(),
|
203 |
+
T.PhotometricDistort(),
|
204 |
+
T.RandomSelect(
|
205 |
+
T.Compose([
|
206 |
+
T.RandomResize(scales, max_size=max_size),
|
207 |
+
T.Check(),
|
208 |
+
]),
|
209 |
+
T.Compose([
|
210 |
+
T.RandomResize([400, 500, 600]),
|
211 |
+
T.RandomSizeCrop(384, 600),
|
212 |
+
T.RandomResize(scales, max_size=max_size),
|
213 |
+
T.Check(),
|
214 |
+
])
|
215 |
+
),
|
216 |
+
normalize,
|
217 |
+
])
|
218 |
+
|
219 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
220 |
+
if image_set == 'val':
|
221 |
+
return T.Compose([
|
222 |
+
T.RandomResize([360], max_size=640),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
raise ValueError(f'unknown {image_set}')
|
227 |
+
|
228 |
+
|
229 |
+
def build(image_set, args):
|
230 |
+
root = Path(args.ytvos_path)
|
231 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
232 |
+
PATHS = {
|
233 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
234 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
235 |
+
}
|
236 |
+
img_folder, ann_file = PATHS[image_set]
|
237 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
238 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
240 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
241 |
+
return dataset
|
242 |
+
|
.history/datasets/ytvos_ref_20250113162627.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
for vid in self.videos:
|
61 |
+
vid_meta = subset_metas_by_video[vid]
|
62 |
+
vid_data = subset_expressions_by_video[vid]
|
63 |
+
vid_frames = sorted(vid_data['frames'])
|
64 |
+
vid_len = len(vid_frames)
|
65 |
+
|
66 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
67 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
68 |
+
start_idx , end_idx = 2, vid_len-2
|
69 |
+
bin_size = (end_idx - start_idx) // 4
|
70 |
+
|
71 |
+
bins = []
|
72 |
+
for i in range(4):
|
73 |
+
bin_start = start_idx + i * bin_size
|
74 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
75 |
+
|
76 |
+
bins.append((bin_start, bin_end))
|
77 |
+
|
78 |
+
# Random sample one frame from each bin
|
79 |
+
sample_indx = []
|
80 |
+
for start_idx, end_idx in bins:
|
81 |
+
print(start_idx, end_idx)
|
82 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
83 |
+
sample_indx.sort() # Ensure indices are in order
|
84 |
+
|
85 |
+
|
86 |
+
for frame_id in sample_indx:
|
87 |
+
meta = {
|
88 |
+
'video': vid,
|
89 |
+
'exp': exp_dict['exp'],
|
90 |
+
'obj_id': int(exp_dict['obj_id']),
|
91 |
+
'frames': vid_frames,
|
92 |
+
'frame_id' : frame_id,
|
93 |
+
'sample_frames_id' : sample_indx,
|
94 |
+
'bins': bins,
|
95 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
96 |
+
}
|
97 |
+
self.metas.append(meta)
|
98 |
+
|
99 |
+
|
100 |
+
@staticmethod
|
101 |
+
def bounding_box(img):
|
102 |
+
rows = np.any(img, axis=1)
|
103 |
+
cols = np.any(img, axis=0)
|
104 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
105 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
106 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
107 |
+
|
108 |
+
def __len__(self):
|
109 |
+
return len(self.metas)
|
110 |
+
|
111 |
+
def __getitem__(self, idx):
|
112 |
+
instance_check = False
|
113 |
+
while not instance_check:
|
114 |
+
meta = self.metas[idx] # dict
|
115 |
+
|
116 |
+
|
117 |
+
video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
|
118 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
|
119 |
+
|
120 |
+
|
121 |
+
# clean up the caption
|
122 |
+
exp = " ".join(exp.lower().split())
|
123 |
+
category_id = category_dict[category]
|
124 |
+
vid_len = len(frames)
|
125 |
+
|
126 |
+
# num_frames = self.num_frames
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_frames_id:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
|
137 |
+
# create the target
|
138 |
+
label = torch.tensor(category_id)
|
139 |
+
mask = np.array(mask)
|
140 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
mask = torch.from_numpy(mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
labels.append(label)
|
153 |
+
masks.append(mask)
|
154 |
+
boxes.append(box)
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
labels = torch.stack(labels, dim=0)
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
165 |
+
'labels': labels, # [T,]
|
166 |
+
'boxes': boxes, # [T, 4], xyxy
|
167 |
+
'masks': masks, # [T, H, W]
|
168 |
+
'valid': torch.tensor(valid), # [T,]
|
169 |
+
'caption': exp,
|
170 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
171 |
+
'size': torch.as_tensor([int(h), int(w)])
|
172 |
+
}
|
173 |
+
|
174 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
175 |
+
if self._transforms:
|
176 |
+
imgs, target = self._transforms(imgs, target)
|
177 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
178 |
+
else:
|
179 |
+
imgs = np.array(imgs)
|
180 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
181 |
+
|
182 |
+
|
183 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
184 |
+
if torch.any(target['valid'] == 1): # at leatst one instance
|
185 |
+
instance_check = True
|
186 |
+
else:
|
187 |
+
idx = random.randint(0, self.__len__() - 1)
|
188 |
+
|
189 |
+
return imgs, target
|
190 |
+
|
191 |
+
|
192 |
+
def make_coco_transforms(image_set, max_size=640):
|
193 |
+
normalize = T.Compose([
|
194 |
+
T.ToTensor(),
|
195 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
196 |
+
])
|
197 |
+
|
198 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
199 |
+
|
200 |
+
if image_set == 'train':
|
201 |
+
return T.Compose([
|
202 |
+
T.RandomHorizontalFlip(),
|
203 |
+
T.PhotometricDistort(),
|
204 |
+
T.RandomSelect(
|
205 |
+
T.Compose([
|
206 |
+
T.RandomResize(scales, max_size=max_size),
|
207 |
+
T.Check(),
|
208 |
+
]),
|
209 |
+
T.Compose([
|
210 |
+
T.RandomResize([400, 500, 600]),
|
211 |
+
T.RandomSizeCrop(384, 600),
|
212 |
+
T.RandomResize(scales, max_size=max_size),
|
213 |
+
T.Check(),
|
214 |
+
])
|
215 |
+
),
|
216 |
+
normalize,
|
217 |
+
])
|
218 |
+
|
219 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
220 |
+
if image_set == 'val':
|
221 |
+
return T.Compose([
|
222 |
+
T.RandomResize([360], max_size=640),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
raise ValueError(f'unknown {image_set}')
|
227 |
+
|
228 |
+
|
229 |
+
def build(image_set, args):
|
230 |
+
root = Path(args.ytvos_path)
|
231 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
232 |
+
PATHS = {
|
233 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
234 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
235 |
+
}
|
236 |
+
img_folder, ann_file = PATHS[image_set]
|
237 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
238 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
240 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
241 |
+
return dataset
|
242 |
+
|
.history/datasets/ytvos_ref_20250113163106.py
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
for vid in self.videos:
|
61 |
+
vid_meta = subset_metas_by_video[vid]
|
62 |
+
vid_data = subset_expressions_by_video[vid]
|
63 |
+
vid_frames = sorted(vid_data['frames'])
|
64 |
+
vid_len = len(vid_frames)
|
65 |
+
|
66 |
+
if vid_len < 11:
|
67 |
+
continue
|
68 |
+
|
69 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
70 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
71 |
+
start_idx , end_idx = 2, vid_len-2
|
72 |
+
bin_size = (end_idx - start_idx) // 4
|
73 |
+
|
74 |
+
bins = []
|
75 |
+
for i in range(4):
|
76 |
+
bin_start = start_idx + i * bin_size
|
77 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
78 |
+
|
79 |
+
bins.append((bin_start, bin_end))
|
80 |
+
|
81 |
+
# Random sample one frame from each bin
|
82 |
+
sample_indx = []
|
83 |
+
for start_idx, end_idx in bins:
|
84 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
85 |
+
sample_indx.sort() # Ensure indices are in order
|
86 |
+
|
87 |
+
|
88 |
+
for frame_id in sample_indx:
|
89 |
+
meta = {
|
90 |
+
'video': vid,
|
91 |
+
'exp': exp_dict['exp'],
|
92 |
+
'obj_id': int(exp_dict['obj_id']),
|
93 |
+
'frames': vid_frames,
|
94 |
+
'frame_id' : frame_id,
|
95 |
+
'sample_frames_id' : sample_indx,
|
96 |
+
'bins': bins,
|
97 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
98 |
+
}
|
99 |
+
self.metas.append(meta)
|
100 |
+
|
101 |
+
|
102 |
+
@staticmethod
|
103 |
+
def bounding_box(img):
|
104 |
+
rows = np.any(img, axis=1)
|
105 |
+
cols = np.any(img, axis=0)
|
106 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
107 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
108 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
109 |
+
|
110 |
+
def __len__(self):
|
111 |
+
return len(self.metas)
|
112 |
+
|
113 |
+
def __getitem__(self, idx):
|
114 |
+
instance_check = False
|
115 |
+
while not instance_check:
|
116 |
+
meta = self.metas[idx] # dict
|
117 |
+
|
118 |
+
|
119 |
+
video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
|
120 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
|
121 |
+
|
122 |
+
|
123 |
+
# clean up the caption
|
124 |
+
exp = " ".join(exp.lower().split())
|
125 |
+
category_id = category_dict[category]
|
126 |
+
vid_len = len(frames)
|
127 |
+
|
128 |
+
# num_frames = self.num_frames
|
129 |
+
|
130 |
+
# read frames and masks
|
131 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
132 |
+
for frame_indx in sample_frames_id:
|
133 |
+
frame_name = frames[frame_indx]
|
134 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
135 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
136 |
+
img = Image.open(img_path).convert('RGB')
|
137 |
+
mask = Image.open(mask_path).convert('P')
|
138 |
+
|
139 |
+
# create the target
|
140 |
+
label = torch.tensor(category_id)
|
141 |
+
mask = np.array(mask)
|
142 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
143 |
+
if (mask > 0).any():
|
144 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
145 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
146 |
+
valid.append(1)
|
147 |
+
else: # some frame didn't contain the instance
|
148 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
149 |
+
valid.append(0)
|
150 |
+
mask = torch.from_numpy(mask)
|
151 |
+
|
152 |
+
# append
|
153 |
+
imgs.append(img)
|
154 |
+
labels.append(label)
|
155 |
+
masks.append(mask)
|
156 |
+
boxes.append(box)
|
157 |
+
|
158 |
+
# transform
|
159 |
+
w, h = img.size
|
160 |
+
labels = torch.stack(labels, dim=0)
|
161 |
+
boxes = torch.stack(boxes, dim=0)
|
162 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
163 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
164 |
+
masks = torch.stack(masks, dim=0)
|
165 |
+
target = {
|
166 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
167 |
+
'labels': labels, # [T,]
|
168 |
+
'boxes': boxes, # [T, 4], xyxy
|
169 |
+
'masks': masks, # [T, H, W]
|
170 |
+
'valid': torch.tensor(valid), # [T,]
|
171 |
+
'caption': exp,
|
172 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
173 |
+
'size': torch.as_tensor([int(h), int(w)])
|
174 |
+
}
|
175 |
+
|
176 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
177 |
+
if self._transforms:
|
178 |
+
imgs, target = self._transforms(imgs, target)
|
179 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
180 |
+
else:
|
181 |
+
imgs = np.array(imgs)
|
182 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
183 |
+
|
184 |
+
|
185 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
186 |
+
if torch.any(target['valid'] == 1): # at leatst one instance
|
187 |
+
instance_check = True
|
188 |
+
else:
|
189 |
+
idx = random.randint(0, self.__len__() - 1)
|
190 |
+
|
191 |
+
return imgs, target
|
192 |
+
|
193 |
+
|
194 |
+
def make_coco_transforms(image_set, max_size=640):
|
195 |
+
normalize = T.Compose([
|
196 |
+
T.ToTensor(),
|
197 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
198 |
+
])
|
199 |
+
|
200 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
201 |
+
|
202 |
+
if image_set == 'train':
|
203 |
+
return T.Compose([
|
204 |
+
T.RandomHorizontalFlip(),
|
205 |
+
T.PhotometricDistort(),
|
206 |
+
T.RandomSelect(
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize(scales, max_size=max_size),
|
209 |
+
T.Check(),
|
210 |
+
]),
|
211 |
+
T.Compose([
|
212 |
+
T.RandomResize([400, 500, 600]),
|
213 |
+
T.RandomSizeCrop(384, 600),
|
214 |
+
T.RandomResize(scales, max_size=max_size),
|
215 |
+
T.Check(),
|
216 |
+
])
|
217 |
+
),
|
218 |
+
normalize,
|
219 |
+
])
|
220 |
+
|
221 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
222 |
+
if image_set == 'val':
|
223 |
+
return T.Compose([
|
224 |
+
T.RandomResize([360], max_size=640),
|
225 |
+
normalize,
|
226 |
+
])
|
227 |
+
|
228 |
+
raise ValueError(f'unknown {image_set}')
|
229 |
+
|
230 |
+
|
231 |
+
def build(image_set, args):
|
232 |
+
root = Path(args.ytvos_path)
|
233 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
234 |
+
PATHS = {
|
235 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
236 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
237 |
+
}
|
238 |
+
img_folder, ann_file = PATHS[image_set]
|
239 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
240 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
241 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
242 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
243 |
+
return dataset
|
244 |
+
|
.history/datasets/ytvos_ref_20250113163121.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
for vid in self.videos:
|
61 |
+
vid_meta = subset_metas_by_video[vid]
|
62 |
+
vid_data = subset_expressions_by_video[vid]
|
63 |
+
vid_frames = sorted(vid_data['frames'])
|
64 |
+
vid_len = len(vid_frames)
|
65 |
+
|
66 |
+
if vid_len < 11:
|
67 |
+
print(f"Too short video: {vid}")
|
68 |
+
continue
|
69 |
+
|
70 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
71 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
72 |
+
start_idx , end_idx = 2, vid_len-2
|
73 |
+
bin_size = (end_idx - start_idx) // 4
|
74 |
+
|
75 |
+
bins = []
|
76 |
+
for i in range(4):
|
77 |
+
bin_start = start_idx + i * bin_size
|
78 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
79 |
+
|
80 |
+
bins.append((bin_start, bin_end))
|
81 |
+
|
82 |
+
# Random sample one frame from each bin
|
83 |
+
sample_indx = []
|
84 |
+
for start_idx, end_idx in bins:
|
85 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
86 |
+
sample_indx.sort() # Ensure indices are in order
|
87 |
+
|
88 |
+
|
89 |
+
for frame_id in sample_indx:
|
90 |
+
meta = {
|
91 |
+
'video': vid,
|
92 |
+
'exp': exp_dict['exp'],
|
93 |
+
'obj_id': int(exp_dict['obj_id']),
|
94 |
+
'frames': vid_frames,
|
95 |
+
'frame_id' : frame_id,
|
96 |
+
'sample_frames_id' : sample_indx,
|
97 |
+
'bins': bins,
|
98 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
99 |
+
}
|
100 |
+
self.metas.append(meta)
|
101 |
+
|
102 |
+
|
103 |
+
@staticmethod
|
104 |
+
def bounding_box(img):
|
105 |
+
rows = np.any(img, axis=1)
|
106 |
+
cols = np.any(img, axis=0)
|
107 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
108 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
109 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
110 |
+
|
111 |
+
def __len__(self):
|
112 |
+
return len(self.metas)
|
113 |
+
|
114 |
+
def __getitem__(self, idx):
|
115 |
+
instance_check = False
|
116 |
+
while not instance_check:
|
117 |
+
meta = self.metas[idx] # dict
|
118 |
+
|
119 |
+
|
120 |
+
video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
|
121 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
|
122 |
+
|
123 |
+
|
124 |
+
# clean up the caption
|
125 |
+
exp = " ".join(exp.lower().split())
|
126 |
+
category_id = category_dict[category]
|
127 |
+
vid_len = len(frames)
|
128 |
+
|
129 |
+
# num_frames = self.num_frames
|
130 |
+
|
131 |
+
# read frames and masks
|
132 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
133 |
+
for frame_indx in sample_frames_id:
|
134 |
+
frame_name = frames[frame_indx]
|
135 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
136 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
137 |
+
img = Image.open(img_path).convert('RGB')
|
138 |
+
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frames don't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_frames_id),   # [T,]
                'labels': labels,                               # [T,]
                'boxes': boxes,                                 # [T, 4], xyxy
                'masks': masks,                                 # [T, H, W]
                'valid': torch.tensor(valid),                   # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" are normalized to [0, 1] and converted from xyxy to cxcywh in self._transforms
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some boxes may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not actually used
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
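For orientation, a minimal usage sketch of the loader defined above (not part of the diff): only the attribute names read by build() (ytvos_path, masks, num_frames, max_skip) come from the code itself; the namespace, the path value, and the import location are assumptions.

from types import SimpleNamespace

from datasets.ytvos_ref import build  # assumes datasets/ytvos_ref.py is importable from this repo

# Illustrative arguments only; the real training script defines these via argparse.
args = SimpleNamespace(ytvos_path='data/ref-youtube-vos', masks=True,
                       num_frames=4, max_skip=3)
train_dataset = build('train', args)   # YTVOSDataset built with transforms=None
imgs, target = train_dataset[0]        # imgs: [T, 3, H, W]; target: per-frame labels/boxes/masks/valid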
.history/datasets/ytvos_ref_20250113163340.py
ADDED
@@ -0,0 +1,249 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
for frame_id in sample_indx:
|
93 |
+
meta = {
|
94 |
+
'video': vid,
|
95 |
+
'exp': exp_dict['exp'],
|
96 |
+
'obj_id': int(exp_dict['obj_id']),
|
97 |
+
'frames': vid_frames,
|
98 |
+
'frame_id' : frame_id,
|
99 |
+
'sample_frames_id' : sample_indx,
|
100 |
+
'bins': bins,
|
101 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
102 |
+
}
|
103 |
+
self.metas.append(meta)
|
104 |
+
print(skip_vid_count)
|
105 |
+
|
106 |
+
|
107 |
+
@staticmethod
|
108 |
+
def bounding_box(img):
|
109 |
+
rows = np.any(img, axis=1)
|
110 |
+
cols = np.any(img, axis=0)
|
111 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
112 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
113 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
114 |
+
|
115 |
+
def __len__(self):
|
116 |
+
return len(self.metas)
|
117 |
+
|
118 |
+
def __getitem__(self, idx):
|
119 |
+
instance_check = False
|
120 |
+
while not instance_check:
|
121 |
+
meta = self.metas[idx] # dict
|
122 |
+
|
123 |
+
|
124 |
+
video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
|
125 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
|
126 |
+
|
127 |
+
|
128 |
+
# clean up the caption
|
129 |
+
exp = " ".join(exp.lower().split())
|
130 |
+
category_id = category_dict[category]
|
131 |
+
vid_len = len(frames)
|
132 |
+
|
133 |
+
# num_frames = self.num_frames
|
134 |
+
|
135 |
+
# read frames and masks
|
136 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
137 |
+
for frame_indx in sample_frames_id:
|
138 |
+
frame_name = frames[frame_indx]
|
139 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
140 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
141 |
+
img = Image.open(img_path).convert('RGB')
|
142 |
+
mask = Image.open(mask_path).convert('P')
|
143 |
+
|
144 |
+
# create the target
|
145 |
+
label = torch.tensor(category_id)
|
146 |
+
mask = np.array(mask)
|
147 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
148 |
+
if (mask > 0).any():
|
149 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
150 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
151 |
+
valid.append(1)
|
152 |
+
else: # some frames don't contain the instance
|
153 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
154 |
+
valid.append(0)
|
155 |
+
mask = torch.from_numpy(mask)
|
156 |
+
|
157 |
+
# append
|
158 |
+
imgs.append(img)
|
159 |
+
labels.append(label)
|
160 |
+
masks.append(mask)
|
161 |
+
boxes.append(box)
|
162 |
+
|
163 |
+
# transform
|
164 |
+
w, h = img.size
|
165 |
+
labels = torch.stack(labels, dim=0)
|
166 |
+
boxes = torch.stack(boxes, dim=0)
|
167 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
168 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
169 |
+
masks = torch.stack(masks, dim=0)
|
170 |
+
target = {
|
171 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
172 |
+
'labels': labels, # [T,]
|
173 |
+
'boxes': boxes, # [T, 4], xyxy
|
174 |
+
'masks': masks, # [T, H, W]
|
175 |
+
'valid': torch.tensor(valid), # [T,]
|
176 |
+
'caption': exp,
|
177 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
178 |
+
'size': torch.as_tensor([int(h), int(w)])
|
179 |
+
}
|
180 |
+
|
181 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
182 |
+
if self._transforms:
|
183 |
+
imgs, target = self._transforms(imgs, target)
|
184 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
185 |
+
else:
|
186 |
+
imgs = np.array(imgs)
|
187 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
188 |
+
|
189 |
+
|
190 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
191 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
192 |
+
instance_check = True
|
193 |
+
else:
|
194 |
+
idx = random.randint(0, self.__len__() - 1)
|
195 |
+
|
196 |
+
return imgs, target
|
197 |
+
|
198 |
+
|
199 |
+
def make_coco_transforms(image_set, max_size=640):
|
200 |
+
normalize = T.Compose([
|
201 |
+
T.ToTensor(),
|
202 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
203 |
+
])
|
204 |
+
|
205 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
206 |
+
|
207 |
+
if image_set == 'train':
|
208 |
+
return T.Compose([
|
209 |
+
T.RandomHorizontalFlip(),
|
210 |
+
T.PhotometricDistort(),
|
211 |
+
T.RandomSelect(
|
212 |
+
T.Compose([
|
213 |
+
T.RandomResize(scales, max_size=max_size),
|
214 |
+
T.Check(),
|
215 |
+
]),
|
216 |
+
T.Compose([
|
217 |
+
T.RandomResize([400, 500, 600]),
|
218 |
+
T.RandomSizeCrop(384, 600),
|
219 |
+
T.RandomResize(scales, max_size=max_size),
|
220 |
+
T.Check(),
|
221 |
+
])
|
222 |
+
),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
227 |
+
if image_set == 'val':
|
228 |
+
return T.Compose([
|
229 |
+
T.RandomResize([360], max_size=640),
|
230 |
+
normalize,
|
231 |
+
])
|
232 |
+
|
233 |
+
raise ValueError(f'unknown {image_set}')
|
234 |
+
|
235 |
+
|
236 |
+
def build(image_set, args):
|
237 |
+
root = Path(args.ytvos_path)
|
238 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
239 |
+
PATHS = {
|
240 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
241 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
242 |
+
}
|
243 |
+
img_folder, ann_file = PATHS[image_set]
|
244 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
245 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
246 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
247 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
248 |
+
return dataset
|
249 |
+
|
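The sampling scheme in prepare_metas() above (drop two frames at each end, split the remaining range into four bins, draw one index per bin) can be read in isolation. A sketch with an invented helper name, assuming vid_len >= 11 as the loader already enforces:

import random

def sample_four_frames(vid_len: int, num_bins: int = 4):
    # Mirror of the logic in prepare_metas(): exclude the first/last two frames,
    # split the remaining index range into num_bins bins, sample one index per bin.
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    sample_indx = [random.randint(s, e - 1) for s, e in bins]
    sample_indx.sort()  # keep the sampled frame indices in temporal order
    return bins, sample_indx

# e.g. for a 40-frame video: four ordered indices, one from each of the four bins
bins, idxs = sample_four_frames(40)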
.history/datasets/ytvos_ref_20250113163347.py
ADDED
@@ -0,0 +1,249 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
for frame_id in sample_indx:
|
93 |
+
meta = {
|
94 |
+
'video': vid,
|
95 |
+
'exp': exp_dict['exp'],
|
96 |
+
'obj_id': int(exp_dict['obj_id']),
|
97 |
+
'frames': vid_frames,
|
98 |
+
'frame_id' : frame_id,
|
99 |
+
'sample_frames_id' : sample_indx,
|
100 |
+
'bins': bins,
|
101 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
102 |
+
}
|
103 |
+
self.metas.append(meta)
|
104 |
+
print(skip_vid_count)
|
105 |
+
|
106 |
+
|
107 |
+
@staticmethod
|
108 |
+
def bounding_box(img):
|
109 |
+
rows = np.any(img, axis=1)
|
110 |
+
cols = np.any(img, axis=0)
|
111 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
112 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
113 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
114 |
+
|
115 |
+
def __len__(self):
|
116 |
+
return len(self.metas)
|
117 |
+
|
118 |
+
def __getitem__(self, idx):
|
119 |
+
instance_check = False
|
120 |
+
while not instance_check:
|
121 |
+
meta = self.metas[idx] # dict
|
122 |
+
|
123 |
+
|
124 |
+
video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
|
125 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
|
126 |
+
|
127 |
+
|
128 |
+
# clean up the caption
|
129 |
+
exp = " ".join(exp.lower().split())
|
130 |
+
category_id = category_dict[category]
|
131 |
+
vid_len = len(frames)
|
132 |
+
|
133 |
+
# num_frames = self.num_frames
|
134 |
+
|
135 |
+
# read frames and masks
|
136 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
137 |
+
for frame_indx in sample_frames_id:
|
138 |
+
frame_name = frames[frame_indx]
|
139 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
140 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
141 |
+
img = Image.open(img_path).convert('RGB')
|
142 |
+
mask = Image.open(mask_path).convert('P')
|
143 |
+
|
144 |
+
# create the target
|
145 |
+
label = torch.tensor(category_id)
|
146 |
+
mask = np.array(mask)
|
147 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
148 |
+
if (mask > 0).any():
|
149 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
150 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
151 |
+
valid.append(1)
|
152 |
+
else: # some frames don't contain the instance
|
153 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
154 |
+
valid.append(0)
|
155 |
+
mask = torch.from_numpy(mask)
|
156 |
+
|
157 |
+
# append
|
158 |
+
imgs.append(img)
|
159 |
+
labels.append(label)
|
160 |
+
masks.append(mask)
|
161 |
+
boxes.append(box)
|
162 |
+
|
163 |
+
# transform
|
164 |
+
w, h = img.size
|
165 |
+
labels = torch.stack(labels, dim=0)
|
166 |
+
boxes = torch.stack(boxes, dim=0)
|
167 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
168 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
169 |
+
masks = torch.stack(masks, dim=0)
|
170 |
+
target = {
|
171 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
172 |
+
'labels': labels, # [T,]
|
173 |
+
'boxes': boxes, # [T, 4], xyxy
|
174 |
+
'masks': masks, # [T, H, W]
|
175 |
+
'valid': torch.tensor(valid), # [T,]
|
176 |
+
'caption': exp,
|
177 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
178 |
+
'size': torch.as_tensor([int(h), int(w)])
|
179 |
+
}
|
180 |
+
|
181 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
182 |
+
if self._transforms:
|
183 |
+
imgs, target = self._transforms(imgs, target)
|
184 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
185 |
+
else:
|
186 |
+
imgs = np.array(imgs)
|
187 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
188 |
+
|
189 |
+
|
190 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
191 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
192 |
+
instance_check = True
|
193 |
+
else:
|
194 |
+
idx = random.randint(0, self.__len__() - 1)
|
195 |
+
|
196 |
+
return imgs, target
|
197 |
+
|
198 |
+
|
199 |
+
def make_coco_transforms(image_set, max_size=640):
|
200 |
+
normalize = T.Compose([
|
201 |
+
T.ToTensor(),
|
202 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
203 |
+
])
|
204 |
+
|
205 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
206 |
+
|
207 |
+
if image_set == 'train':
|
208 |
+
return T.Compose([
|
209 |
+
T.RandomHorizontalFlip(),
|
210 |
+
T.PhotometricDistort(),
|
211 |
+
T.RandomSelect(
|
212 |
+
T.Compose([
|
213 |
+
T.RandomResize(scales, max_size=max_size),
|
214 |
+
T.Check(),
|
215 |
+
]),
|
216 |
+
T.Compose([
|
217 |
+
T.RandomResize([400, 500, 600]),
|
218 |
+
T.RandomSizeCrop(384, 600),
|
219 |
+
T.RandomResize(scales, max_size=max_size),
|
220 |
+
T.Check(),
|
221 |
+
])
|
222 |
+
),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
227 |
+
if image_set == 'val':
|
228 |
+
return T.Compose([
|
229 |
+
T.RandomResize([360], max_size=640),
|
230 |
+
normalize,
|
231 |
+
])
|
232 |
+
|
233 |
+
raise ValueError(f'unknown {image_set}')
|
234 |
+
|
235 |
+
|
236 |
+
def build(image_set, args):
|
237 |
+
root = Path(args.ytvos_path)
|
238 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
239 |
+
PATHS = {
|
240 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
241 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
242 |
+
}
|
243 |
+
img_folder, ann_file = PATHS[image_set]
|
244 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
245 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
246 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
247 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
248 |
+
return dataset
|
249 |
+
|
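The bounding_box() helper above derives a tight xyxy box from a binary mask by scanning non-empty rows and columns. A small self-contained check of that behaviour on a toy array (the values are made up for illustration):

import numpy as np

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0                    # object occupies rows 2..4, cols 3..6

rows = np.any(mask, axis=1)
cols = np.any(mask, axis=0)
y1, y2 = np.where(rows)[0][[0, -1]]     # 2, 4
x1, x2 = np.where(cols)[0][[0, -1]]     # 3, 6
box = [x1, y1, x2, y2]                  # xyxy, the layout used for the 'boxes' target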
.history/datasets/ytvos_ref_20250114202456.py
ADDED
@@ -0,0 +1,251 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
return vid_meta, vid_data
|
74 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
75 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
76 |
+
start_idx , end_idx = 2, vid_len-2
|
77 |
+
bin_size = (end_idx - start_idx) // 4
|
78 |
+
|
79 |
+
bins = []
|
80 |
+
for i in range(4):
|
81 |
+
bin_start = start_idx + i * bin_size
|
82 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
83 |
+
|
84 |
+
bins.append((bin_start, bin_end))
|
85 |
+
|
86 |
+
# Random sample one frame from each bin
|
87 |
+
sample_indx = []
|
88 |
+
for start_idx, end_idx in bins:
|
89 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
90 |
+
sample_indx.sort() # Ensure indices are in order
|
91 |
+
|
92 |
+
|
93 |
+
for sample_id in sample_indx:
|
94 |
+
meta = {
|
95 |
+
'video': vid,
|
96 |
+
'exp': exp_dict['exp'],
|
97 |
+
'obj_id': int(exp_dict['obj_id']),
|
98 |
+
'frames': vid_frames,
|
99 |
+
'sample_id' : sample_id,
|
100 |
+
'sample_frames_id' : sample_indx,
|
101 |
+
'bins': bins,
|
102 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
103 |
+
}
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
|
126 |
+
video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
|
127 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
|
128 |
+
|
129 |
+
|
130 |
+
# clean up the caption
|
131 |
+
exp = " ".join(exp.lower().split())
|
132 |
+
category_id = category_dict[category]
|
133 |
+
vid_len = len(frames)
|
134 |
+
|
135 |
+
# num_frames = self.num_frames
|
136 |
+
|
137 |
+
# read frames and masks
|
138 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
139 |
+
for frame_indx in sample_frames_id:
|
140 |
+
frame_name = frames[frame_indx]
|
141 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
142 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
143 |
+
img = Image.open(img_path).convert('RGB')
|
144 |
+
mask = Image.open(mask_path).convert('P')
|
145 |
+
|
146 |
+
# create the target
|
147 |
+
label = torch.tensor(category_id)
|
148 |
+
mask = np.array(mask)
|
149 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
150 |
+
if (mask > 0).any():
|
151 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
152 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
153 |
+
valid.append(1)
|
154 |
+
else: # some frames don't contain the instance
|
155 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
156 |
+
valid.append(0)
|
157 |
+
mask = torch.from_numpy(mask)
|
158 |
+
|
159 |
+
# append
|
160 |
+
imgs.append(img)
|
161 |
+
labels.append(label)
|
162 |
+
masks.append(mask)
|
163 |
+
boxes.append(box)
|
164 |
+
|
165 |
+
# transform
|
166 |
+
w, h = img.size
|
167 |
+
labels = torch.stack(labels, dim=0)
|
168 |
+
boxes = torch.stack(boxes, dim=0)
|
169 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
170 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
171 |
+
masks = torch.stack(masks, dim=0)
|
172 |
+
target = {
|
173 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
174 |
+
'labels': labels, # [T,]
|
175 |
+
'boxes': boxes, # [T, 4], xyxy
|
176 |
+
'masks': masks, # [T, H, W]
|
177 |
+
'valid': torch.tensor(valid), # [T,]
|
178 |
+
'caption': exp,
|
179 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
180 |
+
'size': torch.as_tensor([int(h), int(w)])
|
181 |
+
}
|
182 |
+
|
183 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
184 |
+
if self._transforms:
|
185 |
+
imgs, target = self._transforms(imgs, target)
|
186 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
187 |
+
else:
|
188 |
+
imgs = np.array(imgs)
|
189 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
190 |
+
|
191 |
+
|
192 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
193 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
194 |
+
instance_check = True
|
195 |
+
else:
|
196 |
+
idx = random.randint(0, self.__len__() - 1)
|
197 |
+
|
198 |
+
return imgs, target
|
199 |
+
|
200 |
+
|
201 |
+
def make_coco_transforms(image_set, max_size=640):
|
202 |
+
normalize = T.Compose([
|
203 |
+
T.ToTensor(),
|
204 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
205 |
+
])
|
206 |
+
|
207 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
208 |
+
|
209 |
+
if image_set == 'train':
|
210 |
+
return T.Compose([
|
211 |
+
T.RandomHorizontalFlip(),
|
212 |
+
T.PhotometricDistort(),
|
213 |
+
T.RandomSelect(
|
214 |
+
T.Compose([
|
215 |
+
T.RandomResize(scales, max_size=max_size),
|
216 |
+
T.Check(),
|
217 |
+
]),
|
218 |
+
T.Compose([
|
219 |
+
T.RandomResize([400, 500, 600]),
|
220 |
+
T.RandomSizeCrop(384, 600),
|
221 |
+
T.RandomResize(scales, max_size=max_size),
|
222 |
+
T.Check(),
|
223 |
+
])
|
224 |
+
),
|
225 |
+
normalize,
|
226 |
+
])
|
227 |
+
|
228 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
229 |
+
if image_set == 'val':
|
230 |
+
return T.Compose([
|
231 |
+
T.RandomResize([360], max_size=640),
|
232 |
+
normalize,
|
233 |
+
])
|
234 |
+
|
235 |
+
raise ValueError(f'unknown {image_set}')
|
236 |
+
|
237 |
+
|
238 |
+
def build(image_set, args):
|
239 |
+
root = Path(args.ytvos_path)
|
240 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
241 |
+
PATHS = {
|
242 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
243 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
244 |
+
}
|
245 |
+
img_folder, ann_file = PATHS[image_set]
|
246 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
247 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
248 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
249 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
250 |
+
return dataset
|
251 |
+
|
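After stacking, __getitem__ clamps the per-frame xyxy boxes into the image before building the target dict. A small illustration of that clamping step with invented tensor values:

import torch

w, h = 320, 240
boxes = torch.tensor([[-5.,  10., 400., 100.],     # partially outside the image
                      [ 0.,   0.,   0.,   0.]])    # placeholder for a missing instance
boxes[:, 0::2].clamp_(min=0, max=w)                # clamp x1, x2 into [0, w]
boxes[:, 1::2].clamp_(min=0, max=h)                # clamp y1, y2 into [0, h]
# boxes is now [[0., 10., 320., 100.], [0., 0., 0., 0.]]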
.history/datasets/ytvos_ref_20250114205130.py
ADDED
@@ -0,0 +1,250 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
for sample_id in sample_indx:
|
93 |
+
meta = {
|
94 |
+
'video': vid,
|
95 |
+
'exp': exp_dict['exp'],
|
96 |
+
'obj_id': int(exp_dict['obj_id']),
|
97 |
+
'frames': vid_frames,
|
98 |
+
'sample_id' : sample_id,
|
99 |
+
'sample_frames_id' : sample_indx,
|
100 |
+
'bins': bins,
|
101 |
+
'category': vid_meta['objects'][exp_dict['obj_id']]['category']
|
102 |
+
}
|
103 |
+
self.metas.append(meta)
|
104 |
+
|
105 |
+
print(f"skipped {skip_vid_count} short videos")
|
106 |
+
|
107 |
+
|
108 |
+
@staticmethod
|
109 |
+
def bounding_box(img):
|
110 |
+
rows = np.any(img, axis=1)
|
111 |
+
cols = np.any(img, axis=0)
|
112 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
113 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
114 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
115 |
+
|
116 |
+
def __len__(self):
|
117 |
+
return len(self.metas)
|
118 |
+
|
119 |
+
def __getitem__(self, idx):
|
120 |
+
instance_check = False
|
121 |
+
while not instance_check:
|
122 |
+
meta = self.metas[idx] # dict
|
123 |
+
|
124 |
+
|
125 |
+
video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
|
126 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
|
127 |
+
|
128 |
+
|
129 |
+
# clean up the caption
|
130 |
+
exp = " ".join(exp.lower().split())
|
131 |
+
category_id = category_dict[category]
|
132 |
+
vid_len = len(frames)
|
133 |
+
|
134 |
+
# num_frames = self.num_frames
|
135 |
+
|
136 |
+
# read frames and masks
|
137 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
138 |
+
for frame_indx in sample_frames_id:
|
139 |
+
frame_name = frames[frame_indx]
|
140 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
141 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
142 |
+
img = Image.open(img_path).convert('RGB')
|
143 |
+
mask = Image.open(mask_path).convert('P')
|
144 |
+
|
145 |
+
# create the target
|
146 |
+
label = torch.tensor(category_id)
|
147 |
+
mask = np.array(mask)
|
148 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
149 |
+
if (mask > 0).any():
|
150 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
151 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
152 |
+
valid.append(1)
|
153 |
+
else: # some frames don't contain the instance
|
154 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
155 |
+
valid.append(0)
|
156 |
+
mask = torch.from_numpy(mask)
|
157 |
+
|
158 |
+
# append
|
159 |
+
imgs.append(img)
|
160 |
+
labels.append(label)
|
161 |
+
masks.append(mask)
|
162 |
+
boxes.append(box)
|
163 |
+
|
164 |
+
# transform
|
165 |
+
w, h = img.size
|
166 |
+
labels = torch.stack(labels, dim=0)
|
167 |
+
boxes = torch.stack(boxes, dim=0)
|
168 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
169 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
170 |
+
masks = torch.stack(masks, dim=0)
|
171 |
+
target = {
|
172 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
173 |
+
'labels': labels, # [T,]
|
174 |
+
'boxes': boxes, # [T, 4], xyxy
|
175 |
+
'masks': masks, # [T, H, W]
|
176 |
+
'valid': torch.tensor(valid), # [T,]
|
177 |
+
'caption': exp,
|
178 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
179 |
+
'size': torch.as_tensor([int(h), int(w)])
|
180 |
+
}
|
181 |
+
|
182 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
183 |
+
if self._transforms:
|
184 |
+
imgs, target = self._transforms(imgs, target)
|
185 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
186 |
+
else:
|
187 |
+
imgs = np.array(imgs)
|
188 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
189 |
+
|
190 |
+
|
191 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
192 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
193 |
+
instance_check = True
|
194 |
+
else:
|
195 |
+
idx = random.randint(0, self.__len__() - 1)
|
196 |
+
|
197 |
+
return imgs, target
|
198 |
+
|
199 |
+
|
200 |
+
def make_coco_transforms(image_set, max_size=640):
|
201 |
+
normalize = T.Compose([
|
202 |
+
T.ToTensor(),
|
203 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
204 |
+
])
|
205 |
+
|
206 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
207 |
+
|
208 |
+
if image_set == 'train':
|
209 |
+
return T.Compose([
|
210 |
+
T.RandomHorizontalFlip(),
|
211 |
+
T.PhotometricDistort(),
|
212 |
+
T.RandomSelect(
|
213 |
+
T.Compose([
|
214 |
+
T.RandomResize(scales, max_size=max_size),
|
215 |
+
T.Check(),
|
216 |
+
]),
|
217 |
+
T.Compose([
|
218 |
+
T.RandomResize([400, 500, 600]),
|
219 |
+
T.RandomSizeCrop(384, 600),
|
220 |
+
T.RandomResize(scales, max_size=max_size),
|
221 |
+
T.Check(),
|
222 |
+
])
|
223 |
+
),
|
224 |
+
normalize,
|
225 |
+
])
|
226 |
+
|
227 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
228 |
+
if image_set == 'val':
|
229 |
+
return T.Compose([
|
230 |
+
T.RandomResize([360], max_size=640),
|
231 |
+
normalize,
|
232 |
+
])
|
233 |
+
|
234 |
+
raise ValueError(f'unknown {image_set}')
|
235 |
+
|
236 |
+
|
237 |
+
def build(image_set, args):
|
238 |
+
root = Path(args.ytvos_path)
|
239 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
240 |
+
PATHS = {
|
241 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
242 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
243 |
+
}
|
244 |
+
img_folder, ann_file = PATHS[image_set]
|
245 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
246 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
247 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
248 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
249 |
+
return dataset
|
250 |
+
|
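A sketch of wrapping the dataset in a PyTorch DataLoader; batch_size=1 with an identity collate keeps the [T, ...] layout produced by __getitem__. The argument values are illustrative, and the repo's engine.py may use its own collator, which this diff does not show.

from types import SimpleNamespace
from torch.utils.data import DataLoader
from datasets.ytvos_ref import build  # assumed import path

args = SimpleNamespace(ytvos_path='data/ref-youtube-vos', masks=True,
                       num_frames=4, max_skip=3)          # illustrative only
train_dataset = build('train', args)

loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2,
                    collate_fn=lambda batch: batch[0])    # identity collate for batch size 1
imgs, target = next(iter(loader))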
.history/datasets/ytvos_ref_20250114211235.py
ADDED
@@ -0,0 +1,252 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins
|
96 |
+
}
|
97 |
+
obj_id_cat = {}
|
98 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
99 |
+
obj_id = exp_dict['obj_id']
|
100 |
+
print(obj_id, type(obj_id))
|
101 |
+
print(vid_meta.keys())
|
102 |
+
if obj_id not in obj_id_cat:
|
103 |
+
obj_id_cat[obj_id] = vid_meta[obj_id]['category']
|
104 |
+
meta['obj_id_cat'] = obj_id_cat
|
105 |
+
self.metas.append(meta)
|
106 |
+
|
107 |
+
print(f"skipped {skip_vid_count} short videos")
|
108 |
+
|
109 |
+
|
110 |
+
@staticmethod
|
111 |
+
def bounding_box(img):
|
112 |
+
rows = np.any(img, axis=1)
|
113 |
+
cols = np.any(img, axis=0)
|
114 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
115 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
116 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
117 |
+
|
118 |
+
def __len__(self):
|
119 |
+
return len(self.metas)
|
120 |
+
|
121 |
+
def __getitem__(self, idx):
|
122 |
+
instance_check = False
|
123 |
+
while not instance_check:
|
124 |
+
meta = self.metas[idx] # dict
|
125 |
+
|
126 |
+
|
127 |
+
video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
|
128 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
|
129 |
+
|
130 |
+
|
131 |
+
# clean up the caption
|
132 |
+
exp = " ".join(exp.lower().split())
|
133 |
+
category_id = category_dict[category]
|
134 |
+
vid_len = len(frames)
|
135 |
+
|
136 |
+
# num_frames = self.num_frames
|
137 |
+
|
138 |
+
# read frames and masks
|
139 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
140 |
+
for frame_indx in sample_frames_id:
|
141 |
+
frame_name = frames[frame_indx]
|
142 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
143 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
144 |
+
img = Image.open(img_path).convert('RGB')
|
145 |
+
mask = Image.open(mask_path).convert('P')
|
146 |
+
|
147 |
+
# create the target
|
148 |
+
label = torch.tensor(category_id)
|
149 |
+
mask = np.array(mask)
|
150 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
151 |
+
if (mask > 0).any():
|
152 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
153 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
154 |
+
valid.append(1)
|
155 |
+
else: # some frames don't contain the instance
|
156 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
157 |
+
valid.append(0)
|
158 |
+
mask = torch.from_numpy(mask)
|
159 |
+
|
160 |
+
# append
|
161 |
+
imgs.append(img)
|
162 |
+
labels.append(label)
|
163 |
+
masks.append(mask)
|
164 |
+
boxes.append(box)
|
165 |
+
|
166 |
+
# transform
|
167 |
+
w, h = img.size
|
168 |
+
labels = torch.stack(labels, dim=0)
|
169 |
+
boxes = torch.stack(boxes, dim=0)
|
170 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
171 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
172 |
+
masks = torch.stack(masks, dim=0)
|
173 |
+
target = {
|
174 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
175 |
+
'labels': labels, # [T,]
|
176 |
+
'boxes': boxes, # [T, 4], xyxy
|
177 |
+
'masks': masks, # [T, H, W]
|
178 |
+
'valid': torch.tensor(valid), # [T,]
|
179 |
+
'caption': exp,
|
180 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
181 |
+
'size': torch.as_tensor([int(h), int(w)])
|
182 |
+
}
|
183 |
+
|
184 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
185 |
+
if self._transforms:
|
186 |
+
imgs, target = self._transforms(imgs, target)
|
187 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
188 |
+
else:
|
189 |
+
imgs = np.array(imgs)
|
190 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
191 |
+
|
192 |
+
|
193 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
194 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
195 |
+
instance_check = True
|
196 |
+
else:
|
197 |
+
idx = random.randint(0, self.__len__() - 1)
|
198 |
+
|
199 |
+
return imgs, target
|
200 |
+
|
201 |
+
|
202 |
+
def make_coco_transforms(image_set, max_size=640):
|
203 |
+
normalize = T.Compose([
|
204 |
+
T.ToTensor(),
|
205 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
206 |
+
])
|
207 |
+
|
208 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
209 |
+
|
210 |
+
if image_set == 'train':
|
211 |
+
return T.Compose([
|
212 |
+
T.RandomHorizontalFlip(),
|
213 |
+
T.PhotometricDistort(),
|
214 |
+
T.RandomSelect(
|
215 |
+
T.Compose([
|
216 |
+
T.RandomResize(scales, max_size=max_size),
|
217 |
+
T.Check(),
|
218 |
+
]),
|
219 |
+
T.Compose([
|
220 |
+
T.RandomResize([400, 500, 600]),
|
221 |
+
T.RandomSizeCrop(384, 600),
|
222 |
+
T.RandomResize(scales, max_size=max_size),
|
223 |
+
T.Check(),
|
224 |
+
])
|
225 |
+
),
|
226 |
+
normalize,
|
227 |
+
])
|
228 |
+
|
229 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
230 |
+
if image_set == 'val':
|
231 |
+
return T.Compose([
|
232 |
+
T.RandomResize([360], max_size=640),
|
233 |
+
normalize,
|
234 |
+
])
|
235 |
+
|
236 |
+
raise ValueError(f'unknown {image_set}')
|
237 |
+
|
238 |
+
|
239 |
+
def build(image_set, args):
|
240 |
+
root = Path(args.ytvos_path)
|
241 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
242 |
+
PATHS = {
|
243 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
244 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
245 |
+
}
|
246 |
+
img_folder, ann_file = PATHS[image_set]
|
247 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
248 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
249 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
250 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
251 |
+
return dataset
|
252 |
+
|
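The later snapshots switch prepare_metas() to one meta entry per video plus an obj_id -> category map. A sketch of the resulting structure, with purely illustrative values (video id, indices, and categories are not taken from the dataset):

# Shape of a single entry in self.metas after this refactor:
meta = {
    'video': '0062f687f1',                            # video id (hypothetical)
    'sample_indx': [3, 14, 22, 31],                   # one frame index per bin
    'bins': [(2, 11), (11, 20), (20, 29), (29, 38)],  # the four sampling bins
    'obj_id_cat': {'1': 'person', '2': 'bike'},       # object id (string, as in meta.json) -> category
}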
.history/datasets/ytvos_ref_20250114211331.py
ADDED
@@ -0,0 +1,250 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins
|
96 |
+
}
|
97 |
+
obj_id_cat = {}
|
98 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
99 |
+
obj_id = exp_dict['obj_id']
|
100 |
+
if obj_id not in obj_id_cat:
|
101 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
102 |
+
meta['obj_id_cat'] = obj_id_cat
|
103 |
+
self.metas.append(meta)
|
104 |
+
|
105 |
+
print(f"skipped {skip_vid_count} short videos")
|
106 |
+
|
107 |
+
|
108 |
+
@staticmethod
|
109 |
+
def bounding_box(img):
|
110 |
+
rows = np.any(img, axis=1)
|
111 |
+
cols = np.any(img, axis=0)
|
112 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
113 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
114 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
115 |
+
|
116 |
+
def __len__(self):
|
117 |
+
return len(self.metas)
|
118 |
+
|
119 |
+
def __getitem__(self, idx):
|
120 |
+
instance_check = False
|
121 |
+
while not instance_check:
|
122 |
+
meta = self.metas[idx] # dict
|
123 |
+
|
124 |
+
|
125 |
+
video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
|
126 |
+
meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
|
127 |
+
|
128 |
+
|
129 |
+
# clean up the caption
|
130 |
+
exp = " ".join(exp.lower().split())
|
131 |
+
category_id = category_dict[category]
|
132 |
+
vid_len = len(frames)
|
133 |
+
|
134 |
+
# num_frames = self.num_frames
|
135 |
+
|
136 |
+
# read frames and masks
|
137 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
138 |
+
for frame_indx in sample_frames_id:
|
139 |
+
frame_name = frames[frame_indx]
|
140 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
141 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
142 |
+
img = Image.open(img_path).convert('RGB')
|
143 |
+
mask = Image.open(mask_path).convert('P')
|
144 |
+
|
145 |
+
# create the target
|
146 |
+
label = torch.tensor(category_id)
|
147 |
+
mask = np.array(mask)
|
148 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
149 |
+
if (mask > 0).any():
|
150 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
151 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
152 |
+
valid.append(1)
|
153 |
+
else: # some frame didn't contain the instance
|
154 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
155 |
+
valid.append(0)
|
156 |
+
mask = torch.from_numpy(mask)
|
157 |
+
|
158 |
+
# append
|
159 |
+
imgs.append(img)
|
160 |
+
labels.append(label)
|
161 |
+
masks.append(mask)
|
162 |
+
boxes.append(box)
|
163 |
+
|
164 |
+
# transform
|
165 |
+
w, h = img.size
|
166 |
+
labels = torch.stack(labels, dim=0)
|
167 |
+
boxes = torch.stack(boxes, dim=0)
|
168 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
169 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
170 |
+
masks = torch.stack(masks, dim=0)
|
171 |
+
target = {
|
172 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
173 |
+
'labels': labels, # [T,]
|
174 |
+
'boxes': boxes, # [T, 4], xyxy
|
175 |
+
'masks': masks, # [T, H, W]
|
176 |
+
'valid': torch.tensor(valid), # [T,]
|
177 |
+
'caption': exp,
|
178 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
179 |
+
'size': torch.as_tensor([int(h), int(w)])
|
180 |
+
}
|
181 |
+
|
182 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
183 |
+
if self._transforms:
|
184 |
+
imgs, target = self._transforms(imgs, target)
|
185 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
186 |
+
else:
|
187 |
+
imgs = np.array(imgs)
|
188 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
189 |
+
|
190 |
+
|
191 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
192 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
193 |
+
instance_check = True
|
194 |
+
else:
|
195 |
+
idx = random.randint(0, self.__len__() - 1)
|
196 |
+
|
197 |
+
return imgs, target
|
198 |
+
|
199 |
+
|
200 |
+
def make_coco_transforms(image_set, max_size=640):
|
201 |
+
normalize = T.Compose([
|
202 |
+
T.ToTensor(),
|
203 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
204 |
+
])
|
205 |
+
|
206 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
207 |
+
|
208 |
+
if image_set == 'train':
|
209 |
+
return T.Compose([
|
210 |
+
T.RandomHorizontalFlip(),
|
211 |
+
T.PhotometricDistort(),
|
212 |
+
T.RandomSelect(
|
213 |
+
T.Compose([
|
214 |
+
T.RandomResize(scales, max_size=max_size),
|
215 |
+
T.Check(),
|
216 |
+
]),
|
217 |
+
T.Compose([
|
218 |
+
T.RandomResize([400, 500, 600]),
|
219 |
+
T.RandomSizeCrop(384, 600),
|
220 |
+
T.RandomResize(scales, max_size=max_size),
|
221 |
+
T.Check(),
|
222 |
+
])
|
223 |
+
),
|
224 |
+
normalize,
|
225 |
+
])
|
226 |
+
|
227 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
228 |
+
if image_set == 'val':
|
229 |
+
return T.Compose([
|
230 |
+
T.RandomResize([360], max_size=640),
|
231 |
+
normalize,
|
232 |
+
])
|
233 |
+
|
234 |
+
raise ValueError(f'unknown {image_set}')
|
235 |
+
|
236 |
+
|
237 |
+
def build(image_set, args):
|
238 |
+
root = Path(args.ytvos_path)
|
239 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
240 |
+
PATHS = {
|
241 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
242 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
243 |
+
}
|
244 |
+
img_folder, ann_file = PATHS[image_set]
|
245 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
246 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
247 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
248 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
249 |
+
return dataset
|
250 |
+
|
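The prepare_metas() method in the snapshot above splits each video (minus its first two and last two frames) into four bins and draws one frame index per bin. The standalone sketch below repeats that sampling logic outside the class for illustration; the vid_len of 20 is an arbitrary example.

import random

def sample_four_frames(vid_len, num_bins=4):
    start_idx, end_idx = 2, vid_len - 2          # exclude the first two and last two frames
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    # one random index per bin, kept in temporal order
    sample_indx = sorted(random.randint(b_start, b_end - 1) for b_start, b_end in bins)
    return bins, sample_indx

bins, sample_indx = sample_four_frames(20)
print(bins)         # [(2, 6), (6, 10), (10, 14), (14, 18)] for vid_len=20
print(sample_indx)  # four ordered frame indices, one per bin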
.history/datasets/ytvos_ref_20250114211640.py
ADDED
@@ -0,0 +1,242 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
|
137 |
+
# create the target
|
138 |
+
label = torch.tensor(category_id)
|
139 |
+
mask = np.array(mask)
|
140 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
mask = torch.from_numpy(mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
labels.append(label)
|
153 |
+
masks.append(mask)
|
154 |
+
boxes.append(box)
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
labels = torch.stack(labels, dim=0)
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
165 |
+
'labels': labels, # [T,]
|
166 |
+
'boxes': boxes, # [T, 4], xyxy
|
167 |
+
'masks': masks, # [T, H, W]
|
168 |
+
'valid': torch.tensor(valid), # [T,]
|
169 |
+
'caption': exp,
|
170 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
171 |
+
'size': torch.as_tensor([int(h), int(w)])
|
172 |
+
}
|
173 |
+
|
174 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
175 |
+
if self._transforms:
|
176 |
+
imgs, target = self._transforms(imgs, target)
|
177 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
178 |
+
else:
|
179 |
+
imgs = np.array(imgs)
|
180 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
181 |
+
|
182 |
+
|
183 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
184 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
185 |
+
instance_check = True
|
186 |
+
else:
|
187 |
+
idx = random.randint(0, self.__len__() - 1)
|
188 |
+
|
189 |
+
return imgs, target
|
190 |
+
|
191 |
+
|
192 |
+
def make_coco_transforms(image_set, max_size=640):
|
193 |
+
normalize = T.Compose([
|
194 |
+
T.ToTensor(),
|
195 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
196 |
+
])
|
197 |
+
|
198 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
199 |
+
|
200 |
+
if image_set == 'train':
|
201 |
+
return T.Compose([
|
202 |
+
T.RandomHorizontalFlip(),
|
203 |
+
T.PhotometricDistort(),
|
204 |
+
T.RandomSelect(
|
205 |
+
T.Compose([
|
206 |
+
T.RandomResize(scales, max_size=max_size),
|
207 |
+
T.Check(),
|
208 |
+
]),
|
209 |
+
T.Compose([
|
210 |
+
T.RandomResize([400, 500, 600]),
|
211 |
+
T.RandomSizeCrop(384, 600),
|
212 |
+
T.RandomResize(scales, max_size=max_size),
|
213 |
+
T.Check(),
|
214 |
+
])
|
215 |
+
),
|
216 |
+
normalize,
|
217 |
+
])
|
218 |
+
|
219 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
220 |
+
if image_set == 'val':
|
221 |
+
return T.Compose([
|
222 |
+
T.RandomResize([360], max_size=640),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
raise ValueError(f'unknown {image_set}')
|
227 |
+
|
228 |
+
|
229 |
+
def build(image_set, args):
|
230 |
+
root = Path(args.ytvos_path)
|
231 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
232 |
+
PATHS = {
|
233 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
234 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
235 |
+
}
|
236 |
+
img_folder, ann_file = PATHS[image_set]
|
237 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
238 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
240 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
241 |
+
return dataset
|
242 |
+
|
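The bounding_box() helper defined in these snapshots converts a binary mask into (rmin, rmax, cmin, cmax), i.e. y1, y2, x1, x2 in pixel indices. A quick self-contained check on a toy mask (synthetic data, not a real Annotations frame) is shown below.

import numpy as np

def bounding_box(img):
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return rmin, rmax, cmin, cmax            # y1, y2, x1, x2

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0                         # object occupies rows 2..4, cols 3..6
y1, y2, x1, x2 = bounding_box(mask)
print(y1, y2, x1, x2)                        # 2 4 3 6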
.history/datasets/ytvos_ref_20250114211841.py
ADDED
@@ -0,0 +1,242 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
|
137 |
+
# create the target
|
138 |
+
label = torch.tensor(category_id)
|
139 |
+
mask = np.array(mask)
|
140 |
+
mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
mask = torch.from_numpy(mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
labels.append(label)
|
153 |
+
masks.append(mask)
|
154 |
+
boxes.append(box)
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
labels = torch.stack(labels, dim=0)
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
165 |
+
'labels': labels, # [T,]
|
166 |
+
'boxes': boxes, # [T, 4], xyxy
|
167 |
+
'masks': masks, # [T, H, W]
|
168 |
+
'valid': torch.tensor(valid), # [T,]
|
169 |
+
'caption': exp,
|
170 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
171 |
+
'size': torch.as_tensor([int(h), int(w)])
|
172 |
+
}
|
173 |
+
|
174 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
175 |
+
if self._transforms:
|
176 |
+
imgs, target = self._transforms(imgs, target)
|
177 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
178 |
+
else:
|
179 |
+
imgs = np.array(imgs)
|
180 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
181 |
+
|
182 |
+
|
183 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
184 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
185 |
+
instance_check = True
|
186 |
+
else:
|
187 |
+
idx = random.randint(0, self.__len__() - 1)
|
188 |
+
|
189 |
+
return imgs, target
|
190 |
+
|
191 |
+
|
192 |
+
def make_coco_transforms(image_set, max_size=640):
|
193 |
+
normalize = T.Compose([
|
194 |
+
T.ToTensor(),
|
195 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
196 |
+
])
|
197 |
+
|
198 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
199 |
+
|
200 |
+
if image_set == 'train':
|
201 |
+
return T.Compose([
|
202 |
+
T.RandomHorizontalFlip(),
|
203 |
+
T.PhotometricDistort(),
|
204 |
+
T.RandomSelect(
|
205 |
+
T.Compose([
|
206 |
+
T.RandomResize(scales, max_size=max_size),
|
207 |
+
T.Check(),
|
208 |
+
]),
|
209 |
+
T.Compose([
|
210 |
+
T.RandomResize([400, 500, 600]),
|
211 |
+
T.RandomSizeCrop(384, 600),
|
212 |
+
T.RandomResize(scales, max_size=max_size),
|
213 |
+
T.Check(),
|
214 |
+
])
|
215 |
+
),
|
216 |
+
normalize,
|
217 |
+
])
|
218 |
+
|
219 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
220 |
+
if image_set == 'val':
|
221 |
+
return T.Compose([
|
222 |
+
T.RandomResize([360], max_size=640),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
raise ValueError(f'unknown {image_set}')
|
227 |
+
|
228 |
+
|
229 |
+
def build(image_set, args):
|
230 |
+
root = Path(args.ytvos_path)
|
231 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
232 |
+
PATHS = {
|
233 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
234 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
235 |
+
}
|
236 |
+
img_folder, ann_file = PATHS[image_set]
|
237 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
238 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
240 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
241 |
+
return dataset
|
242 |
+
|
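The snapshots that follow decode one palette-mode annotation frame into per-object binary masks keyed by the obj_id_cat mapping built in prepare_metas(). The sketch below shows that step on synthetic data (the mask array and the id-to-category mapping are stand-ins, not real Annotations content); note that the obj_id keys read from meta_expressions.json are strings, which is why the comparison casts to int.

import numpy as np

mask = np.array([[0, 1, 1],
                 [0, 2, 2],
                 [0, 0, 2]], dtype=np.uint8)      # pixel value = object id, 0 = background
obj_id_cat = {'1': 'person', '2': 'dog'}          # hypothetical obj_id -> category mapping

binary_masks = {}
for obj_id in obj_id_cat:
    binary_masks[obj_id] = (mask == int(obj_id)).astype(np.float32)

print(binary_masks['1'])   # 0/1 map for object 1 only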
.history/datasets/ytvos_ref_20250114212623.py
ADDED
@@ -0,0 +1,242 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary; obj_id keys are strings in the json
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
labels = torch.stack(labels, dim=0)
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': torch.tensor(sample_frames_id), # [T,]
|
165 |
+
'labels': labels, # [T,]
|
166 |
+
'boxes': boxes, # [T, 4], xyxy
|
167 |
+
'masks': masks, # [T, H, W]
|
168 |
+
'valid': torch.tensor(valid), # [T,]
|
169 |
+
'caption': exp,
|
170 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
171 |
+
'size': torch.as_tensor([int(h), int(w)])
|
172 |
+
}
|
173 |
+
|
174 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
175 |
+
if self._transforms:
|
176 |
+
imgs, target = self._transforms(imgs, target)
|
177 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
178 |
+
else:
|
179 |
+
imgs = np.array(imgs)
|
180 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
181 |
+
|
182 |
+
|
183 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
184 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
185 |
+
instance_check = True
|
186 |
+
else:
|
187 |
+
idx = random.randint(0, self.__len__() - 1)
|
188 |
+
|
189 |
+
return imgs, target
|
190 |
+
|
191 |
+
|
192 |
+
def make_coco_transforms(image_set, max_size=640):
|
193 |
+
normalize = T.Compose([
|
194 |
+
T.ToTensor(),
|
195 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
196 |
+
])
|
197 |
+
|
198 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
199 |
+
|
200 |
+
if image_set == 'train':
|
201 |
+
return T.Compose([
|
202 |
+
T.RandomHorizontalFlip(),
|
203 |
+
T.PhotometricDistort(),
|
204 |
+
T.RandomSelect(
|
205 |
+
T.Compose([
|
206 |
+
T.RandomResize(scales, max_size=max_size),
|
207 |
+
T.Check(),
|
208 |
+
]),
|
209 |
+
T.Compose([
|
210 |
+
T.RandomResize([400, 500, 600]),
|
211 |
+
T.RandomSizeCrop(384, 600),
|
212 |
+
T.RandomResize(scales, max_size=max_size),
|
213 |
+
T.Check(),
|
214 |
+
])
|
215 |
+
),
|
216 |
+
normalize,
|
217 |
+
])
|
218 |
+
|
219 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
220 |
+
if image_set == 'val':
|
221 |
+
return T.Compose([
|
222 |
+
T.RandomResize([360], max_size=640),
|
223 |
+
normalize,
|
224 |
+
])
|
225 |
+
|
226 |
+
raise ValueError(f'unknown {image_set}')
|
227 |
+
|
228 |
+
|
229 |
+
def build(image_set, args):
|
230 |
+
root = Path(args.ytvos_path)
|
231 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
232 |
+
PATHS = {
|
233 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
234 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
235 |
+
}
|
236 |
+
img_folder, ann_file = PATHS[image_set]
|
237 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
238 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
240 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
241 |
+
return dataset
|
242 |
+
|
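The comment in __getitem__ notes that the transforms pipeline normalizes boxes to [0, 1] and converts them from xyxy to cxcywh. A minimal sketch of that conversion, written independently of datasets.transforms_video, is given below; the sample box and image size are arbitrary.

import torch

def box_xyxy_to_cxcywh_norm(boxes, img_w, img_h):
    # boxes: [..., 4] in absolute xyxy pixel coordinates
    x1, y1, x2, y2 = boxes.unbind(-1)
    cxcywh = torch.stack([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)
    scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return cxcywh / scale                      # normalized cx, cy, w, h in [0, 1]

boxes = torch.tensor([[30., 40., 90., 120.]])  # one xyxy box in pixels
print(box_xyxy_to_cxcywh_norm(boxes, img_w=320, img_h=240))
# approximately tensor([[0.1875, 0.3333, 0.1875, 0.3333]])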
.history/datasets/ytvos_ref_20250116071135.py
ADDED
@@ -0,0 +1,240 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==int(obj_id)).astype(np.float32) # 0,1 binary; obj_id keys are strings in the json
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
labels = torch.stack(labels, dim=0)
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': sample_indx, # [T,]
|
165 |
+
'boxes': boxes, # [T, 4], xyxy
|
166 |
+
'masks': masks, # [T, H, W]
|
167 |
+
'valid': torch.tensor(valid), # [T,]
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
183 |
+
instance_check = True
|
184 |
+
else:
|
185 |
+
idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
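A sketch of iterating the dataset built by this snapshot. Since build() passes transforms=None, each item is a [T, 3, H, W] image tensor plus a target dict, and frame sizes differ across videos, so batch_size=1 with a trivial collate_fn is assumed here. The import path and args refer back to the earlier usage sketch and are assumptions, so the data-dependent lines are left commented.

from torch.utils.data import DataLoader
# from datasets.ytvos_ref import build                 # assumed live module path

# dataset = build('train', args)                        # args as in the earlier sketch
# loader = DataLoader(dataset, batch_size=1, shuffle=True,
#                     collate_fn=lambda batch: batch[0])  # keep variable-size clips unbatched
# imgs, target = next(iter(loader))
# print(imgs.shape, target['masks'].shape, target['valid'])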
.history/datasets/ytvos_ref_20250116071255.py
ADDED
@@ -0,0 +1,239 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)  # box of this object's binary mask
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
168 |
+
'size': torch.as_tensor([int(h), int(w)])
|
169 |
+
}
|
170 |
+
|
171 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
172 |
+
if self._transforms:
|
173 |
+
imgs, target = self._transforms(imgs, target)
|
174 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
175 |
+
else:
|
176 |
+
imgs = np.array(imgs)
|
177 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
178 |
+
|
179 |
+
|
180 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
181 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
182 |
+
instance_check = True
|
183 |
+
else:
|
184 |
+
idx = random.randint(0, self.__len__() - 1)
|
185 |
+
|
186 |
+
return imgs, target
|
187 |
+
|
188 |
+
|
189 |
+
def make_coco_transforms(image_set, max_size=640):
|
190 |
+
normalize = T.Compose([
|
191 |
+
T.ToTensor(),
|
192 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
193 |
+
])
|
194 |
+
|
195 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
196 |
+
|
197 |
+
if image_set == 'train':
|
198 |
+
return T.Compose([
|
199 |
+
T.RandomHorizontalFlip(),
|
200 |
+
T.PhotometricDistort(),
|
201 |
+
T.RandomSelect(
|
202 |
+
T.Compose([
|
203 |
+
T.RandomResize(scales, max_size=max_size),
|
204 |
+
T.Check(),
|
205 |
+
]),
|
206 |
+
T.Compose([
|
207 |
+
T.RandomResize([400, 500, 600]),
|
208 |
+
T.RandomSizeCrop(384, 600),
|
209 |
+
T.RandomResize(scales, max_size=max_size),
|
210 |
+
T.Check(),
|
211 |
+
])
|
212 |
+
),
|
213 |
+
normalize,
|
214 |
+
])
|
215 |
+
|
216 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
217 |
+
if image_set == 'val':
|
218 |
+
return T.Compose([
|
219 |
+
T.RandomResize([360], max_size=640),
|
220 |
+
normalize,
|
221 |
+
])
|
222 |
+
|
223 |
+
raise ValueError(f'unknown {image_set}')
|
224 |
+
|
225 |
+
|
226 |
+
def build(image_set, args):
|
227 |
+
root = Path(args.ytvos_path)
|
228 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
229 |
+
PATHS = {
|
230 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
231 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
232 |
+
}
|
233 |
+
img_folder, ann_file = PATHS[image_set]
|
234 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
235 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
236 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
237 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
238 |
+
return dataset
|
239 |
+
|
.history/datasets/ytvos_ref_20250116071502.py
ADDED
@@ -0,0 +1,240 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(mask)
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
183 |
+
instance_check = True
|
184 |
+
else:
|
185 |
+
idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
.history/datasets/ytvos_ref_20250116071546.py
ADDED
@@ -0,0 +1,240 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)  # box of this object's binary mask
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
183 |
+
instance_check = True
|
184 |
+
else:
|
185 |
+
idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
.history/datasets/ytvos_ref_20250116071553.py
ADDED
@@ -0,0 +1,240 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)  # box of this object's binary mask
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
imgs.append(img)
|
152 |
+
masks.append(obj_mask)
|
153 |
+
boxes.append(box)
|
154 |
+
|
155 |
+
|
156 |
+
# transform
|
157 |
+
w, h = img.size
|
158 |
+
boxes = torch.stack(boxes, dim=0)
|
159 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
160 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
161 |
+
masks = torch.stack(masks, dim=0)
|
162 |
+
target = {
|
163 |
+
'frames_idx': sample_indx, # [T,]
|
164 |
+
'boxes': boxes, # [T, 4], xyxy
|
165 |
+
'masks': masks, # [T, H, W]
|
166 |
+
'valid': torch.tensor(valid), # [T,]
|
167 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
168 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
169 |
+
'size': torch.as_tensor([int(h), int(w)])
|
170 |
+
}
|
171 |
+
|
172 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
173 |
+
if self._transforms:
|
174 |
+
imgs, target = self._transforms(imgs, target)
|
175 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
176 |
+
else:
|
177 |
+
imgs = np.array(imgs)
|
178 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
179 |
+
|
180 |
+
|
181 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
182 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
183 |
+
instance_check = True
|
184 |
+
else:
|
185 |
+
idx = random.randint(0, self.__len__() - 1)
|
186 |
+
|
187 |
+
return imgs, target
|
188 |
+
|
189 |
+
|
190 |
+
def make_coco_transforms(image_set, max_size=640):
|
191 |
+
normalize = T.Compose([
|
192 |
+
T.ToTensor(),
|
193 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
194 |
+
])
|
195 |
+
|
196 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
197 |
+
|
198 |
+
if image_set == 'train':
|
199 |
+
return T.Compose([
|
200 |
+
T.RandomHorizontalFlip(),
|
201 |
+
T.PhotometricDistort(),
|
202 |
+
T.RandomSelect(
|
203 |
+
T.Compose([
|
204 |
+
T.RandomResize(scales, max_size=max_size),
|
205 |
+
T.Check(),
|
206 |
+
]),
|
207 |
+
T.Compose([
|
208 |
+
T.RandomResize([400, 500, 600]),
|
209 |
+
T.RandomSizeCrop(384, 600),
|
210 |
+
T.RandomResize(scales, max_size=max_size),
|
211 |
+
T.Check(),
|
212 |
+
])
|
213 |
+
),
|
214 |
+
normalize,
|
215 |
+
])
|
216 |
+
|
217 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
218 |
+
if image_set == 'val':
|
219 |
+
return T.Compose([
|
220 |
+
T.RandomResize([360], max_size=640),
|
221 |
+
normalize,
|
222 |
+
])
|
223 |
+
|
224 |
+
raise ValueError(f'unknown {image_set}')
|
225 |
+
|
226 |
+
|
227 |
+
def build(image_set, args):
|
228 |
+
root = Path(args.ytvos_path)
|
229 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
230 |
+
PATHS = {
|
231 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
232 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
233 |
+
}
|
234 |
+
img_folder, ann_file = PATHS[image_set]
|
235 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
236 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
237 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
238 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
239 |
+
return dataset
|
240 |
+
|
.history/datasets/ytvos_ref_20250116071841.py
ADDED
@@ -0,0 +1,239 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
mask = Image.open(mask_path).convert('P')
|
136 |
+
mask = np.array(mask)
|
137 |
+
|
138 |
+
# create the target
|
139 |
+
for obj_id in list(obj_id_cat.keys()):
|
140 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
141 |
+
if (obj_mask > 0).any():
|
142 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)  # box of this object's binary mask
|
143 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
144 |
+
valid.append(1)
|
145 |
+
else: # some frame didn't contain the instance
|
146 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
147 |
+
valid.append(0)
|
148 |
+
obj_mask = torch.from_numpy(obj_mask)
|
149 |
+
|
150 |
+
# append
|
151 |
+
masks.append(obj_mask)
|
152 |
+
boxes.append(box)
|
153 |
+
|
154 |
+
|
155 |
+
# transform
|
156 |
+
w, h = img.size
|
157 |
+
boxes = torch.stack(boxes, dim=0)
|
158 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
159 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
160 |
+
masks = torch.stack(masks, dim=0)
|
161 |
+
target = {
|
162 |
+
'frames_idx': sample_indx, # [T,]
|
163 |
+
'boxes': boxes, # [T, 4], xyxy
|
164 |
+
'masks': masks, # [T, H, W]
|
165 |
+
'valid': torch.tensor(valid), # [T,]
|
166 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
167 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
168 |
+
'size': torch.as_tensor([int(h), int(w)])
|
169 |
+
}
|
170 |
+
|
171 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
172 |
+
if self._transforms:
|
173 |
+
imgs, target = self._transforms(imgs, target)
|
174 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
175 |
+
else:
|
176 |
+
imgs = np.array(imgs)
|
177 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
178 |
+
|
179 |
+
|
180 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
181 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
182 |
+
instance_check = True
|
183 |
+
else:
|
184 |
+
idx = random.randint(0, self.__len__() - 1)
|
185 |
+
|
186 |
+
return imgs, target
|
187 |
+
|
188 |
+
|
189 |
+
def make_coco_transforms(image_set, max_size=640):
|
190 |
+
normalize = T.Compose([
|
191 |
+
T.ToTensor(),
|
192 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
193 |
+
])
|
194 |
+
|
195 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
196 |
+
|
197 |
+
if image_set == 'train':
|
198 |
+
return T.Compose([
|
199 |
+
T.RandomHorizontalFlip(),
|
200 |
+
T.PhotometricDistort(),
|
201 |
+
T.RandomSelect(
|
202 |
+
T.Compose([
|
203 |
+
T.RandomResize(scales, max_size=max_size),
|
204 |
+
T.Check(),
|
205 |
+
]),
|
206 |
+
T.Compose([
|
207 |
+
T.RandomResize([400, 500, 600]),
|
208 |
+
T.RandomSizeCrop(384, 600),
|
209 |
+
T.RandomResize(scales, max_size=max_size),
|
210 |
+
T.Check(),
|
211 |
+
])
|
212 |
+
),
|
213 |
+
normalize,
|
214 |
+
])
|
215 |
+
|
216 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
217 |
+
if image_set == 'val':
|
218 |
+
return T.Compose([
|
219 |
+
T.RandomResize([360], max_size=640),
|
220 |
+
normalize,
|
221 |
+
])
|
222 |
+
|
223 |
+
raise ValueError(f'unknown {image_set}')
|
224 |
+
|
225 |
+
|
226 |
+
def build(image_set, args):
|
227 |
+
root = Path(args.ytvos_path)
|
228 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
229 |
+
PATHS = {
|
230 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
231 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
232 |
+
}
|
233 |
+
img_folder, ann_file = PATHS[image_set]
|
234 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
235 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
236 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
237 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
238 |
+
return dataset
|
239 |
+
|
.history/datasets/ytvos_ref_20250116072442.py
ADDED
@@ -0,0 +1,241 @@
1 |
+
"""
|
2 |
+
Ref-YoutubeVOS data loader
|
3 |
+
"""
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from torch.autograd.grad_mode import F
|
8 |
+
from torch.utils.data import Dataset
|
9 |
+
import datasets.transforms_video as T
|
10 |
+
|
11 |
+
import os
|
12 |
+
from PIL import Image
|
13 |
+
import json
|
14 |
+
import numpy as np
|
15 |
+
import random
|
16 |
+
|
17 |
+
from datasets.categories import ytvos_category_dict as category_dict
|
18 |
+
|
19 |
+
|
20 |
+
class YTVOSDataset(Dataset):
|
21 |
+
"""
|
22 |
+
A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
|
23 |
+
"URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
|
24 |
+
(see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
|
25 |
+
The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
|
26 |
+
dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
|
27 |
+
through the Youtube-VOS referring video object segmentation competition page at:
|
28 |
+
https://competitions.codalab.org/competitions/29139
|
29 |
+
Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
|
30 |
+
two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
|
31 |
+
currently only be done on the competition 'validation' subset using the competition's server, as
|
32 |
+
annotations were publicly released only for the 'train' subset of the competition.
|
33 |
+
|
34 |
+
"""
|
35 |
+
def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
|
36 |
+
num_frames: int, max_skip: int):
|
37 |
+
self.img_folder = img_folder
|
38 |
+
self.ann_file = ann_file
|
39 |
+
self._transforms = transforms
|
40 |
+
self.return_masks = return_masks # not used
|
41 |
+
self.num_frames = num_frames
|
42 |
+
self.max_skip = max_skip
|
43 |
+
# create video meta data
|
44 |
+
self.prepare_metas()
|
45 |
+
|
46 |
+
print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
|
47 |
+
print('\n')
|
48 |
+
|
49 |
+
def prepare_metas(self):
|
50 |
+
# read object information
|
51 |
+
with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
|
52 |
+
subset_metas_by_video = json.load(f)['videos']
|
53 |
+
|
54 |
+
# read expression data
|
55 |
+
with open(str(self.ann_file), 'r') as f:
|
56 |
+
subset_expressions_by_video = json.load(f)['videos']
|
57 |
+
self.videos = list(subset_expressions_by_video.keys())
|
58 |
+
|
59 |
+
self.metas = []
|
60 |
+
skip_vid_count = 0
|
61 |
+
|
62 |
+
for vid in self.videos:
|
63 |
+
vid_meta = subset_metas_by_video[vid]
|
64 |
+
vid_data = subset_expressions_by_video[vid]
|
65 |
+
vid_frames = sorted(vid_data['frames'])
|
66 |
+
vid_len = len(vid_frames)
|
67 |
+
|
68 |
+
if vid_len < 11:
|
69 |
+
#print(f"Too short video: {vid} with frame length {vid_len}")
|
70 |
+
skip_vid_count += 1
|
71 |
+
continue
|
72 |
+
|
73 |
+
|
74 |
+
# Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
|
75 |
+
start_idx , end_idx = 2, vid_len-2
|
76 |
+
bin_size = (end_idx - start_idx) // 4
|
77 |
+
|
78 |
+
bins = []
|
79 |
+
for i in range(4):
|
80 |
+
bin_start = start_idx + i * bin_size
|
81 |
+
bin_end = bin_start + bin_size if i < 3 else end_idx
|
82 |
+
|
83 |
+
bins.append((bin_start, bin_end))
|
84 |
+
|
85 |
+
# Random sample one frame from each bin
|
86 |
+
sample_indx = []
|
87 |
+
for start_idx, end_idx in bins:
|
88 |
+
sample_indx.append(random.randint(start_idx, end_idx - 1))
|
89 |
+
sample_indx.sort() # Ensure indices are in order
|
90 |
+
|
91 |
+
|
92 |
+
meta = {
|
93 |
+
'video':vid,
|
94 |
+
'sample_indx':sample_indx,
|
95 |
+
'bins':bins,
|
96 |
+
'frames':vid_frames
|
97 |
+
}
|
98 |
+
obj_id_cat = {}
|
99 |
+
for exp_id, exp_dict in vid_data['expressions'].items():
|
100 |
+
obj_id = exp_dict['obj_id']
|
101 |
+
if obj_id not in obj_id_cat:
|
102 |
+
obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
|
103 |
+
meta['obj_id_cat'] = obj_id_cat
|
104 |
+
self.metas.append(meta)
|
105 |
+
|
106 |
+
print(f"skipped {skip_vid_count} short videos")
|
107 |
+
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def bounding_box(img):
|
111 |
+
rows = np.any(img, axis=1)
|
112 |
+
cols = np.any(img, axis=0)
|
113 |
+
rmin, rmax = np.where(rows)[0][[0, -1]]
|
114 |
+
cmin, cmax = np.where(cols)[0][[0, -1]]
|
115 |
+
return rmin, rmax, cmin, cmax # y1, y2, x1, x2
|
116 |
+
|
117 |
+
def __len__(self):
|
118 |
+
return len(self.metas)
|
119 |
+
|
120 |
+
def __getitem__(self, idx):
|
121 |
+
instance_check = False
|
122 |
+
while not instance_check:
|
123 |
+
meta = self.metas[idx] # dict
|
124 |
+
|
125 |
+
video, sample_indx, bins, frames, obj_id_cat = \
|
126 |
+
meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
|
127 |
+
|
128 |
+
# read frames and masks
|
129 |
+
imgs, labels, boxes, masks, valid = [], [], [], [], []
|
130 |
+
for frame_indx in sample_indx:
|
131 |
+
frame_name = frames[frame_indx]
|
132 |
+
img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
|
133 |
+
mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
|
134 |
+
img = Image.open(img_path).convert('RGB')
|
135 |
+
imgs.append(img)
|
136 |
+
|
137 |
+
mask = Image.open(mask_path).convert('P')
|
138 |
+
mask = np.array(mask)
|
139 |
+
|
140 |
+
# create the target
|
141 |
+
for obj_id in list(obj_id_cat.keys()):
|
142 |
+
obj_mask = (mask==obj_id).astype(np.float32) # 0,1 binary
|
143 |
+
if (obj_mask > 0).any():
|
144 |
+
y1, y2, x1, x2 = self.bounding_box(obj_mask)  # box of this object's binary mask
|
145 |
+
box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
|
146 |
+
valid.append(1)
|
147 |
+
else: # some frame didn't contain the instance
|
148 |
+
box = torch.tensor([0, 0, 0, 0]).to(torch.float)
|
149 |
+
valid.append(0)
|
150 |
+
obj_mask = torch.from_numpy(obj_mask)
|
151 |
+
|
152 |
+
# append
|
153 |
+
masks.append(obj_mask)
|
154 |
+
boxes.append(box)
|
155 |
+
|
156 |
+
|
157 |
+
# transform
|
158 |
+
w, h = img.size
|
159 |
+
boxes = torch.stack(boxes, dim=0)
|
160 |
+
boxes[:, 0::2].clamp_(min=0, max=w)
|
161 |
+
boxes[:, 1::2].clamp_(min=0, max=h)
|
162 |
+
masks = torch.stack(masks, dim=0)
|
163 |
+
target = {
|
164 |
+
'frames_idx': sample_indx, # [T,]
|
165 |
+
'boxes': boxes, # [T, 4], xyxy
|
166 |
+
'masks': masks, # [T, H, W]
|
167 |
+
'valid': torch.tensor(valid), # [T,]
|
168 |
+
'obj_ids' : list(obj_id_cat.keys()),
|
169 |
+
'orig_size': torch.as_tensor([int(h), int(w)]),
|
170 |
+
'size': torch.as_tensor([int(h), int(w)])
|
171 |
+
}
|
172 |
+
|
173 |
+
# "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
|
174 |
+
if self._transforms:
|
175 |
+
imgs, target = self._transforms(imgs, target)
|
176 |
+
imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
|
177 |
+
else:
|
178 |
+
imgs = np.array(imgs)
|
179 |
+
imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
|
180 |
+
|
181 |
+
|
182 |
+
# FIXME: handle "valid", since some box may be removed due to random crop
|
183 |
+
if torch.any(target['valid'] == 1): # at least one instance
|
184 |
+
instance_check = True
|
185 |
+
else:
|
186 |
+
idx = random.randint(0, self.__len__() - 1)
|
187 |
+
|
188 |
+
return imgs, target
|
189 |
+
|
190 |
+
|
191 |
+
def make_coco_transforms(image_set, max_size=640):
|
192 |
+
normalize = T.Compose([
|
193 |
+
T.ToTensor(),
|
194 |
+
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
|
195 |
+
])
|
196 |
+
|
197 |
+
scales = [288, 320, 352, 392, 416, 448, 480, 512]
|
198 |
+
|
199 |
+
if image_set == 'train':
|
200 |
+
return T.Compose([
|
201 |
+
T.RandomHorizontalFlip(),
|
202 |
+
T.PhotometricDistort(),
|
203 |
+
T.RandomSelect(
|
204 |
+
T.Compose([
|
205 |
+
T.RandomResize(scales, max_size=max_size),
|
206 |
+
T.Check(),
|
207 |
+
]),
|
208 |
+
T.Compose([
|
209 |
+
T.RandomResize([400, 500, 600]),
|
210 |
+
T.RandomSizeCrop(384, 600),
|
211 |
+
T.RandomResize(scales, max_size=max_size),
|
212 |
+
T.Check(),
|
213 |
+
])
|
214 |
+
),
|
215 |
+
normalize,
|
216 |
+
])
|
217 |
+
|
218 |
+
# we do not use the 'val' set since the annotations are inaccessible
|
219 |
+
if image_set == 'val':
|
220 |
+
return T.Compose([
|
221 |
+
T.RandomResize([360], max_size=640),
|
222 |
+
normalize,
|
223 |
+
])
|
224 |
+
|
225 |
+
raise ValueError(f'unknown {image_set}')
|
226 |
+
|
227 |
+
|
228 |
+
def build(image_set, args):
|
229 |
+
root = Path(args.ytvos_path)
|
230 |
+
assert root.exists(), f'provided YTVOS path {root} does not exist'
|
231 |
+
PATHS = {
|
232 |
+
"train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
|
233 |
+
"val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
|
234 |
+
}
|
235 |
+
img_folder, ann_file = PATHS[image_set]
|
236 |
+
# dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
|
237 |
+
# num_frames=args.num_frames, max_skip=args.max_skip)
|
238 |
+
dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
|
239 |
+
num_frames=args.num_frames, max_skip=args.max_skip)
|
240 |
+
return dataset
|
241 |
+
|
.history/slurm_script/mbench_ref-ytvos_json_20250113182526.sh
ADDED
File without changes
|
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2020 - present, Facebook, Inc

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
ADDED
@@ -0,0 +1,214 @@
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
[PyTorch](https://pytorch.org/)

[PapersWithCode: Referring Expression Segmentation on Refer-YouTube-VOS](https://paperswithcode.com/sota/referring-expression-segmentation-on-refer-1?p=language-as-queries-for-referring-video)
[PapersWithCode: Referring Expression Segmentation on A2D-Sentences](https://paperswithcode.com/sota/referring-expression-segmentation-on-a2d?p=language-as-queries-for-referring-video)

The official implementation of the **CVPR2022** paper:

<div align="center">
<h1>
<b>
Language as Queries for Referring <br> Video Object Segmentation
</b>
</h1>
</div>

<p align="center"><img src="docs/network.png" width="800"/></p>

> [**Language as Queries for Referring Video Object Segmentation**](https://arxiv.org/abs/2201.00487)
>
> Jiannan Wu, Yi Jiang, Peize Sun, Zehuan Yuan, Ping Luo

### Abstract

In this work, we propose a simple and unified framework built upon Transformer, termed ReferFormer. It views the language as queries and directly attends to the most relevant regions in the video frames. Concretely, we introduce a small set of object queries conditioned on the language as the input to the Transformer. In this manner, all the queries are obligated to find the referred objects only. They are eventually transformed into dynamic kernels which capture the crucial object-level information, and play the role of convolution filters to generate the segmentation masks from feature maps. The object tracking is achieved naturally by linking the corresponding queries across frames. This mechanism greatly simplifies the pipeline and the end-to-end framework is significantly different from the previous methods. Extensive experiments on Ref-Youtube-VOS, Ref-DAVIS17, A2D-Sentences and JHMDB-Sentences show the effectiveness of ReferFormer.

## Update
- **(2022/12/19)** We add the results on the RefCOCO/+/g validation sets.
- **(2022/07/31)** We upload the files for joint training.
- **(2022/04/04)** We upload the data conversion and main files for pre-training.
- **(2022/03/11)** We upload the model on Ref-Youtube-VOS obtained by jointly training Ref-Youtube-VOS and Ref-COCO/+/g, which leads to higher performance.
- **(2022/03/03)** ReferFormer is accepted by CVPR2022. 👏

## Demo

- Ref-DAVIS17

<img src="docs/davis_demo1.gif" width="400"/><img src="docs/davis_demo2.gif" width="400"/>

- Ref-Youtube-VOS

<img src="docs/ytvos_demo1.gif" width="400"/><img src="docs/ytvos_demo2.gif" width="400"/>


## Requirements

We have tested the code in the following environment; other versions may also be compatible:

- CUDA 11.1
- Python 3.7
- PyTorch 1.8.1


## Installation

Please refer to [install.md](docs/install.md) for installation.

## Data Preparation

Please refer to [data.md](docs/data.md) for data preparation.

We provide the pretrained weights for different visual backbones. You may download them [here](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/wjn922_connect_hku_hk/Et657S8tgGRNguj2hf4azsUBn1UVbMNLAmyjcRWGobs2_A?e=xobQFH) (or from [Google Drive](https://drive.google.com/drive/u/0/folders/11_qps3q75aH41IYHlXToyeIBUKkfdqso)) and put them in the directory `pretrained_weights`.

<!-- For the Swin Transformer and Video Swin Transformer backbones, the weights are initialized using the pretrained models provided in the repos [Swin-Transformer](https://github.com/microsoft/Swin-Transformer) and [Video-Swin-Transformer](https://github.com/SwinTransformer/Video-Swin-Transformer). For your convenience, we upload the pretrained models to the Google Drive folders [swin_pretrained](https://drive.google.com/drive/u/0/folders/1QWLayukDJYAxTFk7NPwerfso3Lrx35NL) and [video_swin_pretrained](https://drive.google.com/drive/u/0/folders/19qb9VbKSjuwgxsiPI3uv06XzQkB5brYM). -->


After the organization, we expect the directory structure to be the following:

```
ReferFormer/
├── data/
│   ├── ref-youtube-vos/
│   ├── ref-davis/
│   ├── a2d_sentences/
│   ├── jhmdb_sentences/
├── davis2017/
├── datasets/
├── models/
├── scripts/
├── tools/
├── util/
├── pretrained_weights/
├── eval_davis.py
├── main.py
├── engine.py
├── inference_ytvos.py
├── inference_davis.py
├── opts.py
...
```

## Model Zoo

All the models are trained using 8 NVIDIA Tesla V100 GPUs. You may change the `--backbone` parameter to use different backbones (see [here](https://github.com/wjn922/ReferFormer/blob/232b4066fb7d10845e4083e6a5a2cc0af5d1757e/opts.py#L31)).

**Note:** If you encounter an `OOM` error, please add the `--use_checkpoint` flag (we use it for the Swin-L, Video-Swin-S and Video-Swin-B models).


### Ref-Youtube-VOS

To evaluate the results, please upload the zip file to the [competition server](https://competitions.codalab.org/competitions/29139#participate-submit_results).

| Backbone| J&F | CFBI J&F | Pretrain | Model | Submission | CFBI Submission |
|
105 |
+
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
|
106 |
+
| ResNet-50 | 55.6 | 59.4 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EepGibYBfyRGt_QedfE9SywBLF3v-bjoxo2R9E9YDqmIcw?e=7J7k1J) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EVRsV76e78lKuekbMLHgwlsBdG09pRVafEuBPN_wKXjJ1Q?e=SMeZlS) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZ8tt46rv4xIjoiUkHGGPjwB1Yi6w2H-9BBVTyINOINmgQ?e=yWbDjp) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZp0dd70UCNGvla2g25lTawB2AZyCDPN7QMl_KeESI5dkQ?e=1BfD2W) |
|
107 |
+
| ResNet-101 | 57.3 | 60.3 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESTAK4QCkMdNkVlQz1dd7GoBo3n_i9K4_FK4YLFBAFvBrg?e=Y3PlD5) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EaHNEx5MWR9HjTNh__W3IlYBIfhGd-nHKrshJ-MOyvofdw?e=shM4Ok) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbrNhmt-wiNIv2tmQ-gOupgBrSBzhM1OJlNvid0J_8cPJg?e=8Fgets) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EWSPiUjgmORMuyaL91ueY1oBl159pO4k7RQYF-9eWrSJ-A?e=81hzDF) |
|
108 |
+
| Swin-T | 58.7 | 61.2 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESdasB6JLydDrs6mf68FrLMBuQBLBF7y_uxdveWl9oK68w?e=H5zeqk) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUxJmp6QYR5LoUK12Wj55E0Bm0o6_9zl3OvOBN5KE9kJkg?e=SRS0qL) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUMveO7cX1VAq48IAk9c6zoBc_Zy5f1kwa5h6C9q4LYt0A?e=iz9uMg) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcnHrx4S5KVPqFYhr9CCARoBftAxdtldaWyGQAougBFnig?e=KG1LDq) |
|
109 |
+
| Swin-L | 62.4 | 63.3 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESngRLeZfV1LtrlZ7x5cVo4BR5_deWfov4Igt28LZGoDew?e=AVAsws) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcCfv66Vl0xDl-rFukByXyQBEFNRTyLeVEKoeWrIvXmjNg?e=GcVTIr) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdI15ujU4UpBilI4wt5lUQQB98JOq6KnMV5GHh77QiAn-w?e=o91ITz) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ETmJUpRGgyFHlGdEhcXqzekBDAfbFTExfHtmA4wHKCOkLw?e=l951Ea) |
|
110 |
+
| Video-Swin-T* | 56.0 | - | - | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EYXL3SKYOsRGtfSN-Wr9JCUBDvcXbbp67Sa4hs5dEDplxw?e=g2hGWo) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUosvwAGikhGsyTPEOELMjEBQM-HZOaJ3fqcJjG2SV-5YA?e=vSUD12) | - |
|
111 |
+
| Video-Swin-T | 59.4 | - | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUNTvEGXlsdLv3gicAbHfN0Ba23kcyy2-Z15IJTDLXKx_A?e=GqAYxT) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZI2zogC5mtDu3KL5MVIaXIBzG3_3yTthoqyxjfTsGrvzA?e=lT5sVp) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EVKtr-5ZK5NIhhTvaUXGdRcBcHEGahAevUh1YCO2nvFfaQ?e=9Am7dc) | - |
|
112 |
+
| Video-Swin-S | 60.1 | - | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Eb015DXX1LhDpiDoojxJTu8BBQ8ACicpVS8gwFStRJDK1w?e=NC368q) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZI2zogC5mtDu3KL5MVIaXIBzG3_3yTthoqyxjfTsGrvzA?e=QEAdwh) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUUJn8Zu7mlCnxLP8eNSbpIBvoEqz88EOg3y9ftQHhAhCw?e=RnSwxX) | - |
|
113 |
+
| Video-Swin-B | 62.9 | - |[weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ETDj4aGm_pRMuz8hLBi9Jy0BEFnsco0Uoz5qQEhWrxdNKQ?e=kKImMX) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EduJ_zS-Vd5Hn1qexxv5_mYBKX_8kRBOeX6dlfhED_GSwg?e=TxTWHb) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EZKduAM1fLpJrLK7l762xZ8BesK7zWKBjR0b9dFbCWhbfQ?e=SlAdyg) | - |
|
114 |
+
|
115 |
+
\* indicates the model is trained from scratch.
|
116 |
+
|
117 |
+
Joint training with Ref-COCO/+/g datasets.
|
118 |
+
| Backbone| J&F | J | F | Model | Submission |
|
119 |
+
| :----: | :----: | :----: | :----: | :----: | :----: |
|
120 |
+
| ResNet-50 | 58.7 | 57.4 | 60.1 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcxDd8USU4BGo_HlgukKiG4BXLvetkjLdi3_-N-3SpjMvw?e=tAPNFv) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EYmORJYVsUJLp8NnbtfnZigBCM-IJ5oomZZrXEbNPhIyww?e=Bh5eYx) |
|
121 |
+
| ResNet-101 | 59.3 | 58.1 | 60.4 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EShgDd650nBBsfoNEiUbybcB84Ma5NydxOucISeCrZmzHw?e=YOSszd) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcW6Lt67k0RCjr_FT2XOxVcBUcrFSlFJo19-YdFZpBxOsg?e=avszXt) |
|
122 |
+
| Swin-L | 64.2 | 62.3 | 66.2 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Ec_qxgvukuBPr-GQ_3gNcX0B8VCHCqIUvXX-0ydtk1s7HQ?e=7X99M1) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbNV0kBQ7ZVDrfRafG6B3CwBbpM-yMJtQ9jI01HwEgWXBQ?e=FzoSrT) |
|
123 |
+
| Video-Swin-T | 62.6 | 59.9 | 63.3 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdCVQzM4HxxIvdZUBLiNpBwBrcPTLlFEqxHVxOzx0geF3A?e=1ZSZvK) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdAT37_CDDZKkbC1U9MDxTYBkR1DVwTn0zxzqEvgrG-5ig?e=6P065H) |
|
124 |
+
| Video-Swin-S | 63.3 | 61.4 | 65.2 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EdYbp2xp-xFFuolQopvILNMBYRq88ksNjpcv-zKfGzHxbA?e=NqRzTf) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EU6d1rGtkfBFkIoA-xUH2koBwdKW2fPCghYTzzd49KvFLQ?e=FMsJLT) |
|
125 |
+
| Video-Swin-B | 64.9 | 62.8 | 67.0 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EY3-adM5LptFj--klo5gWgsBhpSDOps91j-C81sBI8i9Hw?e=n19q0w) | [link](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcSdF-jsBmZLn7iUzc3zXTUBnlfnXDFxPP7mtRbC1ttJwg?e=0wzR0t) |
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
### Ref-DAVIS17
|
131 |
+
|
132 |
+
As described in the paper, we report the results using the model trained on Ref-Youtube-VOS without finetuning.
|
133 |
+
|
134 |
+
| Backbone| J&F | J | F | Model |
|
135 |
+
| :----: | :----: | :----: | :----: | :----: |
|
136 |
+
| ResNet-50 | 58.5 | 55.8 | 61.3 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EVRsV76e78lKuekbMLHgwlsBdG09pRVafEuBPN_wKXjJ1Q?e=SMeZlS) |
|
137 |
+
| Swin-L | 60.5 | 57.6 | 63.4 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EcCfv66Vl0xDl-rFukByXyQBEFNRTyLeVEKoeWrIvXmjNg?e=GcVTIr) |
|
138 |
+
| Video-Swin-B | 61.1 | 58.1 | 64.1 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EduJ_zS-Vd5Hn1qexxv5_mYBKX_8kRBOeX6dlfhED_GSwg?e=TxTWHb) |
|
139 |
+
|
140 |
+
|
141 |
+
### A2D-Sentences
|
142 |
+
|
143 |
+
The pretrained models are the same as those provided for Ref-Youtube-VOS.
|
144 |
+
|
145 |
+
| Backbone| Overall IoU | Mean IoU | mAP | Pretrain | Model |
|
146 |
+
| :----: | :----: | :----: | :----: | :----: | :----: |
|
147 |
+
| Video-Swin-T* | 72.3 | 64.1 | 48.6 | - | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EfJs5WPRKfxEvifnIO3impABNgydbiO5qqI_uCF6LYKlCQ?e=mSRLCQ) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EVJyHq6zy6ZGuxE--K9nECwB333gFkP9vjXKjh9Mt0otcA?e=Kwnngd) |
|
148 |
+
| Video-Swin-T | 77.6 | 69.6 | 52.8 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EUNTvEGXlsdLv3gicAbHfN0Ba23kcyy2-Z15IJTDLXKx_A?e=GqAYxT) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Ed3po2mJGQZHivGwMJJg8oMBumXm3Ye94oPH6wfRFK1d8A?e=NG2E9c) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EfO50qMduZNGvFcYJdRVKzABIJ8ZHhMiKWWvmDM14K9mnw?e=dgInSK) |
|
149 |
+
| Video-Swin-S | 77.7 | 69.8 | 53.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Eb015DXX1LhDpiDoojxJTu8BBQ8ACicpVS8gwFStRJDK1w?e=NC368q) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbAiydTvu41KsMYBEFzy_d8B0Nyy1fIf2tWG7Ao-FYD0Ug?e=tmaVAu) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EZl6sHhFDTBMgVGKVp18sqwBouTTnwPdirWId4PR6klTfg?e=17lDVV) |
|
150 |
+
| Video-Swin-B | 78.6 | 70.3 | 55.0 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ETDj4aGm_pRMuz8hLBi9Jy0BEFnsco0Uoz5qQEhWrxdNKQ?e=kKImMX) | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EeP1aneDDbBCo9HnMTbNjsgBpMqrgfIzJzF_jVROpZ2GWQ?e=YmkNHC) \| [log](https://connecthkuhk-my.sharepoint.com/:t:/g/personal/wjn922_connect_hku_hk/EUnV-O_IAe5Mkyupsd7NosMBxUg8OjqepmQbpbV0PFB4gQ?e=W14suT) |
|
151 |
+
|
152 |
+
\* the model is trained from scratch and set `--num_frames 6`.
|
153 |
+
|
154 |
+
|
155 |
+
### JHMDB-Sentences
|
156 |
+
|
157 |
+
As described in the paper, we report the results using the model trained on A2D-Sentences without finetuning.
|
158 |
+
|
159 |
+
| Backbone| Overall IoU | Mean IoU | mAP | Model |
|
160 |
+
| :----: | :----: | :----: | :----: | :----: |
|
161 |
+
| Video-Swin-T* | 70.0 | 69.3 | 39.1 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EfJs5WPRKfxEvifnIO3impABNgydbiO5qqI_uCF6LYKlCQ?e=mSRLCQ) |
|
162 |
+
| Video-Swin-T | 71.9 | 71.0 | 42.2 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/Ed3po2mJGQZHivGwMJJg8oMBumXm3Ye94oPH6wfRFK1d8A?e=NG2E9c) |
|
163 |
+
| Video-Swin-S | 72.8 | 71.5 | 42.4 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EbAiydTvu41KsMYBEFzy_d8B0Nyy1fIf2tWG7Ao-FYD0Ug?e=tmaVAu) |
|
164 |
+
| Video-Swin-B | 73.0 | 71.8 | 43.7 | [model](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EeP1aneDDbBCo9HnMTbNjsgBpMqrgfIzJzF_jVROpZ2GWQ?e=YmkNHC) |
|
165 |
+
|
166 |
+
\* the model is trained from scratch and set `--num_frames 6`.
|
167 |
+
|
168 |
+
|
169 |
+
### RefCOCO/+/g
|
170 |
+
|
171 |
+
We also support evaluation on the RefCOCO/+/g validation sets using the pretrained weights (num_frames=1).
Specifically, we measure Precision@0.5 and overall IoU (oIoU) for the REC and RIS tasks, respectively; a short sketch of both metrics follows below.

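A rough sketch of how these two metrics are typically computed, assuming xyxy boxes and binary masks; this is an illustration, not the repo's exact evaluation code.

```python
# Illustrative metric sketch (assumptions: xyxy boxes, binary numpy masks).
import numpy as np

def box_iou(a, b):
    """IoU of two boxes in (x1, y1, x2, y2) format."""
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter + 1e-6)

# REC: Precision@0.5 = fraction of expressions whose predicted box has IoU > 0.5 with the ground truth.
def precision_at_05(pred_boxes, gt_boxes):
    hits = [box_iou(p, g) > 0.5 for p, g in zip(pred_boxes, gt_boxes)]
    return float(np.mean(hits))

# RIS: overall IoU = total intersection over total union, accumulated over the whole split.
def overall_iou(pred_masks, gt_masks):
    inter = sum(np.logical_and(p, g).sum() for p, g in zip(pred_masks, gt_masks))
    union = sum(np.logical_or(p, g).sum() for p, g in zip(pred_masks, gt_masks))
    return inter / (union + 1e-6)
```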
174 |
+
REC (referring expression comprehension):
|
175 |
+
|
176 |
+
| Backbone| RefCOCO | RefCOCO+ | RefCOCOg | Model |
|
177 |
+
| :----: | :----: | :----: | :----: | :----: |
|
178 |
+
| ResNet-50 | 85.0 | 79.2 | 79.0 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EepGibYBfyRGt_QedfE9SywBLF3v-bjoxo2R9E9YDqmIcw?e=7J7k1J) |
|
179 |
+
| ResNet-101 | 85.4 | 75.8 | 79.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESTAK4QCkMdNkVlQz1dd7GoBo3n_i9K4_FK4YLFBAFvBrg?e=Y3PlD5) |
|
180 |
+
| Swin-T | 86.7 | 77.2 | 80.6 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESdasB6JLydDrs6mf68FrLMBuQBLBF7y_uxdveWl9oK68w?e=H5zeqk) |
|
181 |
+
| Swin-L | 89.8 | 80.0 | 83.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESngRLeZfV1LtrlZ7x5cVo4BR5_deWfov4Igt28LZGoDew?e=AVAsws) |
|
182 |
+
|
183 |
+
RIS (referring image segmentation):
|
184 |
+
|
185 |
+
| Backbone| RefCOCO | RefCOCO+ | RefCOCOg | Model |
|
186 |
+
| :----: | :----: | :----: | :----: | :----: |
|
187 |
+
| ResNet-50 | 71.1 | 64.1 | 64.1 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/EepGibYBfyRGt_QedfE9SywBLF3v-bjoxo2R9E9YDqmIcw?e=7J7k1J) |
|
188 |
+
| ResNet-101 | 71.8 | 61.1 | 64.9 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESTAK4QCkMdNkVlQz1dd7GoBo3n_i9K4_FK4YLFBAFvBrg?e=Y3PlD5) |
|
189 |
+
| Swin-T | 72.9 | 62.4 | 66.1 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESdasB6JLydDrs6mf68FrLMBuQBLBF7y_uxdveWl9oK68w?e=H5zeqk) |
|
190 |
+
| Swin-L | 77.1 | 65.8 | 69.3 | [weight](https://connecthkuhk-my.sharepoint.com/:u:/g/personal/wjn922_connect_hku_hk/ESngRLeZfV1LtrlZ7x5cVo4BR5_deWfov4Igt28LZGoDew?e=AVAsws) |
|
191 |
+
|
192 |
+
|
193 |
+
## Get Started
|
194 |
+
|
195 |
+
Please see [Ref-Youtube-VOS](docs/Ref-Youtube-VOS.md), [Ref-DAVIS17](docs/Ref-DAVIS17.md), [A2D-Sentences](docs/A2D-Sentences.md) and [JHMDB-Sentences](docs/JHMDB-Sentences.md) for details.
|
196 |
+
|
197 |
+
|
198 |
+
|
199 |
+
## Acknowledgement
|
200 |
+
|
201 |
+
This repo is based on [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR) and [VisTR](https://github.com/Epiphqny/VisTR). We also refer to the repositories [MDETR](https://github.com/ashkamath/mdetr) and [MTTR](https://github.com/mttr2021/MTTR). Thanks for their wonderful works.
|
202 |
+
|
203 |
+
|
204 |
+
## Citation
|
205 |
+
|
206 |
+
```
|
207 |
+
@article{wu2022referformer,
|
208 |
+
title={Language as Queries for Referring Video Object Segmentation},
|
209 |
+
author={Jiannan Wu and Yi Jiang and Peize Sun and Zehuan Yuan and Ping Luo},
|
210 |
+
journal={arXiv preprint arXiv:2201.00487},
|
211 |
+
year={2022},
|
212 |
+
}
|
213 |
+
```
|
214 |
+
|
davis2017/__init__.py
ADDED
@@ -0,0 +1,3 @@
from __future__ import absolute_import

__version__ = '0.1.0'
davis2017/evaluation.py
ADDED
@@ -0,0 +1,110 @@
import sys
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import numpy as np
from davis2017.davis import DAVIS
from davis2017.metrics import db_eval_boundary, db_eval_iou
from davis2017 import utils
from davis2017.results import Results
from scipy.optimize import linear_sum_assignment


class DAVISEvaluation(object):
    def __init__(self, davis_root, task, gt_set, sequences='all', codalab=False):
        """
        Class to evaluate DAVIS sequences from a certain set and for a certain task
        :param davis_root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders.
        :param task: Task to compute the evaluation, choose between semi-supervised or unsupervised.
        :param gt_set: Set to compute the evaluation
        :param sequences: Sequences to consider for the evaluation, 'all' to use all the sequences in a set.
        """
        self.davis_root = davis_root
        self.task = task
        self.dataset = DAVIS(root=davis_root, task=task, subset=gt_set, sequences=sequences, codalab=codalab)

    @staticmethod
    def _evaluate_semisupervised(all_gt_masks, all_res_masks, all_void_masks, metric):
        if all_res_masks.shape[0] > all_gt_masks.shape[0]:
            sys.stdout.write("\nIn your PNG files there is an index higher than the number of objects in the sequence!")
            sys.exit()
        elif all_res_masks.shape[0] < all_gt_masks.shape[0]:
            zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:]))
            all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0)
        j_metrics_res, f_metrics_res = np.zeros(all_gt_masks.shape[:2]), np.zeros(all_gt_masks.shape[:2])
        for ii in range(all_gt_masks.shape[0]):
            if 'J' in metric:
                j_metrics_res[ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks)
            if 'F' in metric:
                f_metrics_res[ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[ii, ...], all_void_masks)
        return j_metrics_res, f_metrics_res

    @staticmethod
    def _evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric, max_n_proposals=20):
        if all_res_masks.shape[0] > max_n_proposals:
            sys.stdout.write(f"\nIn your PNG files there is an index higher than the maximum number ({max_n_proposals}) of proposals allowed!")
            sys.exit()
        elif all_res_masks.shape[0] < all_gt_masks.shape[0]:
            zero_padding = np.zeros((all_gt_masks.shape[0] - all_res_masks.shape[0], *all_res_masks.shape[1:]))
            all_res_masks = np.concatenate([all_res_masks, zero_padding], axis=0)
        j_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1]))
        f_metrics_res = np.zeros((all_res_masks.shape[0], all_gt_masks.shape[0], all_gt_masks.shape[1]))
        for ii in range(all_gt_masks.shape[0]):
            for jj in range(all_res_masks.shape[0]):
                if 'J' in metric:
                    j_metrics_res[jj, ii, :] = db_eval_iou(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks)
                if 'F' in metric:
                    f_metrics_res[jj, ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[jj, ...], all_void_masks)
        if 'J' in metric and 'F' in metric:
            all_metrics = (np.mean(j_metrics_res, axis=2) + np.mean(f_metrics_res, axis=2)) / 2
        else:
            all_metrics = np.mean(j_metrics_res, axis=2) if 'J' in metric else np.mean(f_metrics_res, axis=2)
        row_ind, col_ind = linear_sum_assignment(-all_metrics)
        return j_metrics_res[row_ind, col_ind, :], f_metrics_res[row_ind, col_ind, :]

    def evaluate(self, res_path, metric=('J', 'F'), debug=False):
        metric = metric if isinstance(metric, tuple) or isinstance(metric, list) else [metric]
        if 'T' in metric:
            raise ValueError('Temporal metric not supported!')
        if 'J' not in metric and 'F' not in metric:
            raise ValueError('Metric possible values are J for IoU or F for Boundary')

        # Containers
        metrics_res = {}
        if 'J' in metric:
            metrics_res['J'] = {"M": [], "R": [], "D": [], "M_per_object": {}}
        if 'F' in metric:
            metrics_res['F'] = {"M": [], "R": [], "D": [], "M_per_object": {}}

        # Sweep all sequences
        results = Results(root_dir=res_path)
        for seq in tqdm(list(self.dataset.get_sequences())):
            all_gt_masks, all_void_masks, all_masks_id = self.dataset.get_all_masks(seq, True)
            if self.task == 'semi-supervised':
                all_gt_masks, all_masks_id = all_gt_masks[:, 1:-1, :, :], all_masks_id[1:-1]
            all_res_masks = results.read_masks(seq, all_masks_id)
            if self.task == 'unsupervised':
                j_metrics_res, f_metrics_res = self._evaluate_unsupervised(all_gt_masks, all_res_masks, all_void_masks, metric)
            elif self.task == 'semi-supervised':
                j_metrics_res, f_metrics_res = self._evaluate_semisupervised(all_gt_masks, all_res_masks, None, metric)
            for ii in range(all_gt_masks.shape[0]):
                seq_name = f'{seq}_{ii+1}'
                if 'J' in metric:
                    [JM, JR, JD] = utils.db_statistics(j_metrics_res[ii])
                    metrics_res['J']["M"].append(JM)
                    metrics_res['J']["R"].append(JR)
                    metrics_res['J']["D"].append(JD)
                    metrics_res['J']["M_per_object"][seq_name] = JM
                if 'F' in metric:
                    [FM, FR, FD] = utils.db_statistics(f_metrics_res[ii])
                    metrics_res['F']["M"].append(FM)
                    metrics_res['F']["R"].append(FR)
                    metrics_res['F']["D"].append(FD)
                    metrics_res['F']["M_per_object"][seq_name] = FM

            # Show progress
            if debug:
                sys.stdout.write(seq + '\n')
                sys.stdout.flush()
        return metrics_res
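To make the intended entry point clearer, here is a minimal sketch of how `DAVISEvaluation` might be driven. The paths and the choice of `task`/`gt_set` are placeholder assumptions, not values prescribed by the repo.

```python
# Hypothetical driver for DAVISEvaluation above; paths are placeholders.
import numpy as np
from davis2017.evaluation import DAVISEvaluation

evaluator = DAVISEvaluation(davis_root='data/ref-davis/DAVIS',  # assumed DAVIS-style layout
                            task='semi-supervised',
                            gt_set='val')
metrics = evaluator.evaluate('path/to/predicted/masks', metric=('J', 'F'))

# Aggregate the per-object means into the usual J, F and J&F numbers.
J_mean = np.mean(metrics['J']['M'])
F_mean = np.mean(metrics['F']['M'])
print(f'J: {J_mean:.3f}  F: {F_mean:.3f}  J&F: {(J_mean + F_mean) / 2:.3f}')
```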
davis2017/metrics.py
ADDED
@@ -0,0 +1,197 @@
1 |
+
import math
|
2 |
+
import numpy as np
|
3 |
+
import cv2
|
4 |
+
|
5 |
+
|
6 |
+
def db_eval_iou(annotation, segmentation, void_pixels=None):
|
7 |
+
""" Compute region similarity as the Jaccard Index.
|
8 |
+
Arguments:
|
9 |
+
annotation (ndarray): binary annotation map.
|
10 |
+
segmentation (ndarray): binary segmentation map.
|
11 |
+
void_pixels (ndarray): optional mask with void pixels
|
12 |
+
|
13 |
+
Return:
|
14 |
+
jaccard (float): region similarity
|
15 |
+
"""
|
16 |
+
assert annotation.shape == segmentation.shape, \
|
17 |
+
f'Annotation({annotation.shape}) and segmentation:{segmentation.shape} dimensions do not match.'
|
18 |
+
annotation = annotation.astype(np.bool)
|
19 |
+
segmentation = segmentation.astype(np.bool)
|
20 |
+
|
21 |
+
if void_pixels is not None:
|
22 |
+
assert annotation.shape == void_pixels.shape, \
|
23 |
+
f'Annotation({annotation.shape}) and void pixels:{void_pixels.shape} dimensions do not match.'
|
24 |
+
void_pixels = void_pixels.astype(np.bool)
|
25 |
+
else:
|
26 |
+
void_pixels = np.zeros_like(segmentation)
|
27 |
+
|
28 |
+
# Intersection between all sets
|
29 |
+
inters = np.sum((segmentation & annotation) & np.logical_not(void_pixels), axis=(-2, -1))
|
30 |
+
union = np.sum((segmentation | annotation) & np.logical_not(void_pixels), axis=(-2, -1))
|
31 |
+
|
32 |
+
j = inters / union
|
33 |
+
if j.ndim == 0:
|
34 |
+
j = 1 if np.isclose(union, 0) else j
|
35 |
+
else:
|
36 |
+
j[np.isclose(union, 0)] = 1
|
37 |
+
return j
|
38 |
+
|
39 |
+
|
40 |
+
def db_eval_boundary(annotation, segmentation, void_pixels=None, bound_th=0.008):
|
41 |
+
assert annotation.shape == segmentation.shape
|
42 |
+
if void_pixels is not None:
|
43 |
+
assert annotation.shape == void_pixels.shape
|
44 |
+
if annotation.ndim == 3:
|
45 |
+
n_frames = annotation.shape[0]
|
46 |
+
f_res = np.zeros(n_frames)
|
47 |
+
for frame_id in range(n_frames):
|
48 |
+
void_pixels_frame = None if void_pixels is None else void_pixels[frame_id, :, :, ]
|
49 |
+
f_res[frame_id] = f_measure(segmentation[frame_id, :, :, ], annotation[frame_id, :, :], void_pixels_frame, bound_th=bound_th)
|
50 |
+
elif annotation.ndim == 2:
|
51 |
+
f_res = f_measure(segmentation, annotation, void_pixels, bound_th=bound_th)
|
52 |
+
else:
|
53 |
+
raise ValueError(f'db_eval_boundary does not support tensors with {annotation.ndim} dimensions')
|
54 |
+
return f_res
|
55 |
+
|
56 |
+
|
57 |
+
def f_measure(foreground_mask, gt_mask, void_pixels=None, bound_th=0.008):
|
58 |
+
"""
|
59 |
+
Compute mean,recall and decay from per-frame evaluation.
|
60 |
+
Calculates precision/recall for boundaries between foreground_mask and
|
61 |
+
gt_mask using morphological operators to speed it up.
|
62 |
+
|
63 |
+
Arguments:
|
64 |
+
foreground_mask (ndarray): binary segmentation image.
|
65 |
+
gt_mask (ndarray): binary annotated image.
|
66 |
+
void_pixels (ndarray): optional mask with void pixels
|
67 |
+
|
68 |
+
Returns:
|
69 |
+
F (float): boundaries F-measure
|
70 |
+
"""
|
71 |
+
assert np.atleast_3d(foreground_mask).shape[2] == 1
|
72 |
+
if void_pixels is not None:
|
73 |
+
void_pixels = void_pixels.astype(np.bool)
|
74 |
+
else:
|
75 |
+
void_pixels = np.zeros_like(foreground_mask).astype(np.bool)
|
76 |
+
|
77 |
+
bound_pix = bound_th if bound_th >= 1 else \
|
78 |
+
np.ceil(bound_th * np.linalg.norm(foreground_mask.shape))
|
79 |
+
|
80 |
+
# Get the pixel boundaries of both masks
|
81 |
+
fg_boundary = _seg2bmap(foreground_mask * np.logical_not(void_pixels))
|
82 |
+
gt_boundary = _seg2bmap(gt_mask * np.logical_not(void_pixels))
|
83 |
+
|
84 |
+
from skimage.morphology import disk
|
85 |
+
|
86 |
+
# fg_dil = binary_dilation(fg_boundary, disk(bound_pix))
|
87 |
+
fg_dil = cv2.dilate(fg_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8))
|
88 |
+
# gt_dil = binary_dilation(gt_boundary, disk(bound_pix))
|
89 |
+
gt_dil = cv2.dilate(gt_boundary.astype(np.uint8), disk(bound_pix).astype(np.uint8))
|
90 |
+
|
91 |
+
# Get the intersection
|
92 |
+
gt_match = gt_boundary * fg_dil
|
93 |
+
fg_match = fg_boundary * gt_dil
|
94 |
+
|
95 |
+
# Area of the intersection
|
96 |
+
n_fg = np.sum(fg_boundary)
|
97 |
+
n_gt = np.sum(gt_boundary)
|
98 |
+
|
99 |
+
# % Compute precision and recall
|
100 |
+
if n_fg == 0 and n_gt > 0:
|
101 |
+
precision = 1
|
102 |
+
recall = 0
|
103 |
+
elif n_fg > 0 and n_gt == 0:
|
104 |
+
precision = 0
|
105 |
+
recall = 1
|
106 |
+
elif n_fg == 0 and n_gt == 0:
|
107 |
+
precision = 1
|
108 |
+
recall = 1
|
109 |
+
else:
|
110 |
+
precision = np.sum(fg_match) / float(n_fg)
|
111 |
+
recall = np.sum(gt_match) / float(n_gt)
|
112 |
+
|
113 |
+
# Compute F measure
|
114 |
+
if precision + recall == 0:
|
115 |
+
F = 0
|
116 |
+
else:
|
117 |
+
F = 2 * precision * recall / (precision + recall)
|
118 |
+
|
119 |
+
return F
|
120 |
+
|
121 |
+
|
122 |
+
def _seg2bmap(seg, width=None, height=None):
|
123 |
+
"""
|
124 |
+
From a segmentation, compute a binary boundary map with 1 pixel wide
|
125 |
+
boundaries. The boundary pixels are offset by 1/2 pixel towards the
|
126 |
+
origin from the actual segment boundary.
|
127 |
+
Arguments:
|
128 |
+
seg : Segments labeled from 1..k.
|
129 |
+
width : Width of desired bmap <= seg.shape[1]
|
130 |
+
height : Height of desired bmap <= seg.shape[0]
|
131 |
+
Returns:
|
132 |
+
bmap (ndarray): Binary boundary map.
|
133 |
+
David Martin <[email protected]>
|
134 |
+
January 2003
|
135 |
+
"""
|
136 |
+
|
137 |
+
seg = seg.astype(np.bool)
|
138 |
+
seg[seg > 0] = 1
|
139 |
+
|
140 |
+
assert np.atleast_3d(seg).shape[2] == 1
|
141 |
+
|
142 |
+
width = seg.shape[1] if width is None else width
|
143 |
+
height = seg.shape[0] if height is None else height
|
144 |
+
|
145 |
+
h, w = seg.shape[:2]
|
146 |
+
|
147 |
+
ar1 = float(width) / float(height)
|
148 |
+
ar2 = float(w) / float(h)
|
149 |
+
|
150 |
+
assert not (
|
151 |
+
width > w | height > h | abs(ar1 - ar2) > 0.01
|
152 |
+
), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
|
153 |
+
|
154 |
+
e = np.zeros_like(seg)
|
155 |
+
s = np.zeros_like(seg)
|
156 |
+
se = np.zeros_like(seg)
|
157 |
+
|
158 |
+
e[:, :-1] = seg[:, 1:]
|
159 |
+
s[:-1, :] = seg[1:, :]
|
160 |
+
se[:-1, :-1] = seg[1:, 1:]
|
161 |
+
|
162 |
+
b = seg ^ e | seg ^ s | seg ^ se
|
163 |
+
b[-1, :] = seg[-1, :] ^ e[-1, :]
|
164 |
+
b[:, -1] = seg[:, -1] ^ s[:, -1]
|
165 |
+
b[-1, -1] = 0
|
166 |
+
|
167 |
+
if w == width and h == height:
|
168 |
+
bmap = b
|
169 |
+
else:
|
170 |
+
bmap = np.zeros((height, width))
|
171 |
+
for x in range(w):
|
172 |
+
for y in range(h):
|
173 |
+
if b[y, x]:
|
174 |
+
j = 1 + math.floor((y - 1) + height / h)
|
175 |
+
i = 1 + math.floor((x - 1) + width / h)
|
176 |
+
bmap[j, i] = 1
|
177 |
+
|
178 |
+
return bmap
|
179 |
+
|
180 |
+
|
181 |
+
if __name__ == '__main__':
|
182 |
+
from davis2017.davis import DAVIS
|
183 |
+
from davis2017.results import Results
|
184 |
+
|
185 |
+
dataset = DAVIS(root='input_dir/ref', subset='val', sequences='aerobatics')
|
186 |
+
results = Results(root_dir='examples/osvos')
|
187 |
+
# Test timing F measure
|
188 |
+
for seq in dataset.get_sequences():
|
189 |
+
all_gt_masks, _, all_masks_id = dataset.get_all_masks(seq, True)
|
190 |
+
all_gt_masks, all_masks_id = all_gt_masks[:, 1:-1, :, :], all_masks_id[1:-1]
|
191 |
+
all_res_masks = results.read_masks(seq, all_masks_id)
|
192 |
+
f_metrics_res = np.zeros(all_gt_masks.shape[:2])
|
193 |
+
for ii in range(all_gt_masks.shape[0]):
|
194 |
+
f_metrics_res[ii, :] = db_eval_boundary(all_gt_masks[ii, ...], all_res_masks[ii, ...])
|
195 |
+
|
196 |
+
# Run using to profile code: python -m cProfile -o f_measure.prof metrics.py
|
197 |
+
# snakeviz f_measure.prof
|
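As a quick illustration of the two metric functions defined in `davis2017/metrics.py` above, here is a small, self-contained toy example; the mask shapes and values are made up for demonstration and are not part of the repo.

```python
# Toy example for db_eval_iou / db_eval_boundary from davis2017.metrics (illustrative only).
import numpy as np
from davis2017.metrics import db_eval_iou, db_eval_boundary

gt = np.zeros((2, 64, 64), dtype=np.uint8)    # 2 frames of binary ground-truth masks
pred = np.zeros((2, 64, 64), dtype=np.uint8)  # 2 frames of predicted masks
gt[:, 16:48, 16:48] = 1
pred[:, 20:52, 20:52] = 1

j = db_eval_iou(gt, pred)        # region similarity (Jaccard), one value per frame
f = db_eval_boundary(gt, pred)   # boundary F-measure, one value per frame
print(j.mean(), f.mean())
```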
docs/A2D-Sentences.md
ADDED
@@ -0,0 +1,55 @@
## A2D-Sentences

### Model Zoo

The pretrained models are the same as those provided for Ref-Youtube-VOS.

| Backbone| Overall IoU | Mean IoU | mAP | Pretrain | Model |
|
8 |
+
| :----: | :----: | :----: | :----: | :----: | :----: |
|
9 |
+
| Video-Swin-T* | 72.3 | 64.1 | 48.6 | - | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) \| [log](https://drive.google.com/file/d/1JhsXgcWOYv97u6tpAUnBi9-D3mxcHXzO/view?usp=sharing) |
|
10 |
+
| Video-Swin-T | 77.6 | 69.6 | 52.8 | [weight](https://drive.google.com/file/d/1g9Dm1vLdwpwSKVtIZzWKPUk2-zK3IbQa/view?usp=sharing) | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) \| [log](https://drive.google.com/file/d/1xjevouL3a1gHZN5KHtA07Cpa07R4T1Qi/view?usp=sharing) |
|
11 |
+
| Video-Swin-S | 77.7 | 69.8 | 53.9 | [weight](https://drive.google.com/file/d/1GrhFhsUidsVs7-dhY8NkVgWfBZdeit9C/view?usp=sharing) | [model](https://drive.google.com/file/d/1ng2FAX9J4FyQ7Bq1eeQC9Vvv1W8JZmek/view?usp=sharing) \| [log](https://drive.google.com/file/d/1Uu72THexbtEje4aKXR7Q2Yd4zyPmQsi3/view?usp=sharing) |
|
12 |
+
| Video-Swin-B | 78.6 | 70.3 | 55.0 | [weight](https://drive.google.com/file/d/1MJ1362zjqu-uZdXsSQH6pI1QOFqwv5lY/view?usp=sharing) | [model](https://drive.google.com/file/d/1WlNjKS_Li-1KoUzuPM4MRM4b-oK2Ka7c/view?usp=sharing) \| [log](https://drive.google.com/file/d/1tH-f9_U0gY-iNfXm6GRyttJp3uvm5NQw/view?usp=sharing) |

\* the model is trained from scratch with `--num_frames 6`.


### Inference & Evaluation

```
python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --batch_size 2 --resume [/path/to/model_weight] --backbone [backbone] --eval
```

For example, to evaluate the Video-Swin-Tiny model, run the following command:

```
python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --batch_size 2 --resume a2d_video_swin_tiny.pth --backbone video_swin_t_p4w7 --eval
```

### Training

- Finetune

```
./scripts/dist_train_a2d.sh [/path/to/output_dir] [/path/to/pretrained_weight] --backbone [backbone]
```

For example, to train the Video-Swin-Tiny model, run the following command:
```
./scripts/dist_train_a2d.sh a2d_dirs/video_swin_tiny pretrained_weights/video_swin_tiny_pretrained.pth --backbone video_swin_t_p4w7
```

- Train from scratch

```
python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --epochs 12 --lr_drop 8 10 --dropout 0 --weight_decay 1e-4 --output_dir=[/path/to/output_dir] --backbone [backbone] --backbone_pretrained [/path/to/pretrained backbone weight] [other args]
```

For example, to train the Video-Swin-Tiny model from scratch with the window size set to 6, run the following command:

```
python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file a2d --with_box_refine --freeze_text_encoder --epochs 12 --lr_drop 8 10 --dropout 0 --weight_decay 1e-4 --output_dir a2d_dirs/video_swin_tiny_scratch_frame6 --backbone video_swin_t_p4w7 --backbone_pretrained video_swin_pretrained/swin_tiny_patch244_window877_kinetics400_1k.pth --num_frames 6
```

docs/JHMDB-Sentences.md
ADDED
@@ -0,0 +1,27 @@
## JHMDB-Sentences

### Model Zoo

As described in the paper, we report the results using the model trained on A2D-Sentences without finetuning.

| Backbone| Overall IoU | Mean IoU | mAP | Model |
|
8 |
+
| :----: | :----: | :----: | :----: | :----: |
|
9 |
+
| Video-Swin-T* | 70.0 | 69.3 | 39.1 | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) |
|
10 |
+
| Video-Swin-T | 71.9 | 71.0 | 42.2 | [model](https://drive.google.com/file/d/1z-HO71IcFOZ9A6KD71wAXkbiQgKDpSp7/view?usp=sharing) |
|
11 |
+
| Video-Swin-S | 72.8 | 71.5 | 42.4 | [model](https://drive.google.com/file/d/1ng2FAX9J4FyQ7Bq1eeQC9Vvv1W8JZmek/view?usp=sharing) |
|
12 |
+
| Video-Swin-B | 73.0 | 71.8 | 43.7 | [model](https://drive.google.com/file/d/1WlNjKS_Li-1KoUzuPM4MRM4b-oK2Ka7c/view?usp=sharing) |
|

\* the model is trained from scratch with `--num_frames 6`.


### Inference & Evaluation

```
python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file jhmdb --with_box_refine --freeze_text_encoder --batch_size 2 --resume [/path/to/model_weight] --backbone [backbone] --eval
```

For example, to evaluate the Video-Swin-Tiny model, run the following command:

```
python3 -m torch.distributed.launch --nproc_per_node=8 --use_env main.py --dataset_file jhmdb --with_box_refine --freeze_text_encoder --batch_size 2 --resume a2d_video_swin_tiny.pth --backbone video_swin_t_p4w7 --eval
```
docs/Ref-DAVIS17.md
ADDED
@@ -0,0 +1,24 @@
## Ref-DAVIS17

### Model Zoo

As described in the paper, we report the results using the model trained on Ref-Youtube-VOS without finetuning.

| Backbone| J&F | J | F | Model |
|
8 |
+
| :----: | :----: | :----: | :----: | :----: |
|
9 |
+
| ResNet-50 | 58.5 | 55.8 | 61.3 | [model](https://drive.google.com/file/d/1VKYIbd3tiuLyWkh7ajnIiA3HZ3_IdvxV/view?usp=sharing) |
|
10 |
+
| Swin-L | 60.5 | 57.6 | 63.4 | [model](https://drive.google.com/file/d/1_uwwlWv8AXhHfE8GVId7YtGraznRebaZ/view?usp=sharing) |
|
11 |
+
| Video-Swin-B | 61.1 | 58.1 | 64.1 | [model](https://drive.google.com/file/d/1nw7D3C_RrKTMzwtzjo39snbYLbv73anH/view?usp=sharing) |
|


### Inference & Evaluation

```
./scripts/dist_test_davis.sh [/path/to/output_dir] [/path/to/model_weight] --backbone [backbone]
```

For example, to evaluate the Swin-Large model, run the following command:

```
./scripts/dist_test_davis.sh davis_dirs/swin_large ytvos_swin_large.pth --backbone swin_l_p4w7
```
docs/Ref-Youtube-VOS.md
ADDED
@@ -0,0 +1,83 @@
## Ref-Youtube-VOS

### Model Zoo

To evaluate the results, please upload the zip file to the [competition server](https://competitions.codalab.org/competitions/29139#participate-submit_results).

| Backbone| J&F | CFBI J&F | Pretrain | Model | Submission | CFBI Submission |
|
8 |
+
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
|
9 |
+
| ResNet-50 | 55.6 | 59.4 | [weight](https://drive.google.com/file/d/1mJd5zBUv4EYLOKQ0H87-NeAuInyrn577/view?usp=sharing) | [model](https://drive.google.com/file/d/1VKYIbd3tiuLyWkh7ajnIiA3HZ3_IdvxV/view?usp=sharing) | [link](https://drive.google.com/file/d/1IXKu8a06ppPAVBvy4Y0UfcKhCat4HRJt/view?usp=sharing) | [link](https://drive.google.com/file/d/1VJAKZ_j7kQFpocv_vDzER47CXWwAAE8h/view?usp=sharing) |
|
10 |
+
| ResNet-101 | 57.3 | 60.3 | [weight](https://drive.google.com/file/d/1EMOwwAygdSfTZiVxI4f0UaVd7P6JzmuM/view?usp=sharing) | [model](https://drive.google.com/file/d/1FCHAAMf-HXPhZGTZp748l3pn6FfMyV1L/view?usp=sharing) | [link](https://drive.google.com/file/d/1cFxjVW2RlwjoVYR1M6NlkRpv9L3tPlcZ/view?usp=sharing) | [link](https://drive.google.com/file/d/1RPnFPqf7iiVypc7QbN-ev6s6xfmD-m5c/view?usp=sharing) |
|
11 |
+
| Swin-T | 58.7 | 61.2 | [weight](https://drive.google.com/file/d/155sZm6yE7YQ8Y8Ln0ShaVZKLejYORqTQ/view?usp=sharing) | [model](https://drive.google.com/file/d/19jIbjRRUGDhfnI604Pw7hcGP5DqdvVtl/view?usp=sharing) | [link](https://drive.google.com/file/d/1eZZ-2zz0gdCwPrislGP3WKAHk-RnNY7v/view?usp=sharing) | [link](https://drive.google.com/file/d/1O9B35oieBfo7sRjxTpSyFz52J2AAHLce/view?usp=sharing) |
|
12 |
+
| Swin-L | 62.4 | 63.3 | [weight](https://drive.google.com/file/d/1eJKNHvk_KcFuT4k6Te7HDuuSXH2DVOY5/view?usp=sharing) | [model](https://drive.google.com/file/d/1_uwwlWv8AXhHfE8GVId7YtGraznRebaZ/view?usp=sharing) | [link](https://drive.google.com/file/d/1uxBwbKdlilaCNt-RbdcPj1LshA-WY9Q6/view?usp=sharing) | [link](https://drive.google.com/file/d/16kVmJzv5oXzk3zGcfMcb2sEiN6HTOCmW/view?usp=sharing) |
|
13 |
+
| Video-Swin-T* | 55.8 | - | - | [model](https://drive.google.com/file/d/1vNiQGpKuYfR7F7YKZK7H2HAzljDf9Wuf/view?usp=sharing) | [link](https://drive.google.com/file/d/18G0qIeZndacj3Y0EuyJsZFeFRWJ0_3O_/view?usp=sharing) | - |
|
14 |
+
| Video-Swin-T | 59.4 | - | [weight](https://drive.google.com/file/d/1g9Dm1vLdwpwSKVtIZzWKPUk2-zK3IbQa/view?usp=sharing) | [model](https://drive.google.com/file/d/17RL6o_A57giHT-bMuP7ysUGogueT7wYm/view?usp=sharing) | [link](https://drive.google.com/file/d/1nhjvDWgMWufMGAjOKesgyLRB_-Ct6kXP/view?usp=sharing) | - |
|
15 |
+
| Video-Swin-S | 60.1 | - | [weight](https://drive.google.com/file/d/1GrhFhsUidsVs7-dhY8NkVgWfBZdeit9C/view?usp=sharing) | [model](https://drive.google.com/file/d/1GrhFhsUidsVs7-dhY8NkVgWfBZdeit9C/view?usp=sharing) | [link](https://drive.google.com/file/d/1mhb0UAaJkTFYmGrwXHHJuaXVp-0BSkgm/view?usp=sharing) | - |
|
16 |
+
| Video-Swin-B | 62.9 | - |[weight](https://drive.google.com/file/d/1MJ1362zjqu-uZdXsSQH6pI1QOFqwv5lY/view?usp=sharing) | [model](https://drive.google.com/file/d/1nw7D3C_RrKTMzwtzjo39snbYLbv73anH/view?usp=sharing) | [link](https://drive.google.com/file/d/1dAQdr2RqCxYUmOVQ4jFE-vv5zavNhz7B/view?usp=sharing) | - |
|
17 |
+
|
18 |
+
\* indicates the model is trained from scratch.
|
19 |
+
|
20 |
+
|
21 |
+
Joint training with Ref-COCO/+/g datasets.
|
22 |
+
| Backbone| J&F | J | F | Model | Submission |
|
23 |
+
| :----: | :----: | :----: | :----: | :----: | :----: |
|
24 |
+
| ResNet-50 | 58.7 | 57.4 | 60.1 | [model](https://drive.google.com/file/d/1tXgC_GRmQCvHjhlNoT0uXc_0oQ21d0hk/view?usp=sharing) | [link](https://drive.google.com/file/d/1Vbrl11mBfjwpM-H4DOleyD1i2STCN-SM/view?usp=sharing) |
|
25 |
+
| ResNet-101 | 59.3 | 58.1 | 60.4 | [model](https://drive.google.com/file/d/1LUflgRgwZgTpYr5V9qeDKTIlBjLqHOVj/view?usp=sharing) | [link](https://drive.google.com/file/d/1BANQcqY34SebORZ9_PTF4C-QWuCJl2_W/view?usp=sharing) |
|
26 |
+
| Swin-L | 64.2 | 62.3 | 66.2 | [model](https://drive.google.com/file/d/1JeppEr8m0O9844xncSfSZrYE_NH8oXb7/view?usp=sharing) | [link](https://drive.google.com/file/d/14klluhPeQhhNKl3EBibtiziChSKfBHU0/view?usp=sharing) |
|
27 |
+
| Video-Swin-T | 62.6 | 59.9 | 63.3 | [model](https://drive.google.com/file/d/1rVO2ZC4U4symSh9Ifgg68YGdYBZH00MT/view?usp=sharing) | [link](https://drive.google.com/file/d/1-i67hTmo-qpyICbJ9vbTeQdPaL2VnbXQ/view?usp=sharing) |
|
28 |
+
| Video-Swin-S | 63.3 | 61.4 | 65.2 | [model](https://drive.google.com/file/d/15ifI2yd9oDqMB05DgjhNVMe2MGXVvZnj/view?usp=sharing) | [link](https://drive.google.com/file/d/1II1gZl99FGECkS7DR6B8MszxAKadu-9y/view?usp=sharing) |
|
29 |
+
| Video-Swin-B | 64.9 | 62.8 | 67.0 | [model](https://drive.google.com/file/d/19XO5VoR6qTE3VNLF-IjYzabL-2tb9E14/view?usp=sharing) | [link](https://drive.google.com/file/d/11FTV-B3MkWfl4azNI-aRmiRqQ9TBXG03/view?usp=sharing) |
|
30 |
+
|
31 |
+
### Inference & Evaluation
|
32 |
+
|
33 |
+
|
34 |
+
First, run inference with the trained model.
|
35 |
+
|
36 |
+
```
|
37 |
+
python3 inference_ytvos.py --with_box_refine --binary --freeze_text_encoder --output_dir=[/path/to/output_dir] --resume=[/path/to/model_weight] --backbone [backbone]
|
38 |
+
```
|
39 |
+
|
For example, to evaluate the Swin-Tiny model, run the following command:
40 |
+
```
|
41 |
+
python3 inference_ytvos.py --with_box_refine --binary --freeze_text_encoder --output_dir=ytvos_dirs/swin_tiny --resume=ytvos_swin_tiny.pth --backbone swin_t_p4w7
|
42 |
+
```
|
43 |
+
|
44 |
+
If you want to visualize the predicted masks, you may add `--visualize` to the above command.
|
45 |
+
|
46 |
+
Then, enter the `output_dir`, rename the folder `valid` to `Annotations`, and zip it with the following command:
|
47 |
+
|
48 |
+
```
|
49 |
+
zip -q -r submission.zip Annotations
|
50 |
+
```
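Equivalently, the rename-and-zip step can be scripted. The Python sketch below assumes the inference output was written to `ytvos_dirs/swin_tiny` (a hypothetical `output_dir`; adjust the path to your own run).
```
import shutil
from pathlib import Path

# Hypothetical output directory produced by inference_ytvos.py; change as needed.
output_dir = Path("ytvos_dirs/swin_tiny")

# Rename the prediction folder `valid` to `Annotations`, as the server expects.
valid_dir = output_dir / "valid"
anno_dir = output_dir / "Annotations"
if valid_dir.exists() and not anno_dir.exists():
    valid_dir.rename(anno_dir)

# Zip the Annotations folder (same result as `zip -q -r submission.zip Annotations`).
shutil.make_archive(str(output_dir / "submission"), "zip",
                    root_dir=str(output_dir), base_dir="Annotations")
```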
|
51 |
+
|
52 |
+
To evaluate the results, please upload the zip file to the [competition server](https://competitions.codalab.org/competitions/29139#participate-submit_results).
|
53 |
+
|
54 |
+
### Training
|
55 |
+
|
56 |
+
|
57 |
+
- Finetune
|
58 |
+
|
59 |
+
The following command includes the training and inference stages.
|
60 |
+
|
61 |
+
```
|
62 |
+
./scripts/dist_train_test_ytvos.sh [/path/to/output_dir] [/path/to/pretrained_weight] --backbone [backbone]
|
63 |
+
```
|
64 |
+
|
65 |
+
For example, to train the Video-Swin-Tiny model, run the following command:
|
66 |
+
|
67 |
+
```
|
68 |
+
./scripts/dist_train_test_ytvos.sh ytvos_dirs/video_swin_tiny pretrained_weights/video_swin_tiny_pretrained.pth --backbone video_swin_t_p4w7
|
69 |
+
```
|
70 |
+
|
71 |
+
- Train from scratch
|
72 |
+
|
73 |
+
The following command includes the training and inference stages.
|
74 |
+
|
75 |
+
```
|
76 |
+
./scripts/dist_train_test_ytvos_scratch.sh [/path/to/output_dir] --backbone [backbone] --backbone_pretrained [/path/to/backbone_pretrained_weight] [other args]
|
77 |
+
```
|
78 |
+
|
79 |
+
For example, to train the Video-Swin-Tiny model, run the following command:
|
80 |
+
|
81 |
+
```
|
82 |
+
./scripts/dist_train_test_ytvos_scratch.sh ytvos_dirs/video_swin_tiny_scratch --backbone video_swin_t_p4w7 --backbone_pretrained video_swin_pretrained/swin_tiny_patch244_window877_kinetics400_1k.pth
|
83 |
+
```
|
docs/data.md
ADDED
@@ -0,0 +1,127 @@
1 |
+
# Data Preparation
|
2 |
+
|
3 |
+
Create a new directory `data` to store all the datasets.
|
4 |
+
|
5 |
+
## Ref-COCO
|
6 |
+
|
7 |
+
Download the dataset from the official website [COCO](https://cocodataset.org/#download).
|
8 |
+
RefCOCO/+/g use the COCO2014 train split.
|
9 |
+
Download the annotation files from [github](https://github.com/lichengunc/refer).
|
10 |
+
|
11 |
+
Convert the annotation files:
|
12 |
+
|
13 |
+
```
|
14 |
+
python3 tools/data/convert_refexp_to_coco.py
|
15 |
+
```
|
16 |
+
|
17 |
+
Finally, we expect the directory structure to be the following:
|
18 |
+
|
19 |
+
```
|
20 |
+
ReferFormer
|
21 |
+
├── data
|
22 |
+
│ ├── coco
|
23 |
+
│ │ ├── train2014
|
24 |
+
│ │ ├── refcoco
|
25 |
+
│ │ │ ├── instances_refcoco_train.json
|
26 |
+
│ │ │ ├── instances_refcoco_val.json
|
27 |
+
│ │ ├── refcoco+
|
28 |
+
│ │ │ ├── instances_refcoco+_train.json
|
29 |
+
│ │ │ ├── instances_refcoco+_val.json
|
30 |
+
│ │ ├── refcocog
|
31 |
+
│ │ │ ├── instances_refcocog_train.json
|
32 |
+
│ │ │ ├── instances_refcocog_val.json
|
33 |
+
```
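As a quick sanity check of this layout, a small Python sketch like the one below (the paths are taken from the tree above; nothing here is part of the official tooling) can verify that all annotation files are in place before training.
```
from pathlib import Path

# Paths listed in the directory tree above, relative to the ReferFormer root.
data_root = Path("data/coco")
expected = [
    data_root / "train2014",
    data_root / "refcoco/instances_refcoco_train.json",
    data_root / "refcoco/instances_refcoco_val.json",
    data_root / "refcoco+/instances_refcoco+_train.json",
    data_root / "refcoco+/instances_refcoco+_val.json",
    data_root / "refcocog/instances_refcocog_train.json",
    data_root / "refcocog/instances_refcocog_val.json",
]
missing = [str(p) for p in expected if not p.exists()]
if missing:
    print("Missing entries:\n  " + "\n  ".join(missing))
else:
    print("Ref-COCO data layout looks complete.")
```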
|
34 |
+
|
35 |
+
|
36 |
+
## Ref-Youtube-VOS
|
37 |
+
|
38 |
+
Download the dataset from the competition's website [here](https://competitions.codalab.org/competitions/29139#participate-get_data).
|
39 |
+
Then, extract and organize the file. We expect the directory structure to be the following:
|
40 |
+
|
41 |
+
```
|
42 |
+
ReferFormer
|
43 |
+
├── data
|
44 |
+
│ ├── ref-youtube-vos
|
45 |
+
│ │ ├── meta_expressions
|
46 |
+
│ │ ├── train
|
47 |
+
│ │ │ ├── JPEGImages
|
48 |
+
│ │ │ ├── Annotations
|
49 |
+
│ │ │ ├── meta.json
|
50 |
+
│ │ ├── valid
|
51 |
+
│ │ │ ├── JPEGImages
|
52 |
+
```
|
53 |
+
|
54 |
+
## Ref-DAVIS17
|
55 |
+
|
56 |
+
Download the DAVIS2017 dataset from the [website](https://davischallenge.org/davis2017/code.html). Note that you only need to download the two zip files `DAVIS-2017-Unsupervised-trainval-480p.zip` and `DAVIS-2017_semantics-480p.zip`.
|
57 |
+
Download the text annotations from the [website](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/video-segmentation/video-object-segmentation-with-language-referring-expressions).
|
58 |
+
Then, put the zip files in the directory as follows.
|
59 |
+
|
60 |
+
|
61 |
+
```
|
62 |
+
ReferFormer
|
63 |
+
├── data
|
64 |
+
│ ├── ref-davis
|
65 |
+
│ │ ├── DAVIS-2017_semantics-480p.zip
|
66 |
+
│ │ ├── DAVIS-2017-Unsupervised-trainval-480p.zip
|
67 |
+
│ │ ├── davis_text_annotations.zip
|
68 |
+
```
|
69 |
+
|
70 |
+
Unzip these zip files.
|
71 |
+
```
|
72 |
+
unzip -o davis_text_annotations.zip
|
73 |
+
unzip -o DAVIS-2017_semantics-480p.zip
|
74 |
+
unzip -o DAVIS-2017-Unsupervised-trainval-480p.zip
|
75 |
+
```
|
76 |
+
|
77 |
+
Preprocess the dataset into Ref-Youtube-VOS format (make sure you are in the main directory):
|
78 |
+
|
79 |
+
```
|
80 |
+
python tools/data/convert_davis_to_ytvos.py
|
81 |
+
```
|
82 |
+
|
83 |
+
Finally, unzip the file `DAVIS-2017-Unsupervised-trainval-480p.zip` again (the preprocessing step moves files with `mv` for efficiency, so the original frames need to be restored).
|
84 |
+
|
85 |
+
```
|
86 |
+
unzip -o DAVIS-2017-Unsupervised-trainval-480p.zip
|
87 |
+
```
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
|
92 |
+
## A2D-Sentences
|
93 |
+
|
94 |
+
Follow the instructions and download the dataset from the website [here](https://kgavrilyuk.github.io/publication/actor_action/).
|
95 |
+
Then, extract the files. Additionally, we use the same json annotation files generated by [MTTR](https://github.com/mttr2021/MTTR). Please download these files from [onedrive](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/wjn922_connect_hku_hk/EnvcpWsMsY5NrMF5If3F6DwBseMrqmzQwpTtL8HXoLAChw?e=Vlv1et).
|
96 |
+
We expect the directory structure to be the following:
|
97 |
+
|
98 |
+
```
|
99 |
+
ReferFormer
|
100 |
+
├── data
|
101 |
+
│ ├── a2d_sentences
|
102 |
+
│ │ ├── Release
|
103 |
+
│ │ ├── text_annotations
|
104 |
+
│ │ │ ├── a2d_annotation_with_instances
|
105 |
+
│ │ │ ├── a2d_annotation.txt
|
106 |
+
│ │ │ ├── a2d_missed_videos.txt
|
107 |
+
│ │ ├── a2d_sentences_single_frame_test_annotations.json
|
108 |
+
│ │ ├── a2d_sentences_single_frame_train_annotations.json
|
109 |
+
│ │ ├── a2d_sentences_test_annotations_in_coco_format.json
|
110 |
+
```
|
111 |
+
|
112 |
+
## JHMDB-Sentences
|
113 |
+
|
114 |
+
Follow the instructions and download the dataset from the website [here](https://kgavrilyuk.github.io/publication/actor_action/).
|
115 |
+
Then, extract the files. Additionally, we use the same json annotation files generated by [MTTR](https://github.com/mttr2021/MTTR). Please download these files from [onedrive](https://connecthkuhk-my.sharepoint.com/:f:/g/personal/wjn922_connect_hku_hk/EjPyzXq93s5Jm4GU07JrWIMBb6nObY8fEmLyuiGg-0uBtg?e=GsZ6jP).
|
116 |
+
We expect the directory structure to be the following:
|
117 |
+
|
118 |
+
```
|
119 |
+
ReferFormer
|
120 |
+
├── data
|
121 |
+
│ ├── jhmdb_sentences
|
122 |
+
│ │ ├── Rename_Images
|
123 |
+
│ │ ├── puppet_mask
|
124 |
+
│ │ ├── jhmdb_annotation.txt
|
125 |
+
│ │ ├── jhmdb_sentences_samples_metadata.json
|
126 |
+
│ │ ├── jhmdb_sentences_gt_annotations_in_coco_format.json
|
127 |
+
```
|
engine.py
ADDED
@@ -0,0 +1,253 @@
1 |
+
"""
|
2 |
+
Train and eval functions used in main.py
|
3 |
+
Modified from DETR (https://github.com/facebookresearch/detr)
|
4 |
+
"""
|
5 |
+
import math
|
6 |
+
from models import postprocessors
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
from typing import Iterable
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torch.distributed as dist
|
13 |
+
|
14 |
+
import util.misc as utils
|
15 |
+
from datasets.coco_eval import CocoEvaluator
|
16 |
+
from datasets.refexp_eval import RefExpEvaluator
|
17 |
+
|
18 |
+
from pycocotools.coco import COCO
|
19 |
+
from pycocotools.cocoeval import COCOeval
|
20 |
+
from datasets.a2d_eval import calculate_precision_at_k_and_iou_metrics, calculate_bbox_precision_at_k_and_iou_metrics
|
21 |
+
|
22 |
+
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
|
23 |
+
data_loader: Iterable, optimizer: torch.optim.Optimizer,
|
24 |
+
device: torch.device, epoch: int, max_norm: float = 0):
|
25 |
+
model.train()
|
26 |
+
criterion.train()
|
27 |
+
metric_logger = utils.MetricLogger(delimiter=" ")
|
28 |
+
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
|
29 |
+
header = 'Epoch: [{}]'.format(epoch)
|
30 |
+
print_freq = 10
|
31 |
+
for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
|
32 |
+
samples = samples.to(device)
|
33 |
+
captions = [t["caption"] for t in targets]
|
34 |
+
targets = utils.targets_to(targets, device)
|
35 |
+
|
36 |
+
outputs = model(samples, captions, targets)
|
37 |
+
loss_dict = criterion(outputs, targets)
|
38 |
+
|
39 |
+
weight_dict = criterion.weight_dict
|
40 |
+
losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
|
41 |
+
|
42 |
+
# reduce losses over all GPUs for logging purposes
|
43 |
+
loss_dict_reduced = utils.reduce_dict(loss_dict)
|
44 |
+
loss_dict_reduced_unscaled = {f'{k}_unscaled': v
|
45 |
+
for k, v in loss_dict_reduced.items()}
|
46 |
+
loss_dict_reduced_scaled = {k: v * weight_dict[k]
|
47 |
+
for k, v in loss_dict_reduced.items() if k in weight_dict}
|
48 |
+
losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
|
49 |
+
|
50 |
+
loss_value = losses_reduced_scaled.item()
|
51 |
+
|
52 |
+
if not math.isfinite(loss_value):
|
53 |
+
print("Loss is {}, stopping training".format(loss_value))
|
54 |
+
print(loss_dict_reduced)
|
55 |
+
sys.exit(1)
|
56 |
+
optimizer.zero_grad()
|
57 |
+
losses.backward()
|
58 |
+
if max_norm > 0:
|
59 |
+
grad_total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
|
60 |
+
else:
|
61 |
+
grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm)
|
62 |
+
optimizer.step()
|
63 |
+
|
64 |
+
metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)
|
65 |
+
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
|
66 |
+
metric_logger.update(grad_norm=grad_total_norm)
|
67 |
+
|
68 |
+
# gather the stats from all processes
|
69 |
+
metric_logger.synchronize_between_processes()
|
70 |
+
print("Averaged stats:", metric_logger)
|
71 |
+
return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
|
72 |
+
|
73 |
+
|
74 |
+
@torch.no_grad()
|
75 |
+
def evaluate(model, criterion, postprocessors, data_loader, evaluator_list, device, args):
|
76 |
+
model.eval()
|
77 |
+
criterion.eval()
|
78 |
+
|
79 |
+
metric_logger = utils.MetricLogger(delimiter=" ")
|
80 |
+
header = 'Test:'
|
81 |
+
|
82 |
+
predictions = []
|
83 |
+
for samples, targets in metric_logger.log_every(data_loader, 10, header):
|
84 |
+
dataset_name = targets[0]["dataset_name"]
|
85 |
+
samples = samples.to(device)
|
86 |
+
captions = [t["caption"] for t in targets]
|
87 |
+
targets = utils.targets_to(targets, device)
|
88 |
+
|
89 |
+
outputs = model(samples, captions, targets)
|
90 |
+
loss_dict = criterion(outputs, targets)
|
91 |
+
weight_dict = criterion.weight_dict
|
92 |
+
|
93 |
+
# reduce losses over all GPUs for logging purposes
|
94 |
+
loss_dict_reduced = utils.reduce_dict(loss_dict)
|
95 |
+
loss_dict_reduced_scaled = {k: v * weight_dict[k]
|
96 |
+
for k, v in loss_dict_reduced.items() if k in weight_dict}
|
97 |
+
loss_dict_reduced_unscaled = {f'{k}_unscaled': v
|
98 |
+
for k, v in loss_dict_reduced.items()}
|
99 |
+
metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()),
|
100 |
+
**loss_dict_reduced_scaled,
|
101 |
+
**loss_dict_reduced_unscaled)
|
102 |
+
|
103 |
+
orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
|
104 |
+
results = postprocessors['bbox'](outputs, orig_target_sizes)
|
105 |
+
if 'segm' in postprocessors.keys():
|
106 |
+
target_sizes = torch.stack([t["size"] for t in targets], dim=0)
|
107 |
+
results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes)
|
108 |
+
res = {target['image_id'].item(): output for target, output in zip(targets, results)}
|
109 |
+
|
110 |
+
for evaluator in evaluator_list:
|
111 |
+
evaluator.update(res)
|
112 |
+
|
113 |
+
# REC & RES predictions
|
114 |
+
for p, target in zip(results, targets):
|
115 |
+
for s, b, m in zip(p['scores'], p['boxes'], p['rle_masks']):
|
116 |
+
predictions.append({'image_id': target['image_id'].item(),
|
117 |
+
'category_id': 1, # dummy label, as categories are not predicted in ref-vos
|
118 |
+
'bbox': b.tolist(),
|
119 |
+
'segmentation': m,
|
120 |
+
'score': s.item()})
|
121 |
+
|
122 |
+
|
123 |
+
# gather the stats from all processes
|
124 |
+
metric_logger.synchronize_between_processes()
|
125 |
+
print("Averaged stats:", metric_logger)
|
126 |
+
for evaluator in evaluator_list:
|
127 |
+
evaluator.synchronize_between_processes()
|
128 |
+
|
129 |
+
# accumulate predictions from all images
|
130 |
+
refexp_res = None
|
131 |
+
for evaluator in evaluator_list:
|
132 |
+
if isinstance(evaluator, CocoEvaluator):
|
133 |
+
evaluator.accumulate()
|
134 |
+
evaluator.summarize()
|
135 |
+
elif isinstance(evaluator, RefExpEvaluator):
|
136 |
+
refexp_res = evaluator.summarize()
|
137 |
+
|
138 |
+
stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
|
139 |
+
|
140 |
+
# update stats
|
141 |
+
for evaluator in evaluator_list:
|
142 |
+
if isinstance(evaluator, CocoEvaluator):
|
143 |
+
if "bbox" in postprocessors.keys():
|
144 |
+
stats["coco_eval_bbox"] = evaluator.coco_eval["bbox"].stats.tolist()
|
145 |
+
if "segm" in postprocessors.keys():
|
146 |
+
stats["coco_eval_masks"] = evaluator.coco_eval["segm"].stats.tolist()
|
147 |
+
if refexp_res is not None:
|
148 |
+
stats.update(refexp_res)
|
149 |
+
|
150 |
+
# evaluate RES
|
151 |
+
# gather and merge predictions from all gpus
|
152 |
+
gathered_pred_lists = utils.all_gather(predictions)
|
153 |
+
predictions = [p for p_list in gathered_pred_lists for p in p_list]
|
154 |
+
|
155 |
+
eval_metrics = {}
|
156 |
+
if utils.is_main_process():
|
157 |
+
if dataset_name == 'refcoco':
|
158 |
+
coco_gt = COCO(os.path.join(args.coco_path, 'refcoco/instances_refcoco_val.json'))
|
159 |
+
elif dataset_name == 'refcoco+':
|
160 |
+
coco_gt = COCO(os.path.join(args.coco_path, 'refcoco+/instances_refcoco+_val.json'))
|
161 |
+
elif dataset_name == 'refcocog':
|
162 |
+
coco_gt = COCO(os.path.join(args.coco_path, 'refcocog/instances_refcocog_val.json'))
|
163 |
+
else:
|
164 |
+
raise NotImplementedError
|
165 |
+
coco_pred = coco_gt.loadRes(predictions)
|
166 |
+
coco_eval = COCOeval(coco_gt, coco_pred, iouType='segm')
|
167 |
+
coco_eval.params.useCats = 0 # ignore categories as they are not predicted in ref-vos task
|
168 |
+
coco_eval.evaluate()
|
169 |
+
coco_eval.accumulate()
|
170 |
+
coco_eval.summarize()
|
171 |
+
# ap_labels = ['mAP 0.5:0.95', 'AP 0.5', 'AP 0.75', 'AP 0.5:0.95 S', 'AP 0.5:0.95 M', 'AP 0.5:0.95 L']
|
172 |
+
# ap_metrics = coco_eval.stats[:6]
|
173 |
+
# eval_metrics = {l: m for l, m in zip(ap_labels, ap_metrics)}
|
174 |
+
# Precision and IOU
|
175 |
+
# bbox
|
176 |
+
precision_at_k, overall_iou, mean_iou = calculate_bbox_precision_at_k_and_iou_metrics(coco_gt, coco_pred)
|
177 |
+
eval_metrics.update({f'bbox P@{k}': m for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)})
|
178 |
+
eval_metrics.update({'bbox overall_iou': overall_iou, 'bbox mean_iou': mean_iou})
|
179 |
+
# mask
|
180 |
+
precision_at_k, overall_iou, mean_iou = calculate_precision_at_k_and_iou_metrics(coco_gt, coco_pred)
|
181 |
+
eval_metrics.update({f'segm P@{k}': m for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)})
|
182 |
+
eval_metrics.update({'segm overall_iou': overall_iou, 'segm mean_iou': mean_iou})
|
183 |
+
print(eval_metrics)
|
184 |
+
stats.update(eval_metrics)
|
185 |
+
|
186 |
+
return stats
|
187 |
+
|
188 |
+
|
189 |
+
@torch.no_grad()
|
190 |
+
def evaluate_a2d(model, data_loader, postprocessor, device, args):
|
191 |
+
model.eval()
|
192 |
+
predictions = []
|
193 |
+
metric_logger = utils.MetricLogger(delimiter=" ")
|
194 |
+
header = 'Test:'
|
195 |
+
|
196 |
+
for samples, targets in metric_logger.log_every(data_loader, 10, header):
|
197 |
+
image_ids = [t['image_id'] for t in targets]
|
198 |
+
|
199 |
+
samples = samples.to(device)
|
200 |
+
captions = [t["caption"] for t in targets]
|
201 |
+
targets = utils.targets_to(targets, device)
|
202 |
+
|
203 |
+
outputs = model(samples, captions, targets)
|
204 |
+
|
205 |
+
orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
|
206 |
+
target_sizes = torch.stack([t["size"] for t in targets], dim=0)
|
207 |
+
processed_outputs = postprocessor(outputs, orig_target_sizes, target_sizes)
|
208 |
+
|
209 |
+
for p, image_id in zip(processed_outputs, image_ids):
|
210 |
+
for s, m in zip(p['scores'], p['rle_masks']):
|
211 |
+
predictions.append({'image_id': image_id,
|
212 |
+
'category_id': 1, # dummy label, as categories are not predicted in ref-vos
|
213 |
+
'segmentation': m,
|
214 |
+
'score': s.item()})
|
215 |
+
|
216 |
+
# gather and merge predictions from all gpus
|
217 |
+
gathered_pred_lists = utils.all_gather(predictions)
|
218 |
+
predictions = [p for p_list in gathered_pred_lists for p in p_list]
|
219 |
+
# evaluation
|
220 |
+
eval_metrics = {}
|
221 |
+
if utils.is_main_process():
|
222 |
+
if args.dataset_file == 'a2d':
|
223 |
+
coco_gt = COCO(os.path.join(args.a2d_path, 'a2d_sentences_test_annotations_in_coco_format.json'))
|
224 |
+
elif args.dataset_file == 'jhmdb':
|
225 |
+
coco_gt = COCO(os.path.join(args.jhmdb_path, 'jhmdb_sentences_gt_annotations_in_coco_format.json'))
|
226 |
+
else:
|
227 |
+
raise NotImplementedError
|
228 |
+
coco_pred = coco_gt.loadRes(predictions)
|
229 |
+
coco_eval = COCOeval(coco_gt, coco_pred, iouType='segm')
|
230 |
+
coco_eval.params.useCats = 0 # ignore categories as they are not predicted in ref-vos task
|
231 |
+
coco_eval.evaluate()
|
232 |
+
coco_eval.accumulate()
|
233 |
+
coco_eval.summarize()
|
234 |
+
ap_labels = ['mAP 0.5:0.95', 'AP 0.5', 'AP 0.75', 'AP 0.5:0.95 S', 'AP 0.5:0.95 M', 'AP 0.5:0.95 L']
|
235 |
+
ap_metrics = coco_eval.stats[:6]
|
236 |
+
eval_metrics = {l: m for l, m in zip(ap_labels, ap_metrics)}
|
237 |
+
# Precision and IOU
|
238 |
+
precision_at_k, overall_iou, mean_iou = calculate_precision_at_k_and_iou_metrics(coco_gt, coco_pred)
|
239 |
+
eval_metrics.update({f'P@{k}': m for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)})
|
240 |
+
eval_metrics.update({'overall_iou': overall_iou, 'mean_iou': mean_iou})
|
241 |
+
print(eval_metrics)
|
242 |
+
|
243 |
+
# sync all processes before starting a new epoch or exiting
|
244 |
+
dist.barrier()
|
245 |
+
return eval_metrics
|
246 |
+
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
|
253 |
+
|
eval_davis.py
ADDED
@@ -0,0 +1,68 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
from time import time
|
5 |
+
import argparse
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
from davis2017.evaluation import DAVISEvaluation
|
10 |
+
|
11 |
+
default_davis_path = 'data/ref-davis/DAVIS'
|
12 |
+
|
13 |
+
time_start = time()
|
14 |
+
parser = argparse.ArgumentParser()
|
15 |
+
parser.add_argument('--davis_path', type=str, help='Path to the DAVIS folder containing the JPEGImages, Annotations, '
|
16 |
+
'ImageSets, Annotations_unsupervised folders',
|
17 |
+
required=False, default=default_davis_path)
|
18 |
+
parser.add_argument('--set', type=str, help='Subset to evaluate the results', default='val') # val subset
|
19 |
+
parser.add_argument('--task', type=str, help='Task to evaluate the results', default='unsupervised',
|
20 |
+
choices=['semi-supervised', 'unsupervised'])
|
21 |
+
parser.add_argument('--results_path', type=str, help='Path to the folder containing the sequences folders',
|
22 |
+
required=True)
|
23 |
+
args, _ = parser.parse_known_args()
|
24 |
+
csv_name_global = f'global_results-{args.set}.csv'
|
25 |
+
csv_name_per_sequence = f'per-sequence_results-{args.set}.csv'
|
26 |
+
|
27 |
+
# Check if the method has been evaluated before, if so read the results, otherwise compute the results
|
28 |
+
csv_name_global_path = os.path.join(args.results_path, csv_name_global)
|
29 |
+
csv_name_per_sequence_path = os.path.join(args.results_path, csv_name_per_sequence)
|
30 |
+
if os.path.exists(csv_name_global_path) and os.path.exists(csv_name_per_sequence_path):
|
31 |
+
print('Using precomputed results...')
|
32 |
+
table_g = pd.read_csv(csv_name_global_path)
|
33 |
+
table_seq = pd.read_csv(csv_name_per_sequence_path)
|
34 |
+
else:
|
35 |
+
print(f'Evaluating sequences for the {args.task} task...')
|
36 |
+
# Create dataset and evaluate
|
37 |
+
dataset_eval = DAVISEvaluation(davis_root=args.davis_path, task=args.task, gt_set=args.set)
|
38 |
+
metrics_res = dataset_eval.evaluate(args.results_path)
|
39 |
+
J, F = metrics_res['J'], metrics_res['F']
|
40 |
+
|
41 |
+
# Generate dataframe for the general results
|
42 |
+
g_measures = ['J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay']
|
43 |
+
final_mean = (np.mean(J["M"]) + np.mean(F["M"])) / 2.
|
44 |
+
g_res = np.array([final_mean, np.mean(J["M"]), np.mean(J["R"]), np.mean(J["D"]), np.mean(F["M"]), np.mean(F["R"]),
|
45 |
+
np.mean(F["D"])])
|
46 |
+
g_res = np.reshape(g_res, [1, len(g_res)])
|
47 |
+
table_g = pd.DataFrame(data=g_res, columns=g_measures)
|
48 |
+
with open(csv_name_global_path, 'w') as f:
|
49 |
+
table_g.to_csv(f, index=False, float_format="%.5f")
|
50 |
+
print(f'Global results saved in {csv_name_global_path}')
|
51 |
+
|
52 |
+
# Generate a dataframe for the per sequence results
|
53 |
+
seq_names = list(J['M_per_object'].keys())
|
54 |
+
seq_measures = ['Sequence', 'J-Mean', 'F-Mean']
|
55 |
+
J_per_object = [J['M_per_object'][x] for x in seq_names]
|
56 |
+
F_per_object = [F['M_per_object'][x] for x in seq_names]
|
57 |
+
table_seq = pd.DataFrame(data=list(zip(seq_names, J_per_object, F_per_object)), columns=seq_measures)
|
58 |
+
with open(csv_name_per_sequence_path, 'w') as f:
|
59 |
+
table_seq.to_csv(f, index=False, float_format="%.5f")
|
60 |
+
print(f'Per-sequence results saved in {csv_name_per_sequence_path}')
|
61 |
+
|
62 |
+
# Print the results
|
63 |
+
sys.stdout.write(f"--------------------------- Global results for {args.set} ---------------------------\n")
|
64 |
+
print(table_g.to_string(index=False))
|
65 |
+
sys.stdout.write(f"\n---------- Per sequence results for {args.set} ----------\n")
|
66 |
+
print(table_seq.to_string(index=False))
|
67 |
+
total_time = time() - time_start
|
68 |
+
sys.stdout.write('\nTotal time:' + str(total_time))
|
jptr_chaeyun.txt
ADDED
@@ -0,0 +1,179 @@
1 |
+
[I 2025-02-06 14:30:15.041 ServerApp] Extension package jupyter_lsp took 0.2983s to import
|
2 |
+
[I 2025-02-06 14:30:16.739 ServerApp] jupyter_lsp | extension was successfully linked.
|
3 |
+
[I 2025-02-06 14:30:16.744 ServerApp] jupyter_server_terminals | extension was successfully linked.
|
4 |
+
[I 2025-02-06 14:30:16.749 ServerApp] jupyterlab | extension was successfully linked.
|
5 |
+
[W 2025-02-06 14:30:16.751 JupyterNotebookApp] 'password' has moved from NotebookApp to ServerApp. This config will be passed to ServerApp. Be sure to update your config before our next release.
|
6 |
+
[W 2025-02-06 14:30:16.754 ServerApp] ServerApp.password config is deprecated in 2.0. Use PasswordIdentityProvider.hashed_password.
|
7 |
+
[I 2025-02-06 14:30:16.754 ServerApp] notebook | extension was successfully linked.
|
8 |
+
[I 2025-02-06 14:30:17.430 ServerApp] notebook_shim | extension was successfully linked.
|
9 |
+
[I 2025-02-06 14:30:17.804 ServerApp] notebook_shim | extension was successfully loaded.
|
10 |
+
[I 2025-02-06 14:30:17.807 ServerApp] jupyter_lsp | extension was successfully loaded.
|
11 |
+
[I 2025-02-06 14:30:17.808 ServerApp] jupyter_server_terminals | extension was successfully loaded.
|
12 |
+
[I 2025-02-06 14:30:18.042 LabApp] JupyterLab extension loaded from /home/chaeyun/.conda/envs/risall/lib/python3.9/site-packages/jupyterlab
|
13 |
+
[I 2025-02-06 14:30:18.042 LabApp] JupyterLab application directory is /data/conda_envs/chaeyun/envs/risall/share/jupyter/lab
|
14 |
+
[I 2025-02-06 14:30:18.053 LabApp] Extension Manager is 'pypi'.
|
15 |
+
[I 2025-02-06 14:30:18.386 ServerApp] jupyterlab | extension was successfully loaded.
|
16 |
+
[I 2025-02-06 14:30:18.394 ServerApp] notebook | extension was successfully loaded.
|
17 |
+
[I 2025-02-06 14:30:18.395 ServerApp] Serving notebooks from local directory: /data/projects/yejin/VerbCentric_RIS/ReferFormer
|
18 |
+
[I 2025-02-06 14:30:18.395 ServerApp] Jupyter Server 2.15.0 is running at:
|
19 |
+
[I 2025-02-06 14:30:18.395 ServerApp] http://localhost:5727/tree
|
20 |
+
[I 2025-02-06 14:30:18.395 ServerApp] http://127.0.0.1:5727/tree
|
21 |
+
[I 2025-02-06 14:30:18.395 ServerApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
|
22 |
+
[I 2025-02-06 14:30:18.574 ServerApp] Skipped non-installed server(s): bash-language-server, dockerfile-language-server-nodejs, javascript-typescript-langserver, jedi-language-server, julia-language-server, pyright, python-language-server, python-lsp-server, r-languageserver, sql-language-server, texlab, typescript-language-server, unified-language-server, vscode-css-languageserver-bin, vscode-html-languageserver-bin, vscode-json-languageserver-bin, yaml-language-server
|
23 |
+
[W 2025-02-06 14:31:50.823 ServerApp] 404 GET /hub/api (@::1) 179.74ms referer=None
|
24 |
+
[I 2025-02-06 14:31:50.827 JupyterNotebookApp] 302 GET /tree? (@::1) 0.51ms
|
25 |
+
[I 2025-02-06 14:31:53.423 ServerApp] User 09e1c030b1ec4bb68957ab993d4377f9 logged in.
|
26 |
+
[I 2025-02-06 14:31:53.423 ServerApp] 302 POST /login? (09e1c030b1ec4bb68957ab993d4377f9@::1) 1.16ms
|
27 |
+
[I 2025-02-06 14:32:05.968 ServerApp] Creating new notebook in
|
28 |
+
[I 2025-02-06 14:32:06.446 ServerApp] Kernel started: 5d7b2dc4-2827-441f-b000-c315d487a88b
|
29 |
+
[W 2025-02-06 14:32:06.458 ServerApp] delete /gpt_ref-ytvos_numbered_cy-jvsc-01c75e0b-00a9-46d9-9ef5-cef387099deb810a8dd4-69a4-4831-a7c1-98121c4af797.ipynb
|
30 |
+
[I 2025-02-06 14:32:07.994 ServerApp] Connecting to kernel 5d7b2dc4-2827-441f-b000-c315d487a88b.
|
31 |
+
[W 2025-02-06 14:32:08.754 ServerApp] 404 GET /nbextensions/viewer/extension.js (@::1) 1.43ms referer=None
|
32 |
+
[W 2025-02-06 14:32:08.755 ServerApp] 404 GET /nbextensions/jupyter-js-widgets/extension.js (@::1) 1.86ms referer=None
|
33 |
+
[I 2025-02-06 14:32:45.217 ServerApp] Creating new notebook in
|
34 |
+
[I 2025-02-06 14:32:45.480 ServerApp] Kernel started: 72239f5d-4d18-4a03-8eed-3375f161975b
|
35 |
+
[W 2025-02-06 14:32:45.492 ServerApp] delete /check_image_numbered_cy-jvsc-9867d5b1-af20-4268-8b58-531863f46cb2f672a8a4-bc6f-48b5-923c-58bb212ea0d7.ipynb
|
36 |
+
[I 2025-02-06 14:32:46.166 ServerApp] Connecting to kernel 72239f5d-4d18-4a03-8eed-3375f161975b.
|
37 |
+
[W 2025-02-06 14:32:47.092 ServerApp] 404 GET /nbextensions/jupyter-js-widgets/extension.js (@::1) 1.11ms referer=None
|
38 |
+
[W 2025-02-06 14:32:47.093 ServerApp] 404 GET /nbextensions/viewer/extension.js (@::1) 0.84ms referer=None
|
39 |
+
[I 2025-02-06 14:38:36.722 ServerApp] Kernel interrupted: 5d7b2dc4-2827-441f-b000-c315d487a88b
|
40 |
+
[W 2025-02-06 15:27:22.906 ServerApp] 404 GET /hub/api (@::1) 332.79ms referer=None
|
41 |
+
[I 2025-02-06 15:27:23.033 JupyterNotebookApp] 302 GET /tree? (@::1) 0.64ms
|
42 |
+
[I 2025-02-06 15:27:25.982 ServerApp] User 9c560d44658d478aa5d6decbf8541260 logged in.
|
43 |
+
[I 2025-02-06 15:27:25.983 ServerApp] 302 POST /login? (9c560d44658d478aa5d6decbf8541260@::1) 1.08ms
|
44 |
+
[I 2025-02-06 15:32:06.555 ServerApp] Creating new notebook in
|
45 |
+
[I 2025-02-06 15:32:11.934 ServerApp] Kernel started: 97e74450-8dc0-4ea7-b396-deccdfc0a23f
|
46 |
+
[W 2025-02-06 15:32:11.955 ServerApp] delete /check_image_numbered_cy-jvsc-1a1215f3-c818-462e-a439-92c1dcbe474e70d6c3f2-c47b-4d0e-adef-589dd6523fcc.ipynb
|
47 |
+
[I 2025-02-06 15:32:15.473 ServerApp] Connecting to kernel 97e74450-8dc0-4ea7-b396-deccdfc0a23f.
|
48 |
+
[I 2025-02-06 15:32:15.678 ServerApp] Starting buffering for 5d7b2dc4-2827-441f-b000-c315d487a88b:2db8540d-faed-4333-80b7-7fde202eaafd
|
49 |
+
[I 2025-02-06 15:32:15.679 ServerApp] Starting buffering for 72239f5d-4d18-4a03-8eed-3375f161975b:848b127a-dce3-4988-9ab9-aa3fc5535255
|
50 |
+
[W 2025-02-06 15:43:43.450 ServerApp] 404 GET /hub/api (@::1) 172.61ms referer=None
|
51 |
+
[I 2025-02-06 15:43:43.455 JupyterNotebookApp] 302 GET /tree? (@::1) 0.67ms
|
52 |
+
[I 2025-02-06 15:43:46.430 ServerApp] User 49ee40ca9dfd47c7b4fd4bf6b592f8d0 logged in.
|
53 |
+
[I 2025-02-06 15:43:46.430 ServerApp] 302 POST /login? (49ee40ca9dfd47c7b4fd4bf6b592f8d0@::1) 1.25ms
|
54 |
+
[I 2025-02-06 15:45:30.966 ServerApp] Creating new notebook in
|
55 |
+
[I 2025-02-06 15:45:31.635 ServerApp] Kernel started: dde1cca7-f769-4156-a1a0-1303a7fb5ba5
|
56 |
+
[W 2025-02-06 15:45:31.654 ServerApp] delete /CRIS_hp_check_cygsds-jvsc-c01fd7dc-3a7d-48dc-b410-e6b5e4207a574d09512b-5033-42ea-91e2-034fca8b2587.ipynb
|
57 |
+
[I 2025-02-06 15:45:32.747 ServerApp] Connecting to kernel dde1cca7-f769-4156-a1a0-1303a7fb5ba5.
|
58 |
+
[W 2025-02-06 15:45:33.602 ServerApp] 404 GET /nbextensions/viewer/extension.js (@::1) 2.10ms referer=None
|
59 |
+
[W 2025-02-06 15:45:33.603 ServerApp] 404 GET /nbextensions/jupyter-js-widgets/extension.js (@::1) 2.54ms referer=None
|
60 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
61 |
+
To disable this warning, you can either:
|
62 |
+
- Avoid using `tokenizers` before the fork if possible
|
63 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
64 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
65 |
+
To disable this warning, you can either:
|
66 |
+
- Avoid using `tokenizers` before the fork if possible
|
67 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
68 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
69 |
+
To disable this warning, you can either:
|
70 |
+
- Avoid using `tokenizers` before the fork if possible
|
71 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
72 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
73 |
+
To disable this warning, you can either:
|
74 |
+
- Avoid using `tokenizers` before the fork if possible
|
75 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
76 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
77 |
+
To disable this warning, you can either:
|
78 |
+
- Avoid using `tokenizers` before the fork if possible
|
79 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
80 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
81 |
+
To disable this warning, you can either:
|
82 |
+
- Avoid using `tokenizers` before the fork if possible
|
83 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
84 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
85 |
+
To disable this warning, you can either:
|
86 |
+
- Avoid using `tokenizers` before the fork if possible
|
87 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
88 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
89 |
+
To disable this warning, you can either:
|
90 |
+
- Avoid using `tokenizers` before the fork if possible
|
91 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
92 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
93 |
+
To disable this warning, you can either:
|
94 |
+
- Avoid using `tokenizers` before the fork if possible
|
95 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
96 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
97 |
+
To disable this warning, you can either:
|
98 |
+
- Avoid using `tokenizers` before the fork if possible
|
99 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
100 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
101 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
102 |
+
To disable this warning, you can either:
|
103 |
+
- Avoid using `tokenizers` before the fork if possible
|
104 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
105 |
+
To disable this warning, you can either:
|
106 |
+
- Avoid using `tokenizers` before the fork if possible
|
107 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
108 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
109 |
+
To disable this warning, you can either:
|
110 |
+
- Avoid using `tokenizers` before the fork if possible
|
111 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
112 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
113 |
+
To disable this warning, you can either:
|
114 |
+
- Avoid using `tokenizers` before the fork if possible
|
115 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
116 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
117 |
+
To disable this warning, you can either:
|
118 |
+
- Avoid using `tokenizers` before the fork if possible
|
119 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
120 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
121 |
+
To disable this warning, you can either:
|
122 |
+
- Avoid using `tokenizers` before the fork if possible
|
123 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
124 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
125 |
+
To disable this warning, you can either:
|
126 |
+
- Avoid using `tokenizers` before the fork if possible
|
127 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
128 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
129 |
+
To disable this warning, you can either:
|
130 |
+
- Avoid using `tokenizers` before the fork if possible
|
131 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
132 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
133 |
+
To disable this warning, you can either:
|
134 |
+
- Avoid using `tokenizers` before the fork if possible
|
135 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
136 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
137 |
+
To disable this warning, you can either:
|
138 |
+
- Avoid using `tokenizers` before the fork if possible
|
139 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
140 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
141 |
+
To disable this warning, you can either:
|
142 |
+
- Avoid using `tokenizers` before the fork if possible
|
143 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
144 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
145 |
+
To disable this warning, you can either:
|
146 |
+
- Avoid using `tokenizers` before the fork if possible
|
147 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
148 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
149 |
+
To disable this warning, you can either:
|
150 |
+
- Avoid using `tokenizers` before the fork if possible
|
151 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
152 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
153 |
+
To disable this warning, you can either:
|
154 |
+
- Avoid using `tokenizers` before the fork if possible
|
155 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
156 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
157 |
+
To disable this warning, you can either:
|
158 |
+
- Avoid using `tokenizers` before the fork if possible
|
159 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
160 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
161 |
+
To disable this warning, you can either:
|
162 |
+
- Avoid using `tokenizers` before the fork if possible
|
163 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
164 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
165 |
+
To disable this warning, you can either:
|
166 |
+
- Avoid using `tokenizers` before the fork if possible
|
167 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
168 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
169 |
+
To disable this warning, you can either:
|
170 |
+
- Avoid using `tokenizers` before the fork if possible
|
171 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
172 |
+
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
|
173 |
+
To disable this warning, you can either:
|
174 |
+
- Avoid using `tokenizers` before the fork if possible
|
175 |
+
- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
|
176 |
+
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
|
177 |
+
slurmstepd-node05: error: *** STEP 7716.0 ON node05 CANCELLED AT 2025-02-06T16:54:16 ***
|
178 |
+
slurmstepd-node05: error: *** JOB 7716 ON node05 CANCELLED AT 2025-02-06T16:54:16 ***
|
179 |
+
[C 2025-02-06 16:54:16.389 ServerApp] received signal 15, stopping
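The repeated `huggingface/tokenizers` fork warning in the log above can be silenced by setting `TOKENIZERS_PARALLELISM` before any worker processes are forked; a minimal Python sketch (the variable name comes from the warning itself, not from this repo's code):
```
import os

# Set before creating DataLoader workers or any other fork; use "true" instead
# if you explicitly want parallel tokenization.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
```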
|
make_ref-ytvos/annotate_ref_ytvos.py
ADDED
@@ -0,0 +1,288 @@
1 |
+
from datasets import build_dataset
|
2 |
+
import argparse
|
3 |
+
import opts
|
4 |
+
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
from os import path as osp
|
8 |
+
import io
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import pandas as pd
|
12 |
+
import regex as re
|
13 |
+
import json
|
14 |
+
|
15 |
+
import cv2
|
16 |
+
from PIL import Image
|
17 |
+
import torch
|
18 |
+
from torchvision.transforms import functional as F
|
19 |
+
|
20 |
+
from skimage import measure # (pip install scikit-image)
|
21 |
+
from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
|
22 |
+
|
23 |
+
import matplotlib.pyplot as plt
|
24 |
+
from matplotlib.collections import PatchCollection
|
25 |
+
from matplotlib.patches import Rectangle
|
26 |
+
|
27 |
+
import ipywidgets as widgets
|
28 |
+
from IPython.display import display, clear_output
|
29 |
+
|
30 |
+
parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
|
31 |
+
args = parser.parse_args()
|
32 |
+
|
33 |
+
#================== Load data ===================
|
34 |
+
# Full dataset
|
35 |
+
train_dataset = build_dataset('ytvos', image_set = 'train', args = args)
|
36 |
+
|
37 |
+
# Metadata for the full dataset
|
38 |
+
metas = train_dataset.metas
|
39 |
+
|
40 |
+
# Filtered (selected) frames
|
41 |
+
selected_frames_df = pd.read_json("selected_frames4.jsonl", lines = True)
|
42 |
+
|
43 |
+
#================== Mask construction helpers ===================
|
44 |
+
def prepare_mask_for_pil(mask_tensor):
|
45 |
+
mask_array = mask_tensor.squeeze(0).cpu().numpy()
|
46 |
+
mask_array = (mask_array * 255).astype(np.uint8)
|
47 |
+
mask_image = Image.fromarray(mask_array)
|
48 |
+
return mask_image
|
49 |
+
|
50 |
+
def create_sub_masks(mask_image):
|
51 |
+
width, height = mask_image.size
|
52 |
+
|
53 |
+
sub_masks = {}
|
54 |
+
for x in range(width):
|
55 |
+
for y in range(height):
|
56 |
+
# Get the RGB values of the pixel
|
57 |
+
pixel = mask_image.getpixel((x, y))
|
58 |
+
|
59 |
+
# If the pixel is not black...
|
60 |
+
if pixel != 0 :
|
61 |
+
# Check to see if we've created a sub-mask...
|
62 |
+
pixel_str = str(pixel)
|
63 |
+
sub_mask = sub_masks.get(pixel_str)
|
64 |
+
if sub_mask is None:
|
65 |
+
# Create a sub-mask (one bit per pixel) and add to the dictionary
|
66 |
+
# Note: we add 1 pixel of padding in each direction
|
67 |
+
# because the contours module doesn't handle cases
|
68 |
+
# where pixels bleed to the edge of the image
|
69 |
+
sub_masks[pixel_str] = Image.new('1', (width+2, height+2))
|
70 |
+
|
71 |
+
# Set the pixel value to 1 (default is 0), accounting for padding
|
72 |
+
sub_masks[pixel_str].putpixel((x+1, y+1), 1)
|
73 |
+
return sub_masks
|
74 |
+
|
75 |
+
#================== Mask annotation helper ===================
|
76 |
+
def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
|
77 |
+
# Find contours (boundary lines) around each sub-mask
|
78 |
+
# Note: there could be multiple contours if the object
|
79 |
+
# is partially occluded. (E.g. an elephant behind a tree)
|
80 |
+
contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
|
81 |
+
|
82 |
+
segmentations = []
|
83 |
+
polygons = []
|
84 |
+
for contour in contours:
|
85 |
+
# Flip from (row, col) representation to (x, y)
|
86 |
+
# and subtract the padding pixel
|
87 |
+
for i in range(len(contour)):
|
88 |
+
row, col = contour[i]
|
89 |
+
contour[i] = (col - 1, row - 1)
|
90 |
+
|
91 |
+
# Make a polygon and simplify it
|
92 |
+
poly = Polygon(contour)
|
93 |
+
poly = poly.simplify(1.0, preserve_topology=False)
|
94 |
+
polygons.append(poly)
|
95 |
+
segmentation = np.array(poly.exterior.coords).ravel().tolist()
|
96 |
+
segmentations.append(segmentation)
|
97 |
+
|
98 |
+
# Combine the polygons to calculate the bounding box and area
|
99 |
+
multi_poly = MultiPolygon(polygons)
|
100 |
+
x, y, max_x, max_y = multi_poly.bounds
|
101 |
+
width = max_x - x
|
102 |
+
height = max_y - y
|
103 |
+
bbox = (x, y, width, height)
|
104 |
+
area = multi_poly.area
|
105 |
+
|
106 |
+
annotation = {
|
107 |
+
'segmentation': segmentations,
|
108 |
+
'iscrowd': is_crowd,
|
109 |
+
'image_id': image_id,
|
110 |
+
'id': annotation_id,
|
111 |
+
'bbox': bbox,
|
112 |
+
'area': area
|
113 |
+
}
|
114 |
+
return annotation
|
115 |
+
|
116 |
+
#================== Visualization helper ===================
|
117 |
+
# annotation dictionary as input
|
118 |
+
def showRef(annotation, image_dir, seg_box='seg'):
|
119 |
+
ax = plt.gca()
|
120 |
+
I = io.imread(osp.join(image_dir, annotation['file_name']))
|
121 |
+
ax.imshow(I)
|
122 |
+
|
123 |
+
|
124 |
+
for sid, sent in enumerate(annotation['sentences']):
|
125 |
+
print('%s. %s' % (sid + 1, sent))
|
126 |
+
|
127 |
+
if seg_box == 'seg':
|
128 |
+
polygons = []
|
129 |
+
color = []
|
130 |
+
c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
|
131 |
+
|
132 |
+
if type(annotation['segmentation'][0]) == list:
|
133 |
+
# polygon used for refcoco*
|
134 |
+
for seg in annotation['segmentation']:
|
135 |
+
poly = np.array(seg).reshape((int(len(seg) / 2), 2))
|
136 |
+
polygons.append(Polygon(poly))
|
137 |
+
color.append(c)
|
138 |
+
|
139 |
+
p = PatchCollection(polygons,
|
140 |
+
facecolors=(221/255, 160/255, 221/255),  # light purple
|
141 |
+
linewidths=0,
|
142 |
+
alpha=0.4)
|
143 |
+
ax.add_collection(p)
|
144 |
+
|
145 |
+
p = PatchCollection(polygons,
|
146 |
+
facecolors='none',
|
147 |
+
edgecolors=color,
|
148 |
+
linewidths=2)
|
149 |
+
ax.add_collection(p)
|
150 |
+
# else:
|
151 |
+
# # mask used for refclef
|
152 |
+
# rle = annotation['segmentation']
|
153 |
+
# m = mask.decode(rle)
|
154 |
+
# img = np.ones((m.shape[0], m.shape[1], 3))
|
155 |
+
# color_mask = np.array([2.0, 166.0, 101.0]) / 255
|
156 |
+
# for i in range(3):
|
157 |
+
# img[:, :, i] = color_mask[i]
|
158 |
+
# ax.imshow(np.dstack((img, m * 0.5)))
|
159 |
+
|
160 |
+
# bounding box
|
161 |
+
elif seg_box == 'box':
|
162 |
+
bbox = annotation['bbox']
|
163 |
+
box_plot = Rectangle((bbox[0], bbox[1]),
|
164 |
+
bbox[2],
|
165 |
+
bbox[3],
|
166 |
+
fill=False,
|
167 |
+
edgecolor='green',
|
168 |
+
linewidth=3)
|
169 |
+
ax.add_patch(box_plot)
|
170 |
+
|
171 |
+
#================== Putting it all together ===================
|
172 |
+
def create_dict_from_selected_images(selected_frames_df):
|
173 |
+
|
174 |
+
image_id = 0
|
175 |
+
anno_id = 0
|
176 |
+
train_idx = 0
|
177 |
+
|
178 |
+
with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
|
179 |
+
|
180 |
+
for selected_idx in range(len(selected_frames_df)):
|
181 |
+
selected = selected_frames_df.loc[selected_idx]
|
182 |
+
selected_vid_id = selected['video']
|
183 |
+
selected_frame_id = selected['frame_id']
|
184 |
+
|
185 |
+
for obj_id in selected['objects'].keys():
|
186 |
+
|
187 |
+
selected_exp = selected['objects'][obj_id][0]  # caption
|
188 |
+
selected_verb = selected['objects'][obj_id][1]  # verb
|
189 |
+
|
190 |
+
train_idx = next(
|
191 |
+
idx for idx, meta in enumerate(metas)
|
192 |
+
if meta['video'] == selected_vid_id
|
193 |
+
and meta['frame_id'] == selected_frame_id
|
194 |
+
and meta['obj_id'] == int(obj_id)
|
195 |
+
and meta['exp'] == selected_exp
|
196 |
+
)
|
197 |
+
|
198 |
+
train_frames, train_info = train_dataset[train_idx]
|
199 |
+
|
200 |
+
try:
|
201 |
+
valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id)  # index of the valid (selected) frame
|
202 |
+
except ValueError:
|
203 |
+
print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
|
204 |
+
|
205 |
+
|
206 |
+
frame = train_frames[valid_frame_loc]  # the corresponding frame
|
207 |
+
frame = F.to_pil_image(frame)
|
208 |
+
|
209 |
+
image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
|
210 |
+
|
211 |
+
# Save the original frame
|
212 |
+
save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
|
213 |
+
#save_dir.mkdir(exist_ok=True)
|
214 |
+
save_path = save_dir / f"{image_file_name}.png"
|
215 |
+
#frame.save(save_path)
|
216 |
+
|
217 |
+
# Category
|
218 |
+
label = train_info['labels'][valid_frame_loc].item() #category id
|
219 |
+
category_name = metas[train_idx]['category'] #category name
|
220 |
+
|
221 |
+
# Box information
|
222 |
+
box = train_info['boxes'][valid_frame_loc]
|
223 |
+
|
224 |
+
# Annotation tools ########################################################################
|
225 |
+
mask = train_info['masks'][valid_frame_loc]
|
226 |
+
# print(mask.shape)
|
227 |
+
|
228 |
+
# Only to check that the frame and mask match
|
229 |
+
# plt.imshow(frame.permute(1, 2, 0))
|
230 |
+
# mask_color = np.zeros((*mask.shape, 3), dtype = np.uint8)
|
231 |
+
# mask_color[mask == 1] = [255, 0, 0]
|
232 |
+
# plt.imshow(mask_color, alpha = 0.5)
|
233 |
+
# plt.show()
|
234 |
+
|
235 |
+
|
236 |
+
mask_image = prepare_mask_for_pil(mask)
|
237 |
+
sub_masks = create_sub_masks(mask_image)
|
238 |
+
|
239 |
+
for color, sub_mask in sub_masks.items():
|
240 |
+
# print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
|
241 |
+
sub_mask_array = np.array(sub_mask, dtype=np.uint8)
|
242 |
+
annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd = 0)
|
243 |
+
anno_id += 1
|
244 |
+
image_id += 1
|
245 |
+
|
246 |
+
#파일 경로 추가
|
247 |
+
annotation['file_name'] = f"{image_file_name}.png"
|
248 |
+
|
249 |
+
#불필요한 정보 지우기
|
250 |
+
annotation.pop('iscrowd', None)
|
251 |
+
annotation.pop('image_id', None)
|
252 |
+
annotation.pop('id', None)
|
253 |
+
|
254 |
+
valid = train_info['valid'][valid_frame_loc]
|
255 |
+
orig_size = train_info['orig_size']
|
256 |
+
size = train_info['size']
|
257 |
+
caption = metas[train_idx]['exp']
|
258 |
+
|
259 |
+
#filename, height, width 추가
|
260 |
+
#annotation['file_name'] = save_path
|
261 |
+
annotation['height'] = orig_size[0].item()
|
262 |
+
annotation['width'] = orig_size[1].item()
|
263 |
+
|
264 |
+
# category id,name, sentence dictionary 추가
|
265 |
+
annotation['label'] = label
|
266 |
+
annotation['category_name'] = category_name
|
267 |
+
sentence_dict = {
|
268 |
+
"tokens" : caption.split(' '),
|
269 |
+
"raw" : caption,
|
270 |
+
"sent" : re.sub('[^A-Za-z0-9\s]+', '', caption.lower())
|
271 |
+
}
|
272 |
+
annotation['sentences'] = sentence_dict
|
273 |
+
############################################################################################
|
274 |
+
# double check for segmentation annotation
|
275 |
+
# orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
|
276 |
+
# plt.imshow(orig_img_np)
|
277 |
+
# plt.axis('off')
|
278 |
+
# plt.show()
|
279 |
+
|
280 |
+
# showRef(annotation, save_dir)
|
281 |
+
############################################################################################
|
282 |
+
|
283 |
+
# 최종
|
284 |
+
f.write(json.dumps(annotation) + "\n")
|
285 |
+
f.flush()
|
286 |
+
|
287 |
+
# if __name__ == '__main__':
|
288 |
+
# create_dict_from_selected_images(selected_frames_df)
|
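For reference, a minimal sketch (not part of the diff) of how the selected_instances2.jsonl written by create_dict_from_selected_images above could be read back; the field names follow the keys assigned in the loop, and reading the first record is only an example:

import json

# load every per-object annotation written by create_dict_from_selected_images
records = []
with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl") as f:
    for line in f:
        records.append(json.loads(line))

# inspect one record: image file, size, category and the referring sentence
r = records[0]
print(r['file_name'], r['height'], r['width'], r['category_name'], r['sentences']['sent'])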
make_ref-ytvos/folder2lmdb.py
ADDED
@@ -0,0 +1,109 @@
+import argparse
+import os
+import os.path as osp
+import lmdb
+from PIL import Image
+import pyarrow as pa
+import json
+from tqdm import tqdm
+import warnings
+warnings.filterwarnings("ignore")
+
+
+def loads_pyarrow(buf):
+    """
+    Args:
+        buf: the output of `dumps`.
+    """
+    return pa.deserialize(buf)
+
+
+def raw_reader(path):
+    with open(path, 'rb') as f:
+        bin_data = f.read()
+    return bin_data
+
+
+def dumps_pyarrow(obj):
+    """
+    Serialize an object.
+    Returns:
+        Implementation-dependent bytes-like object
+    """
+    return pa.serialize(obj).to_buffer()
+
+
+def folder2lmdb(json_data, img_dir, mask_dir, output_dir, split, write_frequency=1000):
+    lmdb_path = osp.join(output_dir, "%s.lmdb" % split)
+    isdir = os.path.isdir(lmdb_path)
+
+    print("Generate LMDB to %s" % lmdb_path)
+    db = lmdb.open(lmdb_path, subdir=isdir,
+                   map_size=1099511627776 * 2, readonly=False,
+                   meminit=False, map_async=True)
+
+    txn = db.begin(write=True)
+    tbar = tqdm(json_data)
+    for idx, item in enumerate(tbar):
+        img = raw_reader(osp.join(img_dir, item['file_name']))
+        mask = raw_reader(osp.join(mask_dir, f"{idx}.png"))
+
+        # check file sizes with Pillow
+        #with Image.open(osp.join(img_dir, item['file_name'])) as im:
+            #print(f"Image size (Pillow): {im.size}")
+        #with Image.open(osp.join(mask_dir, item['file_name'])) as mk:
+            #print(f"Mask size (Pillow): {mk.size}")
+
+        data = {'img': img, 'mask': mask, 'cat': item['category_name'],
+                'seg_id': idx, 'file_name': item['file_name'],
+                'num_sents': 1, 'sents': item['sentences']['sent']}
+        txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow(data))
+        if idx % write_frequency == 0:
+            # print("[%d/%d]" % (idx, len(data_loader)))
+            txn.commit()
+            txn = db.begin(write=True)
+
+    # finish iterating through dataset
+    txn.commit()
+    keys = [u'{}'.format(k).encode('ascii') for k in range(idx + 1)]
+    with db.begin(write=True) as txn:
+        txn.put(b'__keys__', dumps_pyarrow(keys))
+        txn.put(b'__len__', dumps_pyarrow(len(keys)))
+
+    print("Flushing database ...")
+    db.sync()
+    db.close()
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='COCO Folder to LMDB.')
+    parser.add_argument('-j', '--json-dir', type=str,
+                        default='',
+                        help='the name of json file.')
+    parser.add_argument('-i', '--img-dir', type=str,
+                        default='refcoco+',
+                        help='the folder of images.')
+    parser.add_argument('-m', '--mask-dir', type=str,
+                        default='refcoco+',
+                        help='the folder of masks.')
+    parser.add_argument('-o', '--output-dir', type=str,
+                        default='refcoco+',
+                        help='the folder of output lmdb file.')
+    parser.add_argument('-s', '--split', type=str,
+                        default='train',
+                        help='the split type.')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    args.split = osp.basename(args.json_dir).split(".")[0]
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    json_data = []
+    with open(args.json_dir, 'rb') as f:
+        for line in f:
+            json_data.append(json.loads(line))
+
+    folder2lmdb(json_data, args.img_dir, args.mask_dir, args.output_dir, args.split)
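A hedged usage sketch (not part of the diff; the file and folder names below are examples) for folder2lmdb.py: build the LMDB from the JSONL and image/mask folders produced by the annotation script, then read one record back. pa.serialize/pa.deserialize are legacy pyarrow APIs, so reading must use the same pyarrow version that wrote the database:

# build: python folder2lmdb.py -j selected_instances2.jsonl -i selected_frames -m selected_masks -o lmdb_out
# (args.split becomes "selected_instances2", so the output is lmdb_out/selected_instances2.lmdb)

import lmdb
import pyarrow as pa

# the writer creates a single .lmdb file on the first run, hence subdir=False here
env = lmdb.open("lmdb_out/selected_instances2.lmdb", subdir=False, readonly=True, lock=False)
with env.begin(write=False) as txn:
    keys = pa.deserialize(txn.get(b'__keys__'))    # key list written by folder2lmdb()
    sample = pa.deserialize(txn.get(keys[0]))      # dict with 'img', 'mask', 'cat', 'sents', ...
    print(len(keys), sample['file_name'], sample['sents'])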
make_ref-ytvos/manual_selected_frames.jsonl
ADDED
@@ -0,0 +1,101 @@
+{"index": 0, "new_sent": ""}
+{"index": 4, "new_sent": "a zebra walking away from camera"}
+{"index": 12, "new_sent": "the panda has his hand on another pandas back"}
+{"index": 13, "new_sent": "the panda is standing on the rocks"}
+{"index": 17, "new_sent": "the panda fell down on his back"}
+{"index": 28, "new_sent": "a sheep lying down"}
+{"index": 31, "new_sent": "a sheep lying down and getting spider legs"}
+{"index": 40, "new_sent": ""}
+{"index": 41, "new_sent": ""}
+{"index": 48, "new_sent": "man using his hands next to an inside tree"}
+{"index": 52, "new_sent": ""}
+{"index": 55, "new_sent": ""}
+{"index": 57, "new_sent": "a monkey hugging another monkey"}
+{"index": 76, "new_sent": "an ape seated and breastfeeding while another ape plays nearby"}
+{"index": 77, "new_sent": "an ape playing near a nursing ape"}
+{"index": 78, "new_sent": "an ape is laying on the chest of another ape sitting on the dirt"}
+{"index": 172, "new_sent": "person standing on stage and using a microphone"}
+{"index": 173, "new_sent": "person sitting on stage playing a piano"}
+{"index": 196, "new_sent": "a monkey eating some fruit"}
+{"index": 197, "new_sent": "a monkey sitting while watching another monkey eat"}
+{"index": 244, "new_sent": ""}
+{"index": 270, "new_sent": "a turtle in water while another follows"}
+{"index": 271, "new_sent": ""}
+{"index": 299, "new_sent": "a duck stretching out its neck"}
+{"index": 326, "new_sent": ""}
+{"index": 327, "new_sent": ""}
+{"index": 388, "new_sent": ""}
+{"index": 389, "new_sent": "a lizard putting its head under a branch"}
+{"index": 409, "new_sent": ""}
+{"index": 410, "new_sent": "a raccoon standing and attacking another raccoon"}
+{"index": 415, "new_sent": ""}
+{"index": 416, "new_sent": ""}
+{"index": 417, "new_sent": "a person taking a picture"}
+{"index": 428, "new_sent": "a panda laying under another panda"}
+{"index": 429, "new_sent": "a panda standing and playing with another panda"}
+{"index": 447, "new_sent": "a panda playing and rolling over on the ground"}
+{"index": 448, "new_sent": "a panda sitting and looking at another panda"}
+{"index": 451, "new_sent": ""}
+{"index": 495, "new_sent": "a lion sitting in front of a lion thats playing with a man"}
+{"index": 509, "new_sent": ""}
+{"index": 510, "new_sent": ""}
+{"index": 517, "new_sent": "a person squatting and looking at a skateboarder perform"}
+{"index": 518, "new_sent": ""}
+{"index": 528, "new_sent": "a person doing a hand stand"}
+{"index": 559, "new_sent": "a dog holding up his head"}
+{"index": 560, "new_sent": "a dog smelling the ground"}
+{"index": 561, "new_sent": ""}
+{"index": 562, "new_sent": ""}
+{"index": 569, "new_sent": ""}
+{"index": 570, "new_sent": ""}
+{"index": 594, "new_sent": "a mouse sitting under a wheel while another runs around"}
+{"index": 595, "new_sent": ""}
+{"index": 617, "new_sent": "a monkey moving underneath another monkey"}
+{"index": 618, "new_sent": "a monkey laying on the ground with its arm over another monkey"}
+{"index": 634, "new_sent": "ape laying under another ape"}
+{"index": 644, "new_sent": "ape reaching out his arms and legs"}
+{"index": 645, "new_sent": ""}
+{"index": 646, "new_sent": "a person standing with his arms crossed in a room with others"}
+{"index": 654, "new_sent": ""}
+{"index": 659, "new_sent": "a giraffe eating hay"}
+{"index": 662, "new_sent": "a penguin laying on its belly playing with another penguin"}
+{"index": 673, "new_sent": "a penguin moving on its belly"}
+{"index": 720, "new_sent": "a person riding a surfboard on a wave in front of other surfer"}
+{"index": 722, "new_sent": "a person laying on surf board"}
+{"index": 725, "new_sent": "person swimming away"}
+{"index": 735, "new_sent": "person mounting a cow"}
+{"index": 738, "new_sent": "person walking towards a cow"}
+{"index": 741, "new_sent": "person riding a cow"}
+{"index": 747, "new_sent": "person holding out his right arm"}
+{"index": 764, "new_sent": "a grey duck facing away"}
+{"index": 765, "new_sent": ""}
+{"index": 766, "new_sent": "a grey duck eating bread"}
+{"index": 816, "new_sent": "a person raising his arms and flying with another person"}
+{"index": 821, "new_sent": "person holding on to the belt and screaming"}
+{"index": 824, "new_sent": "person smiling"}
+{"index": 828, "new_sent": "person holding another persons arms"}
+{"index": 831, "new_sent": ""}
+{"index": 832, "new_sent": ""}
+{"index": 904, "new_sent": ""}
+{"index": 914, "new_sent": "elephant walking away from the camera"}
+{"index": 918, "new_sent": "a person riding a horse"}
+{"index": 919, "new_sent": "a person opening a gate"}
+{"index": 931, "new_sent": "person resting hand on a tree limb"}
+{"index": 932, "new_sent": "person trying to feed a small animal"}
+{"index": 993, "new_sent": "horse leading the way in the water"}
+{"index": 994, "new_sent": "horse following another horse"}
+{"index": 1049, "new_sent": "person fell from a bull"}
+{"index": 1051, "new_sent": "person getting up from the ground"}
+{"index": 1052, "new_sent": "person standing and running away from a bull"}
+{"index": 1054, "new_sent": "person squatting down"}
+{"index": 1096, "new_sent": "an ape crossing his arms"}
+{"index": 1097, "new_sent": ""}
+{"index": 1115, "new_sent": "a man trying to catch a frisbee"}
+{"index": 1137, "new_sent": ""}
+{"index": 1138, "new_sent": "the monkey leaning over the fence facing the bike"}
+{"index": 1139, "new_sent": ""}
+{"index": 1147, "new_sent": "the monkey hanging backwards"}
+{"index": 1173, "new_sent": ""}
+{"index": 1174, "new_sent": "a horse leading the way"}
+{"index": 1184, "new_sent": ""}
+{"index": 1194, "new_sent": "person milking a cow"}
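Below, a small sketch (not part of the diff, and the interpretation is an assumption) of how manual_selected_frames.jsonl could be consumed: each line carries an index into the previously selected instances plus a manually rewritten caption, where an empty new_sent is read here as "drop this entry":

import json

revisions, drops = {}, []
with open("make_ref-ytvos/manual_selected_frames.jsonl") as f:
    for line in f:
        item = json.loads(line)
        if item["new_sent"]:                     # non-empty: replacement caption
            revisions[item["index"]] = item["new_sent"]
        else:                                    # empty: flagged for removal (assumption)
            drops.append(item["index"])

print(f"{len(revisions)} revised captions, {len(drops)} flagged entries")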
make_ref-ytvos/review_images.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
make_ref-ytvos/revised_frames.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
make_ref-ytvos/selected_frames.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
mbench/result.json
ADDED
@@ -0,0 +1,465 @@
1 |
+
{
|
2 |
+
"8056117b89": null,
|
3 |
+
"30085a2cc6": {
|
4 |
+
"00020": {
|
5 |
+
"1": {
|
6 |
+
"ref_exp": "The elephant walks on four legs, with its trunk lowered and touching the ground.",
|
7 |
+
"caption": "The image shows two elephants performing distinct actions. \n\n1. The larger elephant walks forward, swinging its trunk side to side as it moves through the grass. \n2. The smaller elephant follows closely behind, walking on four legs, with its trunk lowered and touching the ground. \n\nThe larger elephant also flaps its ears open and closed, creating a visible motion as it progresses. Both elephants are primarily focused on their movement through the grassy area.",
|
8 |
+
"cat_name": "elephant",
|
9 |
+
"file_name": "00020",
|
10 |
+
"isValid": true
|
11 |
+
},
|
12 |
+
"2": {
|
13 |
+
"ref_exp": "The elephant follows closely behind while walking on four legs with its trunk lowered and touching the ground.",
|
14 |
+
"caption": "The image shows two elephants performing distinct actions. \n\n1. The larger elephant walks forward, swinging its trunk side to side as it moves through the grass. \n2. The smaller elephant follows closely behind, walking on four legs, with its trunk lowered and touching the ground. \n\nThe larger elephant also flaps its ears open and closed, creating a visible motion as it progresses. Both elephants are primarily focused on their movement through the grassy area.",
|
15 |
+
"cat_name": "elephant",
|
16 |
+
"file_name": "00020",
|
17 |
+
"isValid": false
|
18 |
+
}
|
19 |
+
},
|
20 |
+
"00035": {
|
21 |
+
"1": {
|
22 |
+
"ref_exp": "The elephant is using its trunk to grasp and pull foliage from the ground.",
|
23 |
+
"caption": "In the image, one elephant is standing upright, using its trunk to grasp and pull foliage from the ground. It appears to be consuming the vegetation. The second elephant, smaller in size, is positioned nearby, walking closely alongside the first, with its trunk lowered toward the ground, possibly interacting with the grasses. The body posture of both elephants indicates movement and engagement with their surroundings.",
|
24 |
+
"cat_name": "elephant",
|
25 |
+
"file_name": "00035",
|
26 |
+
"isValid": false
|
27 |
+
},
|
28 |
+
"2": {
|
29 |
+
"ref_exp": "The elephant is standing and using its trunk to grasp and pull foliage from the ground.",
|
30 |
+
"caption": "In the image, one elephant is standing upright, using its trunk to grasp and pull foliage from the ground. It appears to be consuming the vegetation. The second elephant, smaller in size, is positioned nearby, walking closely alongside the first, with its trunk lowered toward the ground, possibly interacting with the grasses. The body posture of both elephants indicates movement and engagement with their surroundings.",
|
31 |
+
"cat_name": "elephant",
|
32 |
+
"file_name": "00035",
|
33 |
+
"isValid": true
|
34 |
+
}
|
35 |
+
},
|
36 |
+
"00055": {
|
37 |
+
"1": {
|
38 |
+
"ref_exp": "The elephant is walking forward while using its trunk to reach down to the ground.",
|
39 |
+
"caption": "In the image, a larger elephant is walking forward with a slight bend in its trunk, while a smaller elephant follows closely behind. The larger elephant is moving through the tall grass, using its trunk to occasionally reach down to the ground. At the same time, the smaller elephant appears to be moving at a brisk pace and is also navigating through the grass, maintaining proximity to the larger one. The movement of both elephants indicates a coordinated action as they travel in the same direction toward the nearby water.",
|
40 |
+
"cat_name": "elephant",
|
41 |
+
"file_name": "00055",
|
42 |
+
"isValid": false
|
43 |
+
},
|
44 |
+
"2": {
|
45 |
+
"ref_exp": "The elephant is walking forward with its trunk slightly bent, using it to reach down to the ground.",
|
46 |
+
"caption": "In the image, a larger elephant is walking forward with a slight bend in its trunk, while a smaller elephant follows closely behind. The larger elephant is moving through the tall grass, using its trunk to occasionally reach down to the ground. At the same time, the smaller elephant appears to be moving at a brisk pace and is also navigating through the grass, maintaining proximity to the larger one. The movement of both elephants indicates a coordinated action as they travel in the same direction toward the nearby water.",
|
47 |
+
"cat_name": "elephant",
|
48 |
+
"file_name": "00055",
|
49 |
+
"isValid": false
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"00075": {
|
53 |
+
"1": {
|
54 |
+
"ref_exp": "The elephant is moving forward with its trunk extended downward.",
|
55 |
+
"caption": "In the image, several elephants are depicted in motion. \n\n1. The largest elephant is moving forward, its trunk extended downward, and its ears are flapping. \n2. A smaller elephant is walking closely behind, appearing to be following the larger one, with its trunk raised slightly. \n3. A very small elephant is positioned near the middle, walking on a similar path, with its trunk held close to the ground. \n\nThe actions of the elephants are directed towards moving through the grassy area, maintaining a formation where the smaller elephants trail behind the larger one.",
|
56 |
+
"cat_name": "elephant",
|
57 |
+
"file_name": "00075",
|
58 |
+
"isValid": false
|
59 |
+
},
|
60 |
+
"2": {
|
61 |
+
"ref_exp": "The elephant is moving forward with its trunk extended downward.",
|
62 |
+
"caption": "In the image, several elephants are depicted in motion. \n\n1. The largest elephant is moving forward, its trunk extended downward, and its ears are flapping. \n2. A smaller elephant is walking closely behind, appearing to be following the larger one, with its trunk raised slightly. \n3. A very small elephant is positioned near the middle, walking on a similar path, with its trunk held close to the ground. \n\nThe actions of the elephants are directed towards moving through the grassy area, maintaining a formation where the smaller elephants trail behind the larger one.",
|
63 |
+
"cat_name": "elephant",
|
64 |
+
"file_name": "00075",
|
65 |
+
"isValid": false
|
66 |
+
}
|
67 |
+
}
|
68 |
+
},
|
69 |
+
"013099c098": {
|
70 |
+
"00020": {
|
71 |
+
"1": {
|
72 |
+
"ref_exp": "The giant_panda is climbing on the back of another giant_panda.",
|
73 |
+
"caption": "Two giant pandas are engaged in distinct actions. \n\n1. The panda on the left is climbing on the back of the panda on the right, using its front paws to grasp the other panda's shoulders. \n\n2. The panda on the right is seated in a slight stance, with its head lowered and looking towards the ground while water from a small stream splashes around.\n\n3. Water is trickling down from the rocks behind them, creating ripples around the panda on the right as it shifts its weight.\n\n4. The panda on the left is positioned slightly higher, with its body elevated above the other, indicating a playful interaction. \n\nEach action is prominent and clear, showcasing the pandas\u2019 movements distinctly.",
|
74 |
+
"cat_name": "giant_panda",
|
75 |
+
"file_name": "00020",
|
76 |
+
"isValid": false
|
77 |
+
},
|
78 |
+
"2": {
|
79 |
+
"ref_exp": "The giant_panda is climbing on the back of another panda.",
|
80 |
+
"caption": "Two giant pandas are engaged in distinct actions. \n\n1. The panda on the left is climbing on the back of the panda on the right, using its front paws to grasp the other panda's shoulders. \n\n2. The panda on the right is seated in a slight stance, with its head lowered and looking towards the ground while water from a small stream splashes around.\n\n3. Water is trickling down from the rocks behind them, creating ripples around the panda on the right as it shifts its weight.\n\n4. The panda on the left is positioned slightly higher, with its body elevated above the other, indicating a playful interaction. \n\nEach action is prominent and clear, showcasing the pandas\u2019 movements distinctly.",
|
81 |
+
"cat_name": "giant_panda",
|
82 |
+
"file_name": "00020",
|
83 |
+
"isValid": true
|
84 |
+
}
|
85 |
+
},
|
86 |
+
"00030": {
|
87 |
+
"1": {
|
88 |
+
"ref_exp": "The giant panda is splashing water with its paws.",
|
89 |
+
"caption": "In the image, one giant panda is positioned beside a small flow of water, using its paws to splash water playfully. The panda appears to be engaging with the water, causing droplets to scatter into the air. Nearby, another panda is focused on grasping something with its mouth, seemingly nibbling or chewing on it. This panda is low to the ground, with its front paws bracing itself as it maintains balance while interacting with the object in front of it. Both pandas are actively involved in their respective actions, creating a dynamic scene.",
|
90 |
+
"cat_name": "giant_panda",
|
91 |
+
"file_name": "00030",
|
92 |
+
"isValid": false
|
93 |
+
},
|
94 |
+
"2": {
|
95 |
+
"ref_exp": "The giant_panda is using its paws to splash water.",
|
96 |
+
"caption": "In the image, one giant panda is positioned beside a small flow of water, using its paws to splash water playfully. The panda appears to be engaging with the water, causing droplets to scatter into the air. Nearby, another panda is focused on grasping something with its mouth, seemingly nibbling or chewing on it. This panda is low to the ground, with its front paws bracing itself as it maintains balance while interacting with the object in front of it. Both pandas are actively involved in their respective actions, creating a dynamic scene.",
|
97 |
+
"cat_name": "giant_panda",
|
98 |
+
"file_name": "00030",
|
99 |
+
"isValid": true
|
100 |
+
}
|
101 |
+
},
|
102 |
+
"00050": {
|
103 |
+
"1": {
|
104 |
+
"ref_exp": "The giant_panda is leaning forward to make contact with the other panda.",
|
105 |
+
"caption": "The image features two giant pandas engaged in the following actions:\n\n1. One panda is positioned on its hind legs, interacting with the other panda, which is lying on its back. The upright panda leans forward, making direct contact with the other panda.\n\n2. The panda that is lying on its back uses its front paws to swat playfully at the upright panda, displaying an open posture with its limbs extended.\n\n3. Water droplets are visible on the lying panda as it rolls slightly, resulting in splashes, indicating movement in a wet area.\n\n4. The upright panda appears to gently push down on the other panda\u2019s chest with its front paw while maintaining its position on hind legs.\n\n5. The prostrate panda lifts its head, looking towards the upright panda, showcasing an engaged posture in response to the interaction.",
|
106 |
+
"cat_name": "giant_panda",
|
107 |
+
"file_name": "00050",
|
108 |
+
"isValid": true
|
109 |
+
},
|
110 |
+
"2": {
|
111 |
+
"ref_exp": "The giant_panda interacts playfully with another panda.",
|
112 |
+
"caption": "The image features two giant pandas engaged in the following actions:\n\n1. One panda is positioned on its hind legs, interacting with the other panda, which is lying on its back. The upright panda leans forward, making direct contact with the other panda.\n\n2. The panda that is lying on its back uses its front paws to swat playfully at the upright panda, displaying an open posture with its limbs extended.\n\n3. Water droplets are visible on the lying panda as it rolls slightly, resulting in splashes, indicating movement in a wet area.\n\n4. The upright panda appears to gently push down on the other panda\u2019s chest with its front paw while maintaining its position on hind legs.\n\n5. The prostrate panda lifts its head, looking towards the upright panda, showcasing an engaged posture in response to the interaction.",
|
113 |
+
"cat_name": "giant_panda",
|
114 |
+
"file_name": "00050",
|
115 |
+
"isValid": false
|
116 |
+
}
|
117 |
+
},
|
118 |
+
"00070": {
|
119 |
+
"1": {
|
120 |
+
"ref_exp": "The giant_panda is using its front paws to grip another panda while lying on its back.",
|
121 |
+
"caption": "In the image, two giant pandas are engaged in a playful interaction. \n\n1. The first panda is lying on its back and using its front paws to push against the second panda, which is positioned above it.\n2. The second panda is leaning forward, resting its front paws on the first panda's chest, its face directed toward the first panda\u2019s face.\n3. Water droplets are splashing from the fur of both pandas as they move, indicating their active engagement.\n4. The first panda adjusts its position, rolling slightly to one side, while the second panda remains steady on top, maintaining contact through their paws.\n5. Both pandas appear to be using their forelimbs to grip each other playfully. \n\nThese actions illustrate their interaction and physical engagement without any other contextual details.",
|
122 |
+
"cat_name": "giant_panda",
|
123 |
+
"file_name": "00070",
|
124 |
+
"isValid": false
|
125 |
+
},
|
126 |
+
"2": {
|
127 |
+
"ref_exp": "The giant_panda is using its front paws to push against the other giant_panda.",
|
128 |
+
"caption": "In the image, two giant pandas are engaged in a playful interaction. \n\n1. The first panda is lying on its back and using its front paws to push against the second panda, which is positioned above it.\n2. The second panda is leaning forward, resting its front paws on the first panda's chest, its face directed toward the first panda\u2019s face.\n3. Water droplets are splashing from the fur of both pandas as they move, indicating their active engagement.\n4. The first panda adjusts its position, rolling slightly to one side, while the second panda remains steady on top, maintaining contact through their paws.\n5. Both pandas appear to be using their forelimbs to grip each other playfully. \n\nThese actions illustrate their interaction and physical engagement without any other contextual details.",
|
129 |
+
"cat_name": "giant_panda",
|
130 |
+
"file_name": "00070",
|
131 |
+
"isValid": false
|
132 |
+
}
|
133 |
+
}
|
134 |
+
},
|
135 |
+
"863b4049d7": {
|
136 |
+
"00010": null,
|
137 |
+
"00065": {
|
138 |
+
"1": {
|
139 |
+
"ref_exp": "The sheep is grazing.",
|
140 |
+
"caption": "In the image, the sheep exhibit various actions:\n\n1. **Grazing**: Several sheep are positioned near the ground, using their mouths to pull at visible patches of grass or forage in the dirt.\n\n2. **Moving**: A group of sheep is shifting from one spot to another, their legs visibly stepping as they walk in different directions.\n\n3. **Standing**: Some sheep are standing still, facing different orientations, with their bodies upright and stable.\n\n4. **Nudging**: A few sheep are seen nudging each other gently with their heads, engaging in a behavior that involves physical contact.\n\n5. **Vocalizing**: Some sheep are shown with their mouths open, indicating they are making noise, contributing to the collective sound of the group.\n\n6. **Resting**: A handful of sheep are positioned with their bodies lowered, likely resting while remaining alert to their surroundings. \n\nEach action demonstrates a specific behavior that reflects the sheep's daily activities.",
|
141 |
+
"cat_name": "sheep",
|
142 |
+
"file_name": "00065",
|
143 |
+
"isValid": false
|
144 |
+
},
|
145 |
+
"2": {
|
146 |
+
"ref_exp": "The sheep is grazing.",
|
147 |
+
"caption": "In the image, the sheep exhibit various actions:\n\n1. **Grazing**: Several sheep are positioned near the ground, using their mouths to pull at visible patches of grass or forage in the dirt.\n\n2. **Moving**: A group of sheep is shifting from one spot to another, their legs visibly stepping as they walk in different directions.\n\n3. **Standing**: Some sheep are standing still, facing different orientations, with their bodies upright and stable.\n\n4. **Nudging**: A few sheep are seen nudging each other gently with their heads, engaging in a behavior that involves physical contact.\n\n5. **Vocalizing**: Some sheep are shown with their mouths open, indicating they are making noise, contributing to the collective sound of the group.\n\n6. **Resting**: A handful of sheep are positioned with their bodies lowered, likely resting while remaining alert to their surroundings. \n\nEach action demonstrates a specific behavior that reflects the sheep's daily activities.",
|
148 |
+
"cat_name": "sheep",
|
149 |
+
"file_name": "00065",
|
150 |
+
"isValid": false
|
151 |
+
}
|
152 |
+
},
|
153 |
+
"00115": null,
|
154 |
+
"00165": {
|
155 |
+
"1": {
|
156 |
+
"ref_exp": "The sheep is stepping forward with a foot raised off the ground.",
|
157 |
+
"caption": "In the image, several sheep are depicted engaging in distinct actions:\n\n1. One sheep is moving away from the group, stepping to the left with a single foot raised off the ground, suggesting a direction towards the open space.\n2. A cluster of sheep stands closely together, with their heads lowered, indicating they are grazing or examining the ground.\n3. Another sheep is positioned at the back, looking back towards the rest of the group, its head turned to the right.\n4. A few sheep are lined up near a fence, forming a semi-circle, with their bodies angled outward.\n5. Some sheep are standing with their legs apart on the ground, creating a stable posture.\n\nOverall, the actions of the sheep demonstrate various movements and interactions within the group.",
|
158 |
+
"cat_name": "sheep",
|
159 |
+
"file_name": "00165",
|
160 |
+
"isValid": false
|
161 |
+
},
|
162 |
+
"2": {
|
163 |
+
"ref_exp": "The sheep is stepping away from the group.",
|
164 |
+
"caption": "In the image, several sheep are depicted engaging in distinct actions:\n\n1. One sheep is moving away from the group, stepping to the left with a single foot raised off the ground, suggesting a direction towards the open space.\n2. A cluster of sheep stands closely together, with their heads lowered, indicating they are grazing or examining the ground.\n3. Another sheep is positioned at the back, looking back towards the rest of the group, its head turned to the right.\n4. A few sheep are lined up near a fence, forming a semi-circle, with their bodies angled outward.\n5. Some sheep are standing with their legs apart on the ground, creating a stable posture.\n\nOverall, the actions of the sheep demonstrate various movements and interactions within the group.",
|
165 |
+
"cat_name": "sheep",
|
166 |
+
"file_name": "00165",
|
167 |
+
"isValid": false
|
168 |
+
}
|
169 |
+
}
|
170 |
+
},
|
171 |
+
"c36240d96f": {
|
172 |
+
"00035": null,
|
173 |
+
"00045": null,
|
174 |
+
"00095": null,
|
175 |
+
"00120": {
|
176 |
+
"1": {
|
177 |
+
"ref_exp": "The parrot pecks at the floor near the mirror.",
|
178 |
+
"caption": "The image features two parrots interacting with each other and their reflection in a mirror. \n\n1. One parrot approaches the mirror and pauses in front of it, turning its head to look at its reflection.\n2. The second parrot moves closer to the first parrot, tilting its head as it observes the first parrot's actions.\n3. The first parrot begins to peck at the floor near the mirror, using its beak to make contact with the surface.\n4. The second parrot shifts its position slightly, fluttering its wings and then stepping back.\n5. The first parrot flaps its wings in response, appearing to mirror the actions of the second parrot.\n6. Both parrots take turns moving closer to the mirror, occasionally stopping to examine their reflections.",
|
179 |
+
"cat_name": "parrot",
|
180 |
+
"file_name": "00120",
|
181 |
+
"isValid": false
|
182 |
+
},
|
183 |
+
"2": {
|
184 |
+
"ref_exp": "The parrot pecks at the floor near the mirror.",
|
185 |
+
"caption": "The image features two parrots interacting with each other and their reflection in a mirror. \n\n1. One parrot approaches the mirror and pauses in front of it, turning its head to look at its reflection.\n2. The second parrot moves closer to the first parrot, tilting its head as it observes the first parrot's actions.\n3. The first parrot begins to peck at the floor near the mirror, using its beak to make contact with the surface.\n4. The second parrot shifts its position slightly, fluttering its wings and then stepping back.\n5. The first parrot flaps its wings in response, appearing to mirror the actions of the second parrot.\n6. Both parrots take turns moving closer to the mirror, occasionally stopping to examine their reflections.",
|
186 |
+
"cat_name": "parrot",
|
187 |
+
"file_name": "00120",
|
188 |
+
"isValid": true
|
189 |
+
}
|
190 |
+
}
|
191 |
+
},
|
192 |
+
"f66981af4e": null,
|
193 |
+
"ef45ce3035": {
|
194 |
+
"00035": {
|
195 |
+
"1": {
|
196 |
+
"ref_exp": "The earless_seal is resting on a rocky surface.",
|
197 |
+
"caption": "In the image, several earless seals are observed engaged in various actions:\n\n1. One seal is swimming close to the surface, its body largely submerged while its head is above the waterline.\n2. Another seal is diving deeper, with its flippers spread as it propels itself downwards.\n3. Several seals are resting on a rocky surface, positioned on their sides with their bodies mostly still.\n4. One seal is nudging another with its snout while floating, demonstrating interaction.\n5. A seal is splashing water with its flippers while maneuvering in the shallow area near the rocks.\n6. Another seal is emerging from the water, shaking its body to remove droplets as it climbs onto the rocky shoreline.",
|
198 |
+
"cat_name": "earless_seal",
|
199 |
+
"file_name": "00035",
|
200 |
+
"isValid": true
|
201 |
+
},
|
202 |
+
"2": {}
|
203 |
+
},
|
204 |
+
"00065": {
|
205 |
+
"1": {
|
206 |
+
"ref_exp": "The earless_seal is resting on the surface.",
|
207 |
+
"caption": "In the image, the earless seals exhibit several distinct actions:\n\n1. One seal is resting on a rock, positioned against the water, with its body sprawled comfortably on the surface.\n2. Another seal is swimming in the water, partially submerged, with its head and upper body visible as it moves forward.\n3. A group of seals is engaged in a playful interaction, splashing the water around them, causing ripples and waves.\n4. A seal is seen diving below the surface, its streamlined body disappearing as it moves downward.\n5. Several seals are clustered together, appearing to engage in social behavior, while others are scattered in various positions throughout the water.",
|
208 |
+
"cat_name": "earless_seal",
|
209 |
+
"file_name": "00065",
|
210 |
+
"isValid": true
|
211 |
+
},
|
212 |
+
"2": {
|
213 |
+
"ref_exp": "The earless_seal is positioned on the ground with its body partially submerged in the water.",
|
214 |
+
"caption": "In the image, the earless seals exhibit several distinct actions:\n\n1. One seal is resting on a rock, positioned against the water, with its body sprawled comfortably on the surface.\n2. Another seal is swimming in the water, partially submerged, with its head and upper body visible as it moves forward.\n3. A group of seals is engaged in a playful interaction, splashing the water around them, causing ripples and waves.\n4. A seal is seen diving below the surface, its streamlined body disappearing as it moves downward.\n5. Several seals are clustered together, appearing to engage in social behavior, while others are scattered in various positions throughout the water.",
|
215 |
+
"cat_name": "earless_seal",
|
216 |
+
"file_name": "00065",
|
217 |
+
"isValid": true
|
218 |
+
}
|
219 |
+
},
|
220 |
+
"00120": {
|
221 |
+
"1": {
|
222 |
+
"ref_exp": "The earless_seal is climbing onto a rock using its flippers.",
|
223 |
+
"caption": "In the image, several earless seals are engaged in distinct actions:\n\n1. One seal is resting on a smooth rock, positioned upright with its body supported by its flippers.\n2. Another seal is partially submerged in the water, with its head above the surface while its body is mainly underwater.\n3. A seal is swimming, moving through the water with its body streamlined, creating ripples around it.\n4. Two seals are interacting close to each other, appearing to engage in playful behavior, possibly splashing water.\n5. A seal is climbing onto a rock, using its flippers to push itself up and elevate its body from the water.\n6. One seal is yawning, displaying its mouth wide open while remaining on a rock. \n\nEach action is clear and distinct, showcasing the behaviors of the earless seals in their environment.",
|
224 |
+
"cat_name": "earless_seal",
|
225 |
+
"file_name": "00120",
|
226 |
+
"isValid": false
|
227 |
+
},
|
228 |
+
"2": {
|
229 |
+
"ref_exp": "The earless_seal is climbing onto a rock.",
|
230 |
+
"caption": "In the image, several earless seals are engaged in distinct actions:\n\n1. One seal is resting on a smooth rock, positioned upright with its body supported by its flippers.\n2. Another seal is partially submerged in the water, with its head above the surface while its body is mainly underwater.\n3. A seal is swimming, moving through the water with its body streamlined, creating ripples around it.\n4. Two seals are interacting close to each other, appearing to engage in playful behavior, possibly splashing water.\n5. A seal is climbing onto a rock, using its flippers to push itself up and elevate its body from the water.\n6. One seal is yawning, displaying its mouth wide open while remaining on a rock. \n\nEach action is clear and distinct, showcasing the behaviors of the earless seals in their environment.",
|
231 |
+
"cat_name": "earless_seal",
|
232 |
+
"file_name": "00120",
|
233 |
+
"isValid": false
|
234 |
+
}
|
235 |
+
},
|
236 |
+
"00165": {
|
237 |
+
"1": {
|
238 |
+
"ref_exp": "The earless seal interacts with the water, creating ripples as it splashes with its flippers.",
|
239 |
+
"caption": "In the image, several earless seals are engaged in various actions:\n\n1. An earless seal lies on a rock, positioned upright and utilizing its flippers for balance.\n2. Another seal is submerged in the water, with only its head visible above the surface, actively paddling with its fore flippers.\n3. A group of seals swims near the surface, moving in synchrony while occasionally basking in the water.\n4. One seal is climbing onto a rocky ledge, using its body to push against the stone for support.\n5. Another seal is interacting with the water, creating ripples as it splashes with its flippers.\n6. In the background, two seals are positioned adjacent to each other, grooming themselves by scratching at their fur.",
|
240 |
+
"cat_name": "earless_seal",
|
241 |
+
"file_name": "00165",
|
242 |
+
"isValid": true
|
243 |
+
},
|
244 |
+
"2": {
|
245 |
+
"ref_exp": "The earless_seal is lying on a rock and using its flippers for balance.",
|
246 |
+
"caption": "In the image, several earless seals are engaged in various actions:\n\n1. An earless seal lies on a rock, positioned upright and utilizing its flippers for balance.\n2. Another seal is submerged in the water, with only its head visible above the surface, actively paddling with its fore flippers.\n3. A group of seals swims near the surface, moving in synchrony while occasionally basking in the water.\n4. One seal is climbing onto a rocky ledge, using its body to push against the stone for support.\n5. Another seal is interacting with the water, creating ripples as it splashes with its flippers.\n6. In the background, two seals are positioned adjacent to each other, grooming themselves by scratching at their fur.",
|
247 |
+
"cat_name": "earless_seal",
|
248 |
+
"file_name": "00165",
|
249 |
+
"isValid": false
|
250 |
+
}
|
251 |
+
}
|
252 |
+
},
|
253 |
+
"750be4c4d8": {
|
254 |
+
"00065": {
|
255 |
+
"1": {
|
256 |
+
"ref_exp": "The person is walking away from the buses.",
|
257 |
+
"caption": "The image depicts several individuals interacting with buses in a bus yard. \n\n1. One person in a red shirt is pointing at the bus, likely directing attention toward it.\n2. Another person wearing a yellow vest is walking towards the bus with a clipboard in hand.\n3. A third individual is standing still, watching the buses, possibly assessing the situation.\n4. An individual with a backpack is walking away from the buses, moving toward the left side of the image.\n\nThe buses appear to be articulated, connected at their center, and are positioned in the yard.",
|
258 |
+
"cat_name": "person",
|
259 |
+
"file_name": "00065",
|
260 |
+
"isValid": false
|
261 |
+
},
|
262 |
+
"2": {
|
263 |
+
"ref_exp": "The person is pointing at the bus.",
|
264 |
+
"caption": "The image depicts several individuals interacting with buses in a bus yard. \n\n1. One person in a red shirt is pointing at the bus, likely directing attention toward it.\n2. Another person wearing a yellow vest is walking towards the bus with a clipboard in hand.\n3. A third individual is standing still, watching the buses, possibly assessing the situation.\n4. An individual with a backpack is walking away from the buses, moving toward the left side of the image.\n\nThe buses appear to be articulated, connected at their center, and are positioned in the yard.",
|
265 |
+
"cat_name": "person",
|
266 |
+
"file_name": "00065",
|
267 |
+
"isValid": false
|
268 |
+
},
|
269 |
+
"3": {
|
270 |
+
"ref_exp": "The person is pointing at the bus.",
|
271 |
+
"caption": "The image depicts several individuals interacting with buses in a bus yard. \n\n1. One person in a red shirt is pointing at the bus, likely directing attention toward it.\n2. Another person wearing a yellow vest is walking towards the bus with a clipboard in hand.\n3. A third individual is standing still, watching the buses, possibly assessing the situation.\n4. An individual with a backpack is walking away from the buses, moving toward the left side of the image.\n\nThe buses appear to be articulated, connected at their center, and are positioned in the yard.",
|
272 |
+
"cat_name": "person",
|
273 |
+
"file_name": "00065",
|
274 |
+
"isValid": false
|
275 |
+
}
|
276 |
+
},
|
277 |
+
"00090": {
|
278 |
+
"1": {
|
279 |
+
"ref_exp": "A person is standing with a backpack, observing a bus.",
|
280 |
+
"caption": "In the image, multiple actions are taking place:\n\n1. A person is standing with a backpack, facing a yellow and blue bus, observing it.\n2. Several individuals are grouped together, appearing to converse or wait near another bus in the background.\n3. A bus is parked with its doors closed, indicating that it is not currently in use.\n4. A small group of children is nearby, some appearing to play or engage in activity.\n5. Another bus is positioned slightly ahead, and a few individuals are walking around it, possibly preparing to board or disembark.\n6. A larger vehicle, likely a truck, is positioned in the background, not in direct interaction with the people. \n\nEach action clearly reflects movement in relation to the buses within the scene.",
|
281 |
+
"cat_name": "person",
|
282 |
+
"file_name": "00090",
|
283 |
+
"isValid": true
|
284 |
+
},
|
285 |
+
"2": {
|
286 |
+
"ref_exp": "Person standing and observing a bus.",
|
287 |
+
"caption": "In the image, multiple actions are taking place:\n\n1. A person is standing with a backpack, facing a yellow and blue bus, observing it.\n2. Several individuals are grouped together, appearing to converse or wait near another bus in the background.\n3. A bus is parked with its doors closed, indicating that it is not currently in use.\n4. A small group of children is nearby, some appearing to play or engage in activity.\n5. Another bus is positioned slightly ahead, and a few individuals are walking around it, possibly preparing to board or disembark.\n6. A larger vehicle, likely a truck, is positioned in the background, not in direct interaction with the people. \n\nEach action clearly reflects movement in relation to the buses within the scene.",
|
288 |
+
"cat_name": "person",
|
289 |
+
"file_name": "00090",
|
290 |
+
"isValid": true
|
291 |
+
},
|
292 |
+
"3": {
|
293 |
+
"ref_exp": "Person is standing with a backpack and observing a bus.",
|
294 |
+
"caption": "In the image, multiple actions are taking place:\n\n1. A person is standing with a backpack, facing a yellow and blue bus, observing it.\n2. Several individuals are grouped together, appearing to converse or wait near another bus in the background.\n3. A bus is parked with its doors closed, indicating that it is not currently in use.\n4. A small group of children is nearby, some appearing to play or engage in activity.\n5. Another bus is positioned slightly ahead, and a few individuals are walking around it, possibly preparing to board or disembark.\n6. A larger vehicle, likely a truck, is positioned in the background, not in direct interaction with the people. \n\nEach action clearly reflects movement in relation to the buses within the scene.",
|
295 |
+
"cat_name": "person",
|
296 |
+
"file_name": "00090",
|
297 |
+
"isValid": false
|
298 |
+
}
|
299 |
+
},
|
300 |
+
"00115": {
|
301 |
+
"1": {
|
302 |
+
"ref_exp": "Person walking towards the yellow bus.",
|
303 |
+
"caption": "In the image, several actions are taking place:\n\n1. A yellow bus is positioned at the center, with its back doors opened, indicating it's either loading or unloading passengers.\n2. A group of individuals, including a small child, is positioned to the left of the yellow bus, standing and facing towards it, some appearing to engage in conversation.\n3. Another individual is walking towards the yellow bus, carrying a backpack, potentially intending to board.\n4. A person in a red jacket is near a second bus, with their body turned slightly towards it, suggesting they are looking at the bus.\n5. In the background, a few figures are standing near a building, possibly waiting or observing the area.\n\nThese actions are distinct and highlight the interactions around the buses.",
|
304 |
+
"cat_name": "person",
|
305 |
+
"file_name": "00115",
|
306 |
+
"isValid": false
|
307 |
+
},
|
308 |
+
"2": {
|
309 |
+
"ref_exp": "The person is looking at the bus.",
|
310 |
+
"caption": "In the image, several actions are taking place:\n\n1. A yellow bus is positioned at the center, with its back doors opened, indicating it's either loading or unloading passengers.\n2. A group of individuals, including a small child, is positioned to the left of the yellow bus, standing and facing towards it, some appearing to engage in conversation.\n3. Another individual is walking towards the yellow bus, carrying a backpack, potentially intending to board.\n4. A person in a red jacket is near a second bus, with their body turned slightly towards it, suggesting they are looking at the bus.\n5. In the background, a few figures are standing near a building, possibly waiting or observing the area.\n\nThese actions are distinct and highlight the interactions around the buses.",
|
311 |
+
"cat_name": "person",
|
312 |
+
"file_name": "00115",
|
313 |
+
"isValid": false
|
314 |
+
},
|
315 |
+
"3": {
|
316 |
+
"ref_exp": "A person is standing near a second bus.",
|
317 |
+
"caption": "In the image, several actions are taking place:\n\n1. A yellow bus is positioned at the center, with its back doors opened, indicating it's either loading or unloading passengers.\n2. A group of individuals, including a small child, is positioned to the left of the yellow bus, standing and facing towards it, some appearing to engage in conversation.\n3. Another individual is walking towards the yellow bus, carrying a backpack, potentially intending to board.\n4. A person in a red jacket is near a second bus, with their body turned slightly towards it, suggesting they are looking at the bus.\n5. In the background, a few figures are standing near a building, possibly waiting or observing the area.\n\nThese actions are distinct and highlight the interactions around the buses.",
|
318 |
+
"cat_name": "person",
|
319 |
+
"file_name": "00115",
|
320 |
+
"isValid": false
|
321 |
+
}
|
322 |
+
},
|
323 |
+
"00125": {
|
324 |
+
"1": {
|
325 |
+
"ref_exp": "The person is walking away from the bus.",
|
326 |
+
"caption": "In the image, a yellow and blue bus is maneuvering, likely making a turn or preparing to depart. Nearby, a group of people is gathered, with some individuals walking in different directions. One person is carrying a bag and walking away from the bus, while another group appears to be interacting near the bus. A fire truck is parked in the background, with its lights visible. Children can be seen moving around the area, possibly playing or exploring. The setting suggests an active scene with various actions unfolding around the bus.",
|
327 |
+
"cat_name": "person",
|
328 |
+
"file_name": "00125",
|
329 |
+
"isValid": true
|
330 |
+
},
|
331 |
+
"2": {
|
332 |
+
"ref_exp": "The person is standing by the bus.",
|
333 |
+
"caption": "In the image, a yellow and blue bus is maneuvering, likely making a turn or preparing to depart. Nearby, a group of people is gathered, with some individuals walking in different directions. One person is carrying a bag and walking away from the bus, while another group appears to be interacting near the bus. A fire truck is parked in the background, with its lights visible. Children can be seen moving around the area, possibly playing or exploring. The setting suggests an active scene with various actions unfolding around the bus.",
|
334 |
+
"cat_name": "person",
|
335 |
+
"file_name": "00125",
|
336 |
+
"isValid": true
|
337 |
+
},
|
338 |
+
"3": {
|
339 |
+
"ref_exp": "Person is standing and interacting with others.",
|
340 |
+
"caption": "In the image, a yellow and blue bus is maneuvering, likely making a turn or preparing to depart. Nearby, a group of people is gathered, with some individuals walking in different directions. One person is carrying a bag and walking away from the bus, while another group appears to be interacting near the bus. A fire truck is parked in the background, with its lights visible. Children can be seen moving around the area, possibly playing or exploring. The setting suggests an active scene with various actions unfolding around the bus.",
|
341 |
+
"cat_name": "person",
|
342 |
+
"file_name": "00125",
|
343 |
+
"isValid": false
|
344 |
+
}
|
345 |
+
}
|
346 |
+
},
|
347 |
+
"c307f33da2": {
|
348 |
+
"00225": {
|
349 |
+
"1": {
|
350 |
+
"ref_exp": "The giraffe extends its neck towards a person offering food.",
|
351 |
+
"caption": "In the image, one giraffe extends its long neck towards a person holding a piece of green leafy food. The giraffe's mouth opens slightly, indicating an action of reaching or preparing to take the food. Another giraffe, positioned slightly behind the first, appears to be standing still, not actively engaged in the feeding process. The person is also leaning forward with an outstretched arm, clearly directing the food towards the giraffe. In the background, several other individuals are observing the scene, but their actions are not the focus of this description.",
|
352 |
+
"cat_name": "giraffe",
|
353 |
+
"file_name": "00225",
|
354 |
+
"isValid": true
|
355 |
+
},
|
356 |
+
"2": {
|
357 |
+
"ref_exp": "The giraffe reaches towards the person holding food.",
|
358 |
+
"caption": "In the image, one giraffe extends its long neck towards a person holding a piece of green leafy food. The giraffe's mouth opens slightly, indicating an action of reaching or preparing to take the food. Another giraffe, positioned slightly behind the first, appears to be standing still, not actively engaged in the feeding process. The person is also leaning forward with an outstretched arm, clearly directing the food towards the giraffe. In the background, several other individuals are observing the scene, but their actions are not the focus of this description.",
|
359 |
+
"cat_name": "giraffe",
|
360 |
+
"file_name": "00225",
|
361 |
+
"isValid": true
|
362 |
+
},
|
363 |
+
"3": {}
|
364 |
+
},
|
365 |
+
"00245": {
|
366 |
+
"1": {
|
367 |
+
"ref_exp": "The giraffe extends its neck to grasp leaves from a woman's hand.",
|
368 |
+
"caption": "In the image, one giraffe extends its long neck towards a woman holding green leaves, using its tongue to grasp the leaves from her hand. This giraffe seems focused on the leaves. A second giraffe is visible in the background, standing upright with its body facing away from the viewer. The woman's arm is extended while she remains stationary, and she appears to be interacting with the giraffe. The leaves are prominently held in front of the giraffe's mouth as it reaches for them.",
|
369 |
+
"cat_name": "giraffe",
|
370 |
+
"file_name": "00245",
|
371 |
+
"isValid": true
|
372 |
+
},
|
373 |
+
"2": {
|
374 |
+
"ref_exp": "The giraffe extends its neck to grasp leaves from a woman's hand.",
|
375 |
+
"caption": "In the image, one giraffe extends its long neck towards a woman holding green leaves, using its tongue to grasp the leaves from her hand. This giraffe seems focused on the leaves. A second giraffe is visible in the background, standing upright with its body facing away from the viewer. The woman's arm is extended while she remains stationary, and she appears to be interacting with the giraffe. The leaves are prominently held in front of the giraffe's mouth as it reaches for them.",
|
376 |
+
"cat_name": "giraffe",
|
377 |
+
"file_name": "00245",
|
378 |
+
"isValid": true
|
379 |
+
},
|
380 |
+
"3": {}
|
381 |
+
},
|
382 |
+
"00265": {
|
383 |
+
"1": {
|
384 |
+
"ref_exp": "The giraffe extends its neck toward the woman, reaching out its tongue to grasp the greens being offered.",
|
385 |
+
"caption": "The image shows two giraffes engaging in distinct actions. \n\n1. The closest giraffe extends its neck toward a woman holding a cluster of leafy greens. Its head is tilted forward, and its tongue is reaching out, likely to grasp the food being offered.\n\n2. A second giraffe is standing slightly behind, facing away from the camera. It is standing upright, with its legs straight and its posture relaxed, while observing the interaction between the first giraffe and the woman.\n\n3. The woman, holding a child, is extending her arm, presenting the greens in an upward motion towards the giraffe.\n\n4. A child in her arms is looking toward the giraffe, with a curious expression as the animal approaches. \n\nThe scene captures the direct interaction and feeding behavior of giraffes in response to human presence.",
|
386 |
+
"cat_name": "giraffe",
|
387 |
+
"file_name": "00265",
|
388 |
+
"isValid": true
|
389 |
+
},
|
390 |
+
"2": {
|
391 |
+
"ref_exp": "The giraffe extends its neck toward the woman offering food.",
|
392 |
+
"caption": "The image shows two giraffes engaging in distinct actions. \n\n1. The closest giraffe extends its neck toward a woman holding a cluster of leafy greens. Its head is tilted forward, and its tongue is reaching out, likely to grasp the food being offered.\n\n2. A second giraffe is standing slightly behind, facing away from the camera. It is standing upright, with its legs straight and its posture relaxed, while observing the interaction between the first giraffe and the woman.\n\n3. The woman, holding a child, is extending her arm, presenting the greens in an upward motion towards the giraffe.\n\n4. A child in her arms is looking toward the giraffe, with a curious expression as the animal approaches. \n\nThe scene captures the direct interaction and feeding behavior of giraffes in response to human presence.",
|
393 |
+
"cat_name": "giraffe",
|
394 |
+
"file_name": "00265",
|
395 |
+
"isValid": true
|
396 |
+
},
|
397 |
+
"3": {
|
398 |
+
"ref_exp": "The giraffe extends its neck toward a woman offering food.",
|
399 |
+
"caption": "The image shows two giraffes engaging in distinct actions. \n\n1. The closest giraffe extends its neck toward a woman holding a cluster of leafy greens. Its head is tilted forward, and its tongue is reaching out, likely to grasp the food being offered.\n\n2. A second giraffe is standing slightly behind, facing away from the camera. It is standing upright, with its legs straight and its posture relaxed, while observing the interaction between the first giraffe and the woman.\n\n3. The woman, holding a child, is extending her arm, presenting the greens in an upward motion towards the giraffe.\n\n4. A child in her arms is looking toward the giraffe, with a curious expression as the animal approaches. \n\nThe scene captures the direct interaction and feeding behavior of giraffes in response to human presence.",
|
400 |
+
"cat_name": "giraffe",
|
401 |
+
"file_name": "00265",
|
402 |
+
"isValid": true
|
403 |
+
}
|
404 |
+
},
|
405 |
+
"00275": {
|
406 |
+
"1": {
|
407 |
+
"ref_exp": "The giraffe reaches forward to grasp food offered by a person.",
|
408 |
+
"caption": "In the image, one giraffe extends its neck forward, reaching toward a person holding out a piece of food. Its long tongue emerges to grasp the food offered. Another giraffe stands nearby, with its head turned slightly, observing the interaction. The first giraffe's movements are distinct as it leans in to eat, while the second giraffe remains stationary, seemingly uninterested in the immediate food. The person extends their arm, actively presenting the food to the giraffe.",
|
409 |
+
"cat_name": "giraffe",
|
410 |
+
"file_name": "00275",
|
411 |
+
"isValid": true
|
412 |
+
},
|
413 |
+
"2": {
|
414 |
+
"ref_exp": "The giraffe extends its neck forward to grasp the food offered.",
|
415 |
+
"caption": "In the image, one giraffe extends its neck forward, reaching toward a person holding out a piece of food. Its long tongue emerges to grasp the food offered. Another giraffe stands nearby, with its head turned slightly, observing the interaction. The first giraffe's movements are distinct as it leans in to eat, while the second giraffe remains stationary, seemingly uninterested in the immediate food. The person extends their arm, actively presenting the food to the giraffe.",
|
416 |
+
"cat_name": "giraffe",
|
417 |
+
"file_name": "00275",
|
418 |
+
"isValid": true
|
419 |
+
},
|
420 |
+
"3": {
|
421 |
+
"ref_exp": "The giraffe extends its neck to grasp food from a person.",
|
422 |
+
"caption": "In the image, one giraffe extends its neck forward, reaching toward a person holding out a piece of food. Its long tongue emerges to grasp the food offered. Another giraffe stands nearby, with its head turned slightly, observing the interaction. The first giraffe's movements are distinct as it leans in to eat, while the second giraffe remains stationary, seemingly uninterested in the immediate food. The person extends their arm, actively presenting the food to the giraffe.",
|
423 |
+
"cat_name": "giraffe",
|
424 |
+
"file_name": "00275",
|
425 |
+
"isValid": true
|
426 |
+
}
|
427 |
+
}
|
428 |
+
},
|
429 |
+
"9877af5063": {
|
430 |
+
"00040": {
|
431 |
+
"1": {
|
432 |
+
"ref_exp": "The sheep extends its mouth toward the stump.",
|
433 |
+
"caption": "In the image, the black sheep is approached closely to a wooden stump. \n\n1. The black sheep stands next to the stump, leaning its head forward.\n2. The sheep extends its mouth toward the stump, making contact with the surface.\n3. The sheep tilts its head slightly, pulling back after an interaction with the stump.\n4. In the background, another sheep is seen grazing, bending its neck to eat grass.\n\nThese actions highlight the sheep's engagement with its immediate environment, particularly the wooden stump.",
|
434 |
+
"cat_name": "sheep",
|
435 |
+
"file_name": "00040",
|
436 |
+
"isValid": false
|
437 |
+
},
|
438 |
+
"2": {
|
439 |
+
"ref_exp": "The sheep extends its mouth toward the stump.",
|
440 |
+
"caption": "In the image, the black sheep is approached closely to a wooden stump. \n\n1. The black sheep stands next to the stump, leaning its head forward.\n2. The sheep extends its mouth toward the stump, making contact with the surface.\n3. The sheep tilts its head slightly, pulling back after an interaction with the stump.\n4. In the background, another sheep is seen grazing, bending its neck to eat grass.\n\nThese actions highlight the sheep's engagement with its immediate environment, particularly the wooden stump.",
|
441 |
+
"cat_name": "sheep",
|
442 |
+
"file_name": "00040",
|
443 |
+
"isValid": true
|
444 |
+
}
|
445 |
+
},
|
446 |
+
"00055": null,
|
447 |
+
"00090": {
|
448 |
+
"1": {
|
449 |
+
"ref_exp": "The sheep is inspecting a broken tree stump.",
|
450 |
+
"caption": "The image features several sheep engaged in various actions:\n\n1. One black lamb stands near a broken tree stump, inspecting it closely.\n2. The lamb appears to nibble at the exposed wood, using its mouth to pull fibers.\n3. Another sheep, in the background, grazes on the grass, using its head to brush against the ground.\n4. A third sheep is seen moving its head side to side, possibly looking for additional grazing spots. \n5. Occasionally, the black lamb shifts its weight, adjusting its stance while still interacting with the stump. \n\nThese actions are clear and distinct within the scene.",
|
451 |
+
"cat_name": "sheep",
|
452 |
+
"file_name": "00090",
|
453 |
+
"isValid": true
|
454 |
+
},
|
455 |
+
"2": {
|
456 |
+
"ref_exp": "The sheep is inspecting and nibbling at the broken tree stump.",
|
457 |
+
"caption": "The image features several sheep engaged in various actions:\n\n1. One black lamb stands near a broken tree stump, inspecting it closely.\n2. The lamb appears to nibble at the exposed wood, using its mouth to pull fibers.\n3. Another sheep, in the background, grazes on the grass, using its head to brush against the ground.\n4. A third sheep is seen moving its head side to side, possibly looking for additional grazing spots. \n5. Occasionally, the black lamb shifts its weight, adjusting its stance while still interacting with the stump. \n\nThese actions are clear and distinct within the scene.",
|
458 |
+
"cat_name": "sheep",
|
459 |
+
"file_name": "00090",
|
460 |
+
"isValid": true
|
461 |
+
}
|
462 |
+
},
|
463 |
+
"00155": null
|
464 |
+
}
|
465 |
+
}
|
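As seen above, mbench/result.json nests video id -> frame id -> expression id -> annotation fields, with some frames stored as null and some expression slots left empty. A minimal sketch of collecting only the entries marked isValid (the file path and field names come from the JSON above; everything else here is illustrative):

import json

with open("mbench/result.json") as f:
    result = json.load(f)

valid_refs = []  # (video, frame, obj_id, ref_exp, cat_name) tuples
for video_id, frames in result.items():
    for frame_id, objects in frames.items():
        if objects is None:                 # some frames are null, e.g. "00055"
            continue
        for obj_id, ann in objects.items():
            if ann and ann.get("isValid"):  # skip empty slots like "3": {}
                valid_refs.append((video_id, frame_id, obj_id,
                                   ann["ref_exp"], ann["cat_name"]))

print(len(valid_refs))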
models/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""
from .referformer import build


def build_model(args):
    return build(args)
models/backbone.py
ADDED
@@ -0,0 +1,132 @@
"""
Backbone modules.
Modified from DETR (https://github.com/facebookresearch/detr)
"""
from collections import OrderedDict

import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision.models._utils import IntermediateLayerGetter
from typing import Dict, List
from einops import rearrange

from util.misc import NestedTensor, is_main_process

from .position_encoding import build_position_encoding


class FrozenBatchNorm2d(torch.nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    Copy-paste from torchvision.misc.ops with added eps before rsqrt,
    without which any other models than torchvision.models.resnet[18,34,50,101]
    produce nans.
    """

    def __init__(self, n):
        super(FrozenBatchNorm2d, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        num_batches_tracked_key = prefix + 'num_batches_tracked'
        if num_batches_tracked_key in state_dict:
            del state_dict[num_batches_tracked_key]

        super(FrozenBatchNorm2d, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict,
            missing_keys, unexpected_keys, error_msgs)

    def forward(self, x):
        # move reshapes to the beginning
        # to make it fuser-friendly
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        rv = self.running_var.reshape(1, -1, 1, 1)
        rm = self.running_mean.reshape(1, -1, 1, 1)
        eps = 1e-5
        scale = w * (rv + eps).rsqrt()
        bias = b - rm * scale
        return x * scale + bias


class BackboneBase(nn.Module):

    def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool):
        super().__init__()
        for name, parameter in backbone.named_parameters():
            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.requires_grad_(False)
        if return_interm_layers:
            return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"}
            # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"}  # deformable detr
            self.strides = [4, 8, 16, 32]
            self.num_channels = [256, 512, 1024, 2048]
        else:
            return_layers = {'layer4': "0"}
            self.strides = [32]
            self.num_channels = [2048]
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)

    def forward(self, tensor_list: NestedTensor):
        xs = self.body(tensor_list.tensors)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            assert m is not None
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            out[name] = NestedTensor(x, mask)
        return out


class Backbone(BackboneBase):
    """ResNet backbone with frozen BatchNorm."""
    def __init__(self, name: str,
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
        backbone = getattr(torchvision.models, name)(
            replace_stride_with_dilation=[False, False, dilation],
            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d)
        assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded"
        super().__init__(backbone, train_backbone, return_interm_layers)
        if dilation:
            self.strides[-1] = self.strides[-1] // 2


class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)
        self.strides = backbone.strides
        self.num_channels = backbone.num_channels

    def forward(self, tensor_list: NestedTensor):
        # fold the time dimension into the batch dimension before the CNN
        tensor_list.tensors = rearrange(tensor_list.tensors, 'b t c h w -> (b t) c h w')
        tensor_list.mask = rearrange(tensor_list.mask, 'b t h w -> (b t) h w')

        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))
        return out, pos


def build_backbone(args):
    position_embedding = build_position_encoding(args)
    train_backbone = args.lr_backbone > 0
    # original line was truncated ("args.num"); assuming the usual Deformable-DETR-style condition
    return_interm_layers = args.masks or (args.num_feature_levels > 1)
    backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model
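FrozenBatchNorm2d above applies only a fixed per-channel affine transform built from its buffers, so no statistics are updated at train time. A minimal sketch (the import path assumes this repository layout; shapes are illustrative):

import torch
from models.backbone import FrozenBatchNorm2d

bn = FrozenBatchNorm2d(8)     # weight/bias/running stats are buffers, never trained
x = torch.randn(2, 8, 16, 16)
y = bn(x)                     # y = x * scale + (bias - running_mean * scale), scale = w / sqrt(var + eps)
print(y.shape)                # torch.Size([2, 8, 16, 16])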
models/criterion.py
ADDED
@@ -0,0 +1,208 @@
"""
import torch
import torch.nn.functional as F
from torch import nn

from util import box_ops
from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
                       accuracy, get_world_size, interpolate,
                       is_dist_avail_and_initialized, inverse_sigmoid)

from .segmentation import (dice_loss, sigmoid_focal_loss)

from einops import rearrange


class SetCriterion(nn.Module):
    """ This class computes the loss for ReferFormer.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
    """
    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses, focal_alpha=0.25):
        """ Create the criterion.
        Parameters:
            num_classes: number of object categories, omitting the special no-object category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            eos_coef: relative classification weight applied to the no-object category
            losses: list of all the losses to be applied. See get_loss for list of available losses.
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.eos_coef = eos_coef
        self.losses = losses
        empty_weight = torch.ones(self.num_classes + 1)
        empty_weight[-1] = self.eos_coef
        self.register_buffer('empty_weight', empty_weight)
        self.focal_alpha = focal_alpha
        self.mask_out_stride = 4

    def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
        """Classification loss (NLL)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
        """
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']
        _, nf, nq = src_logits.shape[:3]
        src_logits = rearrange(src_logits, 'b t q k -> b (t q) k')

        # judge the valid frames
        valid_indices = []
        valids = [target['valid'] for target in targets]
        for valid, (indice_i, indice_j) in zip(valids, indices):
            valid_ind = valid.nonzero().flatten()
            valid_i = valid_ind * nq + indice_i
            valid_j = valid_ind + indice_j * nf
            valid_indices.append((valid_i, valid_j))

        idx = self._get_src_permutation_idx(valid_indices)  # NOTE: use valid indices
        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, valid_indices)])
        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
                                    dtype=torch.int64, device=src_logits.device)
        if self.num_classes == 1:  # binary referred
            target_classes[idx] = 0
        else:
            target_classes[idx] = target_classes_o

        target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
                                            dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)

        target_classes_onehot = target_classes_onehot[:, :, :-1]
        loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1]
        losses = {'loss_ce': loss_ce}

        if log:
            # TODO this should probably be a separate loss, not hacked in this one here
            pass
        return losses

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
        targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
        The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        src_boxes = outputs['pred_boxes']
        bs, nf, nq = src_boxes.shape[:3]
        src_boxes = src_boxes.transpose(1, 2)

        idx = self._get_src_permutation_idx(indices)
        src_boxes = src_boxes[idx]
        src_boxes = src_boxes.flatten(0, 1)  # [b*t, 4]

        target_boxes = torch.cat([t['boxes'] for t in targets], dim=0)  # [b*t, 4]

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
            box_ops.box_cxcywh_to_xyxy(src_boxes),
            box_ops.box_cxcywh_to_xyxy(target_boxes)))
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses

    def loss_masks(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the masks: the focal loss and the dice loss.
        targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w]
        """
        assert "pred_masks" in outputs

        src_idx = self._get_src_permutation_idx(indices)
        # tgt_idx = self._get_tgt_permutation_idx(indices)

        src_masks = outputs["pred_masks"]
        src_masks = src_masks.transpose(1, 2)

        # TODO use valid to mask invalid areas due to padding in loss
        target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets],
                                                             size_divisibility=32, split=False).decompose()
        target_masks = target_masks.to(src_masks)

        # downsample ground truth masks with ratio mask_out_stride
        start = int(self.mask_out_stride // 2)
        im_h, im_w = target_masks.shape[-2:]

        target_masks = target_masks[:, :, start::self.mask_out_stride, start::self.mask_out_stride]
        assert target_masks.size(2) * self.mask_out_stride == im_h
        assert target_masks.size(3) * self.mask_out_stride == im_w

        src_masks = src_masks[src_idx]
        # upsample predictions to the target size
        # src_masks = interpolate(src_masks, size=target_masks.shape[-2:], mode="bilinear", align_corners=False)
        src_masks = src_masks.flatten(1)  # [b, thw]

        target_masks = target_masks.flatten(1)  # [b, thw]

        losses = {
            "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes),
            "loss_dice": dice_loss(src_masks, target_masks, num_boxes),
        }
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        loss_map = {
            'labels': self.loss_labels,
            'boxes': self.loss_boxes,
            'masks': self.loss_masks
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    def forward(self, outputs, targets):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depend on the losses applied, see each loss' doc
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}
        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes across all nodes, for normalization purposes
        target_valid = torch.stack([t["valid"] for t in targets], dim=0).reshape(-1)  # [B, T] -> [B*T]
        num_boxes = target_valid.sum().item()
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_boxes)
        num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for i, aux_outputs in enumerate(outputs['aux_outputs']):
                indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    kwargs = {}
                    if loss == 'labels':
                        # Logging is enabled only for the last layer
                        kwargs = {'log': False}
                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs)
                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                    losses.update(l_dict)

        return losses
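loss_masks above delegates to dice_loss and sigmoid_focal_loss from .segmentation, which is not part of this diff. Assuming they follow the standard DETR implementations, dice_loss reduces to roughly the following sketch (the function name here is illustrative, not the repository's API):

import torch

def dice_loss_sketch(inputs, targets, num_boxes):
    # inputs: raw mask logits [N, ...]; targets: binary masks of the same shape
    inputs = inputs.sigmoid().flatten(1)
    targets = targets.flatten(1)
    numerator = 2 * (inputs * targets).sum(1)
    denominator = inputs.sum(-1) + targets.sum(-1)
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss.sum() / num_boxes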
models/deformable_transformer.py
ADDED
@@ -0,0 +1,444 @@
"""
# ------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

import copy
from typing import Optional, List
import math

import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_

from util.misc import inverse_sigmoid
from models.ops.modules import MSDeformAttn

from einops import rearrange


class DeformableTransformer(nn.Module):
    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu", return_intermediate_dec=False,
                 num_feature_levels=4, dec_n_points=4, enc_n_points=4,
                 two_stage=False, two_stage_num_proposals=300):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        self.dropout = dropout
        self.two_stage = two_stage
        self.two_stage_num_proposals = two_stage_num_proposals
        self.num_feature_level = num_feature_levels

        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels,
                                                          nhead, enc_n_points)
        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)

        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,
                                                          dropout, activation,
                                                          num_feature_levels,
                                                          nhead, dec_n_points)
        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        if two_stage:
            self.enc_output = nn.Linear(d_model, d_model)
            self.enc_output_norm = nn.LayerNorm(d_model)
            self.pos_trans = nn.Linear(d_model * 2, d_model * 2)
            self.pos_trans_norm = nn.LayerNorm(d_model * 2)
        else:
            self.reference_points = nn.Linear(d_model, 2)  # reference point here (x, y)

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformAttn):
                m._reset_parameters()
        if not self.two_stage:
            xavier_uniform_(self.reference_points.weight.data, gain=1.0)
            constant_(self.reference_points.bias.data, 0.)
        normal_(self.level_embed)

    def get_proposal_pos_embed(self, proposals):
        num_pos_feats = 128
        temperature = 10000
        scale = 2 * math.pi

        dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device)
        dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
        # N, L, 4
        proposals = proposals.sigmoid() * scale
        # N, L, 4, 128
        pos = proposals[:, :, :, None] / dim_t
        # N, L, 4, 64, 2
        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
        return pos

    def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shapes):
        N_, S_, C_ = memory.shape
        base_scale = 4.0
        proposals = []
        _cur = 0
        for lvl, (H_, W_) in enumerate(spatial_shapes):
            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view(N_, H_, W_, 1)
            valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
            valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)

            grid_y, grid_x = torch.meshgrid(torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device),
                                            torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device))
            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)

            scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2)
            grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale
            wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl)
            proposal = torch.cat((grid, wh), -1).view(N_, -1, 4)
            proposals.append(proposal)
            _cur += (H_ * W_)
        output_proposals = torch.cat(proposals, 1)
        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
        output_proposals = torch.log(output_proposals / (1 - output_proposals))
        output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf'))
        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float('inf'))

        output_memory = memory
        output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0))
        output_memory = output_memory.masked_fill(~output_proposals_valid, float(0))
        output_memory = self.enc_output_norm(self.enc_output(output_memory))
        return output_memory, output_proposals

    def get_valid_ratio(self, mask):
        _, H, W = mask.shape
        valid_H = torch.sum(~mask[:, :, 0], 1)
        valid_W = torch.sum(~mask[:, 0, :], 1)
        valid_ratio_h = valid_H.float() / H
        valid_ratio_w = valid_W.float() / W
        valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
        return valid_ratio

    def forward(self, srcs, tgt, masks, pos_embeds, query_embed=None):
        assert self.two_stage or query_embed is not None
        """
        srcs (list[Tensor]): list of tensors num_layers x [batch_size*time, c, hi, wi], input of encoder
        tgt (Tensor): [batch_size, time, c, num_queries_per_frame]
        masks (list[Tensor]): list of tensors num_layers x [batch_size*time, hi, wi], the mask of srcs
        pos_embeds (list[Tensor]): list of tensors num_layers x [batch_size*time, c, hi, wi], position encoding of srcs
        query_embed (Tensor): [num_queries, c]
        """
        # prepare input for encoder
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            bs, c, h, w = src.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)

            src = src.flatten(2).transpose(1, 2)  # [batch_size, hi*wi, c]
            mask = mask.flatten(1)  # [batch_size, hi*wi]
            pos_embed = pos_embed.flatten(2).transpose(1, 2)  # [batch_size, hi*wi, c]
            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)

            lvl_pos_embed_flatten.append(lvl_pos_embed)
            src_flatten.append(src)
            mask_flatten.append(mask)

        # For a clip, concat all the features, first fpn layer size, then frame size
        src_flatten = torch.cat(src_flatten, 1)  # [bs*t, \sigma(hi*wi), c]
        mask_flatten = torch.cat(mask_flatten, 1)  # [bs*t, \sigma(hi*wi)]
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device)
        level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)

        # encoder
        # memory: [bs*t, \sigma(hi*wi), c]
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten)

        # prepare input for decoder
        bs, _, c = memory.shape
        if self.two_stage:
            output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes)

            # hack implementation for two-stage Deformable DETR
            enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory)
            enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals

            topk = self.two_stage_num_proposals
            topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
            topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
            topk_coords_unact = topk_coords_unact.detach()
            reference_points = topk_coords_unact.sigmoid()
            init_reference_out = reference_points
            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
            query_embed, tgt = torch.split(pos_trans_out, c, dim=2)
        else:
            b, t, q, c = tgt.shape
            tgt = rearrange(tgt, 'b t q c -> (b t) q c')
            query_embed = query_embed.unsqueeze(0).expand(b*t, -1, -1)  # [batch_size*time, num_queries_per_frame, c]
            reference_points = self.reference_points(query_embed).sigmoid()  # [batch_size*time, num_queries_per_frame, 2]
            init_reference_out = reference_points

        # decoder
        hs, inter_references, inter_samples = self.decoder(tgt, reference_points, memory,
                                                           spatial_shapes, level_start_index, valid_ratios, query_embed, mask_flatten)

        inter_references_out = inter_references

        # convert memory to fpn format
        memory_features = []  # 8x -> 32x
        spatial_index = 0
        for lvl in range(self.num_feature_level - 1):
            h, w = spatial_shapes[lvl]
            # [bs*t, c, h, w]
            memory_lvl = memory[:, spatial_index : spatial_index + h * w, :].reshape(bs, h, w, c).permute(0, 3, 1, 2).contiguous()
            memory_features.append(memory_lvl)
            spatial_index += h * w

        if self.two_stage:
            return hs, memory_features, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact, inter_samples
        # hs: [l, batch_size*time, num_queries_per_frame, c], where l is number of decoder layers
        # init_reference_out: [batch_size*time, num_queries_per_frame, 2]
        # inter_references_out: [l, batch_size*time, num_queries_per_frame, 4]
        # memory: [batch_size*time, \sigma(hi*wi), c]
        # memory_features: list[Tensor]

        return hs, memory_features, init_reference_out, inter_references_out, None, None, inter_samples


class DeformableTransformerEncoderLayer(nn.Module):
    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4):
        super().__init__()

        # self attention
        self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, src):
        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))
        src = src + self.dropout3(src2)
        src = self.norm2(src)
        return src

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
        # self attention
        src2, sampling_locations, attention_weights = self.self_attn(self.with_pos_embed(src, pos), reference_points,
                                                                     src, spatial_shapes, level_start_index, padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        # ffn
        src = self.forward_ffn(src)

        return src


class DeformableTransformerEncoder(nn.Module):
    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        reference_points_list = []
        for lvl, (H_, W_) in enumerate(spatial_shapes):

            ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device),
                                          torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device))
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_)
            ref = torch.stack((ref_x, ref_y), -1)
            reference_points_list.append(ref)
        reference_points = torch.cat(reference_points_list, 1)
        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
        return reference_points

    def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None):
        output = src
        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device)
        for _, layer in enumerate(self.layers):
            output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask)

        return output


class DeformableTransformerDecoderLayer(nn.Module):
    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4):
        super().__init__()

        # cross attention
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # ffn
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        return tensor if pos is None else tensor + pos

    def forward_ffn(self, tgt):
        tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout4(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward(self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None):
        # self attention
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # cross attention
        tgt2, sampling_locations, attention_weights = self.cross_attn(self.with_pos_embed(tgt, query_pos),
                                                                      reference_points,
                                                                      src, src_spatial_shapes, level_start_index, src_padding_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)

        # ffn
        tgt = self.forward_ffn(tgt)

        return tgt, sampling_locations, attention_weights


class DeformableTransformerDecoder(nn.Module):
    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate
        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
        self.bbox_embed = None
        self.class_embed = None

    def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios,
                query_pos=None, src_padding_mask=None):
        # modified here to also return the information of the sampled points
        output = tgt

        intermediate = []
        intermediate_reference_points = []
        intermediate_samples = []  # sample points
        for lid, layer in enumerate(self.layers):
            if reference_points.shape[-1] == 4:
                reference_points_input = reference_points[:, :, None] \
                                         * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None]
            else:
                assert reference_points.shape[-1] == 2
                reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None]
            output, sampling_locations, attention_weights = layer(output, query_pos, reference_points_input,
                                                                  src, src_spatial_shapes, src_level_start_index, src_padding_mask)

            # sampling_locations: [N, Len_q, self.n_heads, self.n_levels, self.n_points, 2],
            # i.e. [B, Q, n_head, n_level(num_feature_level*num_frames), n_points, 2]
            # attention_weights: [B, Q, n_head, n_level(num_feature_level*num_frames), n_points]
            # src_valid_ratios: [N, self.n_levels, 2]
            N, Len_q = sampling_locations.shape[:2]
            sampling_locations = sampling_locations / src_valid_ratios[:, None, None, :, None, :]
            weights_flat = attention_weights.view(N, Len_q, -1)  # [B, Q, n_head * n_level * n_points]
            samples_flat = sampling_locations.view(N, Len_q, -1, 2)  # [B, Q, n_head * n_level * n_points, 2]
            top_weights, top_idx = weights_flat.topk(30, dim=2)  # [B, Q, 30], [B, Q, 30]
            weights_keep = torch.gather(weights_flat, 2, top_idx)  # [B, Q, 30]
            samples_keep = torch.gather(samples_flat, 2, top_idx.unsqueeze(-1).repeat(1, 1, 1, 2))  # [B, Q, 30, 2]

            # hack implementation for iterative bounding box refinement
            if self.bbox_embed is not None:
                tmp = self.bbox_embed[lid](output)
                if reference_points.shape[-1] == 4:
                    new_reference_points = tmp + inverse_sigmoid(reference_points)
                    new_reference_points = new_reference_points.sigmoid()
                else:
                    assert reference_points.shape[-1] == 2
                    new_reference_points = tmp
                    new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
                    new_reference_points = new_reference_points.sigmoid()
                reference_points = new_reference_points.detach()

            if self.return_intermediate:
                intermediate.append(output)
                intermediate_reference_points.append(reference_points)
                intermediate_samples.append(samples_keep)

        if self.return_intermediate:
            return torch.stack(intermediate), torch.stack(intermediate_reference_points), torch.stack(intermediate_samples)

        return output, reference_points, samples_keep


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")


def build_deforamble_transformer(args):
    return DeformableTransformer(
        d_model=args.hidden_dim,
        nhead=args.nheads,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation="relu",
        return_intermediate_dec=True,
        num_feature_levels=args.num_feature_levels,
        dec_n_points=args.dec_n_points,
        enc_n_points=args.enc_n_points,
        two_stage=args.two_stage,
        two_stage_num_proposals=args.num_queries)
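The encoder and decoder address the flattened multi-scale memory through spatial_shapes and level_start_index, computed exactly as in DeformableTransformer.forward above. A small standalone illustration with made-up feature-map sizes:

import torch

spatial_shapes = torch.as_tensor([(64, 64), (32, 32), (16, 16), (8, 8)])  # (h, w) per level
level_start_index = torch.cat(
    (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
print(level_start_index)  # tensor([   0, 4096, 5120, 5376]) -> offset where each level starts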
models/matcher.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Instance Sequence Matching
|
3 |
+
Modified from DETR (https://github.com/facebookresearch/detr)
|
4 |
+
"""
|
5 |
+
import torch
|
6 |
+
from scipy.optimize import linear_sum_assignment
|
7 |
+
from torch import nn
|
8 |
+
import torch.nn.functional as F
|
9 |
+
|
10 |
+
from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou, multi_iou
|
11 |
+
from util.misc import nested_tensor_from_tensor_list
|
12 |
+
|
13 |
+
INF = 100000000
|
14 |
+
|
15 |
+
def dice_coef(inputs, targets):
|
16 |
+
inputs = inputs.sigmoid()
|
17 |
+
inputs = inputs.flatten(1).unsqueeze(1) # [N, 1, THW]
|
18 |
+
targets = targets.flatten(1).unsqueeze(0) # [1, M, THW]
|
19 |
+
numerator = 2 * (inputs * targets).sum(2)
|
20 |
+
denominator = inputs.sum(-1) + targets.sum(-1)
|
21 |
+
|
22 |
+
# NOTE coef doesn't be subtracted to 1 as it is not necessary for computing costs
|
23 |
+
coef = (numerator + 1) / (denominator + 1)
|
24 |
+
return coef
|
25 |
+
|
26 |
+
def sigmoid_focal_coef(inputs, targets, alpha: float = 0.25, gamma: float = 2):
|
27 |
+
N, M = len(inputs), len(targets)
|
28 |
+
inputs = inputs.flatten(1).unsqueeze(1).expand(-1, M, -1) # [N, M, THW]
|
29 |
+
targets = targets.flatten(1).unsqueeze(0).expand(N, -1, -1) # [N, M, THW]
|
30 |
+
|
31 |
+
prob = inputs.sigmoid()
|
32 |
+
ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
|
33 |
+
p_t = prob * targets + (1 - prob) * (1 - targets)
|
34 |
+
coef = ce_loss * ((1 - p_t) ** gamma)
|
35 |
+
|
36 |
+
if alpha >= 0:
|
37 |
+
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
|
38 |
+
coef = alpha_t * coef
|
39 |
+
|
40 |
+
return coef.mean(2) # [N, M]
|
41 |
+
|
42 |
+
|
43 |
+
class HungarianMatcher(nn.Module):
|
44 |
+
"""This class computes an assignment between the targets and the predictions of the network
|
45 |
+
|
46 |
+
For efficiency reasons, the targets don't include the no_object. Because of this, in general,
|
47 |
+
there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
|
48 |
+
while the others are un-matched (and thus treated as non-objects).
|
49 |
+
"""
|
50 |
+
|
51 |
+
def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1,
|
52 |
+
cost_mask: float = 1, cost_dice: float = 1, num_classes: int = 1):
|
53 |
+
"""Creates the matcher
|
54 |
+
|
55 |
+
Params:
|
56 |
+
cost_class: This is the relative weight of the classification error in the matching cost
|
57 |
+
cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
|
58 |
+
cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
|
59 |
+
cost_mask: This is the relative weight of the sigmoid focal loss of the mask in the matching cost
|
60 |
+
cost_dice: This is the relative weight of the dice loss of the mask in the matching cost
|
61 |
+
"""
|
62 |
+
super().__init__()
|
63 |
+
self.cost_class = cost_class
|
64 |
+
self.cost_bbox = cost_bbox
|
65 |
+
self.cost_giou = cost_giou
|
66 |
+
self.cost_mask = cost_mask
|
67 |
+
self.cost_dice = cost_dice
|
68 |
+
self.num_classes = num_classes
|
69 |
+
assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0 \
|
70 |
+
or cost_mask != 0 or cost_dice != 0, "all costs cant be 0"
|
71 |
+
self.mask_out_stride = 4
|
72 |
+
|
73 |
+
@torch.no_grad()
|
74 |
+
    def forward(self, outputs, targets):
        """ Performs the matching
        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries_per_frame, num_frames, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries_per_frame, num_frames, 4] with the predicted box coordinates
                 "pred_masks": Tensor of dim [batch_size, num_queries_per_frame, num_frames, h, w], h,w in 4x size
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                NOTE: Since every frame has one object at most
                 "labels": Tensor of dim [num_frames] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_frames, 4] containing the target box coordinates
                 "masks": Tensor of dim [num_frames, h, w], h,w in origin size
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        src_logits = outputs["pred_logits"]
        src_boxes = outputs["pred_boxes"]
        src_masks = outputs["pred_masks"]

        bs, nf, nq, h, w = src_masks.shape

        # handle mask padding issue
        target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets],
                                                             size_divisibility=32,
                                                             split=False).decompose()
        target_masks = target_masks.to(src_masks)  # [B, T, H, W]

        # downsample ground truth masks with ratio mask_out_stride
        start = int(self.mask_out_stride // 2)
        im_h, im_w = target_masks.shape[-2:]

        target_masks = target_masks[:, :, start::self.mask_out_stride, start::self.mask_out_stride]
        assert target_masks.size(2) * self.mask_out_stride == im_h
        assert target_masks.size(3) * self.mask_out_stride == im_w

        indices = []
        for i in range(bs):
            out_prob = src_logits[i].sigmoid()
            out_bbox = src_boxes[i]
            out_mask = src_masks[i]

            tgt_ids = targets[i]["labels"]
            tgt_bbox = targets[i]["boxes"]
            tgt_mask = target_masks[i]
            tgt_valid = targets[i]["valid"]

            # class cost
            # we average the cost over valid frames
            cost_class = []
            for t in range(nf):
                if tgt_valid[t] == 0:
                    continue

                out_prob_split = out_prob[t]
                tgt_ids_split = tgt_ids[t].unsqueeze(0)

                # Compute the classification cost.
                alpha = 0.25
                gamma = 2.0
                neg_cost_class = (1 - alpha) * (out_prob_split ** gamma) * (-(1 - out_prob_split + 1e-8).log())
                pos_cost_class = alpha * ((1 - out_prob_split) ** gamma) * (-(out_prob_split + 1e-8).log())
                if self.num_classes == 1:  # binary referred
                    cost_class_split = pos_cost_class[:, [0]] - neg_cost_class[:, [0]]
                else:
                    cost_class_split = pos_cost_class[:, tgt_ids_split] - neg_cost_class[:, tgt_ids_split]

                cost_class.append(cost_class_split)
            cost_class = torch.stack(cost_class, dim=0).mean(0)  # [q, 1]

            # box cost
            # we average the cost over every frame
            cost_bbox, cost_giou = [], []
            for t in range(nf):
                out_bbox_split = out_bbox[t]
                tgt_bbox_split = tgt_bbox[t].unsqueeze(0)

                # Compute the L1 cost between boxes
                cost_bbox_split = torch.cdist(out_bbox_split, tgt_bbox_split, p=1)

                # Compute the giou cost between boxes
                cost_giou_split = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox_split),
                                                       box_cxcywh_to_xyxy(tgt_bbox_split))

                cost_bbox.append(cost_bbox_split)
                cost_giou.append(cost_giou_split)
            cost_bbox = torch.stack(cost_bbox, dim=0).mean(0)
            cost_giou = torch.stack(cost_giou, dim=0).mean(0)

            # mask cost
            # Compute the focal loss between masks
            cost_mask = sigmoid_focal_coef(out_mask.transpose(0, 1), tgt_mask.unsqueeze(0))

            # Compute the dice loss between masks
            cost_dice = -dice_coef(out_mask.transpose(0, 1), tgt_mask.unsqueeze(0))

            # Final cost matrix
            C = self.cost_class * cost_class + self.cost_bbox * cost_bbox + self.cost_giou * cost_giou + \
                self.cost_mask * cost_mask + self.cost_dice * cost_dice  # [q, 1]

            # Only has one tgt, MinCost Matcher
            _, src_ind = torch.min(C, dim=0)
            tgt_ind = torch.arange(1).to(src_ind)
            indices.append((src_ind.long(), tgt_ind.long()))

        # list[tuple], length is batch_size
        return indices


def build_matcher(args):
    if args.binary:
        num_classes = 1
    else:
        if args.dataset_file == 'ytvos':
            num_classes = 65
        elif args.dataset_file == 'davis':
            num_classes = 78
        elif args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb':
            num_classes = 1
        else:
            num_classes = 91  # for coco
    return HungarianMatcher(cost_class=args.set_cost_class,
                            cost_bbox=args.set_cost_bbox,
                            cost_giou=args.set_cost_giou,
                            cost_mask=args.set_cost_mask,
                            cost_dice=args.set_cost_dice,
                            num_classes=num_classes)
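For reference, a minimal sketch of how this matcher could be exercised in isolation, assuming the repository is on PYTHONPATH; the tensor layout ([batch, frames, queries, ...]) follows the indexing in forward() above, and the cost weights below are illustrative placeholders, not the repository's training configuration.

import torch
from models.matcher import HungarianMatcher  # assumes this repo is importable

B, T, Q = 1, 5, 5          # batch size, frames per clip, queries per frame
H, W = 96, 96              # mask logits at stride 4; ground truth at 4*H x 4*W

matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2,
                           cost_mask=2, cost_dice=5, num_classes=1)

outputs = {
    "pred_logits": torch.randn(B, T, Q, 1),     # binary "referred" logits
    "pred_boxes":  torch.rand(B, T, Q, 4),      # normalized cxcywh boxes
    "pred_masks":  torch.randn(B, T, Q, H, W),  # mask logits at 1/4 resolution
}
targets = [{
    "labels": torch.zeros(T, dtype=torch.long),
    "boxes":  torch.rand(T, 4),
    "masks":  (torch.rand(T, 4 * H, 4 * W) > 0.5).float(),  # original-size GT masks
    "valid":  torch.ones(T, dtype=torch.long),
}]

indices = matcher(outputs, targets)  # one (src_ind, tgt_ind) pair per batch element
print(indices[0])                    # the single best query matched to the one target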
models/ops/make.sh
ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

python setup.py build install
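make.sh compiles and installs the multi-scale deformable attention CUDA extension; it is normally run from inside models/ops (e.g. "sh make.sh") in a CUDA-enabled PyTorch environment. One quick way to check that the build succeeded is to import the compiled extension; the module name below follows the upstream Deformable DETR setup.py and is an assumption here, not something shown in this diff.

import torch
import MultiScaleDeformableAttention as MSDA  # name assumed from upstream Deformable DETR

print("CUDA available:", torch.cuda.is_available())
print("extension loaded from:", MSDA.__file__)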
models/ops/modules/__init__.py
ADDED
@@ -0,0 +1,9 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .ms_deform_attn import MSDeformAttn
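This package init only re-exports MSDeformAttn. As a rough usage sketch, the module can be instantiated once the extension above has been built; the constructor arguments shown are the upstream Deformable DETR defaults and are assumed rather than taken from this diff.

import torch
from models.ops.modules import MSDeformAttn

# d_model / n_levels / n_heads / n_points follow the upstream defaults (assumed).
attn = MSDeformAttn(d_model=256, n_levels=4, n_heads=8, n_points=4)
print(attn)
# The forward pass additionally expects flattened multi-scale features,
# per-level spatial shapes, level start indices, and reference points.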