{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "\n", "hardpos_path = os.path.join('/data2/projects/VRIS/llama3', 'verb_ext_text_example_refzom.json')\n", "with open(hardpos_path, 'r', encoding='utf-8') as f:\n", " hardpos_json = json.load(f)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "57624\n", "loading dataset ref-zom into memory...\n", "loading dataset split final\n", "creating index...\n", "index created.\n", "DONE (t=11.10s)\n" ] } ], "source": [ "print(len(hardpos_json.keys()))\n", "\n", "from refer.refer_zom import ZREFER\n", "refer = ZREFER('/data2/dataset/COCO2014/', 'ref-zom', 'final')\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2327 {'2327': []}\n", "{'sent_ids': [2327], 'file_name': 'COCO_train2014_000000318556.jpg', 'ann_id': [], 'ref_id': 2327, 'image_id': 318556, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['Cooking', 'table', 'in', 'background'], 'raw': 'Cooking table in background', 'sent_id': 2327, 'sent': 'Cooking table in background'}]}\n", "2328 {'2328': []}\n", "{'sent_ids': [2328], 'file_name': 'COCO_train2014_000000116100.jpg', 'ann_id': [], 'ref_id': 2328, 'image_id': 116100, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['An', 'elephant', 'that', 'has', \"it's\", 'trunk', 'and', 'all', 'four', 'feet', 'in', 'the', 'water.'], 'raw': \"An elephant that has it's trunk and all four feet in the water.\", 'sent_id': 2328, 'sent': \"An elephant that has it's trunk and all four feet in the water.\"}]}\n", "2329 {'2329': ['carrying plates of pizza']}\n", "{'sent_ids': [2329], 'file_name': 'COCO_train2014_000000538480.jpg', 'ann_id': [], 'ref_id': 2329, 'image_id': 538480, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['Man', 'in', 'a', 'black', 'shirt', 'carrying', 'plates', 'of', 'pizza.'], 'raw': 'Man in a black shirt carrying plates of pizza.', 'sent_id': 2329, 'sent': 'Man in a black shirt carrying plates of pizza.'}]}\n", "2330 {'2330': ['holding']}\n", "{'sent_ids': [2330], 'file_name': 'COCO_train2014_000000476220.jpg', 'ann_id': [], 'ref_id': 2330, 'image_id': 476220, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['The', 'stuffed', 'pig', 'that', 'the', 'blond', 'boy', 'is', 'holding'], 'raw': 'The stuffed pig that the blond boy is holding', 'sent_id': 2330, 'sent': 'The stuffed pig that the blond boy is holding'}]}\n", "2331 {'2331': []}\n", "{'sent_ids': [2331], 'file_name': 'COCO_train2014_000000299675.jpg', 'ann_id': [], 'ref_id': 2331, 'image_id': 299675, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['The', 'MacBook', 'Pro', 'box.'], 'raw': 'The MacBook Pro box.', 'sent_id': 2331, 'sent': 'The MacBook Pro box.'}]}\n", "2332 {'2332': []}\n", "{'sent_ids': [2332], 'file_name': 'COCO_train2014_000000032275.jpg', 'ann_id': [], 'ref_id': 2332, 'image_id': 32275, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['A', 'purple', 'brick', 'building', 'with', 'a', 'black', 'and', 'blue', 'parking', 'meter.', ''], 'raw': 'A purple brick building with a black and blue parking meter. ', 'sent_id': 2332, 'sent': 'A purple brick building with a black and blue parking meter. '}]}\n", "2333 {'2333': ['being wrapped around']}\n", "{'sent_ids': [2333], 'file_name': 'COCO_train2014_000000025470.jpg', 'ann_id': [], 'ref_id': 2333, 'image_id': 25470, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['The', 'lighter', 'colored', 'giraffe', 'whose', 'neck', 'is', 'wrapped', 'around', 'the', 'other', 'giraffes'], 'raw': 'The lighter colored giraffe whose neck is wrapped around the other giraffes', 'sent_id': 2333, 'sent': 'The lighter colored giraffe whose neck is wrapped around the other giraffes'}]}\n", "2334 {'2334': ['reaching for a frisbee']}\n", "{'sent_ids': [2334], 'file_name': 'COCO_train2014_000000513461.jpg', 'ann_id': [], 'ref_id': 2334, 'image_id': 513461, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['A', 'man', 'wearing', 'white', 'and', 'blue', 'shorts,', 'reaching', 'for', 'a', 'frisbee.'], 'raw': 'A man wearing white and blue shorts, reaching for a frisbee.', 'sent_id': 2334, 'sent': 'A man wearing white and blue shorts, reaching for a frisbee.'}]}\n", "2335 {'2335': []}\n", "{'sent_ids': [2335], 'file_name': 'COCO_train2014_000000285579.jpg', 'ann_id': [], 'ref_id': 2335, 'image_id': 285579, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['A', 'motorbike', 'occupied', 'by', 'two', 'men', 'dressed', 'like', 'teddy', 'bear.'], 'raw': 'A motorbike occupied by two men dressed like teddy bear.', 'sent_id': 2335, 'sent': 'A motorbike occupied by two men dressed like teddy bear.'}]}\n", "2336 {'2336': []}\n", "{'sent_ids': [2336], 'file_name': 'COCO_train2014_000000266366.jpg', 'ann_id': [], 'ref_id': 2336, 'image_id': 266366, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['left', 'portion', 'of', 'sandwich', 'closest', 'to', 'pickle'], 'raw': 'left portion of sandwich closest to pickle', 'sent_id': 2336, 'sent': 'left portion of sandwich closest to pickle'}]}\n", "2337 {'2337': ['leaning over']}\n", "{'sent_ids': [2337], 'file_name': 'COCO_train2014_000000321194.jpg', 'ann_id': [], 'ref_id': 2337, 'image_id': 321194, 'split': 'train', 'source': 'zero', 'sentences': [{'tokens': ['A', 'man', 'in', 'white', 'leaning', 'over.'], 'raw': 'A man in white leaning over.', 'sent_id': 2337, 'sent': 'A man in white leaning over.'}]}\n" ] } ], "source": [ "for idx, key in enumerate(hardpos_json) :\n", " print(key, hardpos_json[key])\n", " print(refer.Refs[int(key)])\n", " \n", " if idx == 10 :\n", " break" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "57624\n" ] } ], "source": [ "ref_ids = refer.getRefIds(split='train')\n", "print(len(ref_ids))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_values([['standing next to', 'being held'], ['standing in front']])\n" ] } ], "source": [ "pos_sents = hardpos_json['9914'].values()\n", "print(pos_sents)\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_values([['standing next to', 'being held'], ['standing in front']])\n", "[['standing next to', 'being held'], ['standing in front']]\n", "['standing next to', 'being held']\n", "[101, 100, 100, 102]\n" ] } ], "source": [ "from bert.tokenization_bert import BertTokenizer\n", "import random\n", "pos_sents = hardpos_json['9914'].values()\n", "print(pos_sents)\n", "pos_sents = [s for s in pos_sents if s is not None]\n", "print(pos_sents)\n", "pos_sent_picked = random.choice(list(pos_sents))\n", "print(pos_sent_picked)\n", "\n", "\n", "attention_mask = [0] * 20\n", "padded_input_ids = [0] * 20\n", "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", "\n", "input_ids = tokenizer.encode(text=pos_sent_picked, add_special_tokens=True)\n", "input_ids = input_ids[:20]\n", "print(input_ids)\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers\n", " warnings.warn(f\"Importing from {__name__} is deprecated, please import via timm.layers\", FutureWarning)\n" ] } ], "source": [ "import datetime\n", "import argparse\n", "import os\n", "import time\n", "\n", "import torch\n", "import torch.utils.data\n", "from torch import nn\n", "\n", "from functools import reduce\n", "import operator\n", "from bert.modeling_bert import BertModel\n", "import torchvision\n", "from lib import segmentation\n", "\n", "import transforms as T\n", "import utils\n", "import numpy as np\n", "\n", "import torch.nn.functional as F\n", "\n", "import gc\n", "from collections import OrderedDict\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Image size: 480\n" ] } ], "source": [ "# python -m torch.distributed.launch \\\n", "# --nproc_per_node 4 \\\n", "# --master_port ${LOCALHOST} \\\n", "# train.py \\\n", "# --model lavt_one \\\n", "# --dataset refcocog \\\n", "# --splitBy umd \\\n", "# --model_id gref_umd \\\n", "# --batch-size 8 \\\n", "# --lr 0.00005 \\\n", "# --wd 1e-2 \\\n", "# --output-dir ./models/gref_umd/lavt_test_lr \\\n", "# --swin_type base \\\n", "# --pretrained_swin_weights ./pretrained_weights/swin_base_patch4_window12_384_22k.pth \\\n", "# --epochs 40 \\\n", "# --img_size 480 2>&1 | tee ./models/gref_umd/lavt_test_lr\n", "import argparse\n", "from utils import init_distributed_mode\n", "\n", "def get_parser():\n", " parser = argparse.ArgumentParser(description=\"Model Training Configuration\")\n", " parser.add_argument('--batch_size', default=8, type=int)\n", " parser.add_argument('--output_dir', default='./models/gref_umd/lavt_test_dset', type=str)\n", " parser.add_argument('--pretrained_swin_weights', default='./pretrained_weights/swin_base_patch4_window12_384_22k.pth', type=str)\n", " parser.add_argument('--dataset', default='refcocog', type=str)\n", " parser.add_argument('--splitBy', default='umd', type=str)\n", " parser.add_argument('--model', default='lavt_one', type=str)\n", "\n", " parser.add_argument('--amsgrad', action='store_true',\n", " help='if true, set amsgrad to True in an Adam or AdamW optimizer.')\n", " parser.add_argument('-b', '--batch-size', default=8, type=int)\n", " parser.add_argument('--bert_tokenizer', default='bert-base-uncased', help='BERT tokenizer')\n", " parser.add_argument('--ck_bert', default='bert-base-uncased', help='pre-trained BERT weights')\n", " #parser.add_argument('--dataset', default='refcoco', help='refcoco, refcoco+, or refcocog')\n", " parser.add_argument('--ddp_trained_weights', action='store_true',\n", " help='Only needs specified when testing,'\n", " 'whether the weights to be loaded are from a DDP-trained model')\n", " parser.add_argument('--device', default='cuda:0', help='device') # only used when testing on a single machine\n", " parser.add_argument('--epochs', default=40, type=int, metavar='N', help='number of total epochs to run')\n", " parser.add_argument('--fusion_drop', default=0.0, type=float, help='dropout rate for PWAMs')\n", " parser.add_argument('--img_size', default=480, type=int, help='input image size')\n", " parser.add_argument(\"--local_rank\", type=int, help='local rank for DistributedDataParallel')\n", " parser.add_argument('--lr', default=0.00005, type=float, help='the initial learning rate')\n", " parser.add_argument('--mha', default='', help='If specified, should be in the format of a-b-c-d, e.g., 4-4-4-4,'\n", " 'where a, b, c, and d refer to the numbers of heads in stage-1,'\n", " 'stage-2, stage-3, and stage-4 PWAMs')\n", " #parser.add_argument('--model', default='lavt', help='model: lavt, lavt_one')\n", " parser.add_argument('--model_id', default='lavt', help='name to identify the model')\n", " parser.add_argument('--output-dir', default='./checkpoints/', help='path where to save checkpoint weights')\n", " parser.add_argument('--pin_mem', action='store_true',\n", " help='If true, pin memory when using the data loader.')\n", " parser.add_argument('--print-freq', default=10, type=int, help='print frequency')\n", " parser.add_argument('--refer_data_root', default='./refer/data/', help='REFER dataset root directory')\n", " parser.add_argument('--resume', default='', help='resume from checkpoint')\n", " parser.add_argument('--split', default='test', help='only used when testing')\n", " #parser.add_argument('--splitBy', default='unc', help='change to umd or google when the dataset is G-Ref (RefCOCOg)')\n", " parser.add_argument('--swin_type', default='base',\n", " help='tiny, small, base, or large variants of the Swin Transformer')\n", " parser.add_argument('--wd', '--weight-decay', default=1e-2, type=float, metavar='W', help='weight decay',\n", " dest='weight_decay')\n", " parser.add_argument('--window12', action='store_true',\n", " help='only needs specified when testing,'\n", " 'when training, window size is inferred from pre-trained weights file name'\n", " '(containing \\'window12\\'). Initialize Swin with window size 12 instead of the default 7.')\n", " parser.add_argument('-j', '--workers', default=8, type=int, metavar='N', help='number of data loading workers')\n", "\n", " parser.add_argument('--metric_learning', default=True, type=bool, help='whether to use metric learning')\n", " parser.add_argument('--exclude_multiobj', default=True, type=bool, help='whether to exclude multi-object images')\n", " parser.add_argument('--metric_mode', default='both', type=str, help='both : add hp and hn')\n", " parser.add_argument('--hn_prob', default=0.5, type=float, help='negative sample prob')\n", " \n", " return parser\n", "\n", "parser = get_parser()\n", "args = parser.parse_args([])\n", "print('Image size: {}'.format(str(args.img_size)))" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "import json\n", "import torch.utils.data as data\n", "import torch\n", "from torchvision import transforms\n", "from torch.autograd import Variable\n", "import numpy as np\n", "from PIL import Image\n", "import torchvision.transforms.functional as TF\n", "import random\n", "\n", "from bert.tokenization_bert import BertTokenizer\n", "\n", "import h5py\n", "from refer.refer import REFER\n", "\n", "from args import get_parser\n", "\n", "# Dataset configuration initialization\n", "# parser = get_parser()\n", "# args = parser.parse_args()\n", "\n", "\n", "class ReferDataset(data.Dataset):\n", "\n", " def __init__(self,\n", " args,\n", " image_transforms=None,\n", " target_transforms=None,\n", " split='train',\n", " eval_mode=False):\n", "\n", " self.classes = []\n", " self.image_transforms = image_transforms\n", " self.target_transform = target_transforms\n", " self.split = split\n", " self.refer = REFER(args.refer_data_root, args.dataset, args.splitBy)\n", "\n", " self.max_tokens = 20\n", "\n", " ref_ids = self.refer.getRefIds(split=self.split)\n", " img_ids = self.refer.getImgIds(ref_ids)\n", "\n", " all_imgs = self.refer.Imgs\n", " self.imgs = list(all_imgs[i] for i in img_ids)\n", " self.ref_ids = ref_ids\n", "\n", " self.input_ids = []\n", " self.attention_masks = []\n", " self.tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)\n", "\n", " # for metric learning\n", " self.ROOT = '/data2/projects/seunghoon/VerbRIS/VerbCentric_CY/datasets/VRIS'\n", " self.metric_learning = args.metric_learning\n", " self.exclude_multiobj = args.exclude_multiobj\n", " self.metric_mode = args.metric_mode\n", " self.exclude_position = False\n", "\n", " if self.metric_learning:\n", " self.hardneg_prob = args.hn_prob \n", " self.multi_obj_ref_ids = self._load_multi_obj_ref_ids()\n", " self.hardpos_meta, self.hardneg_meta = self._load_metadata()\n", " else:\n", " self.hardneg_prob = 0.0\n", " self.multi_obj_ref_ids = None\n", " self.hardpos_meta, self.hardneg_meta = None, None\n", "\n", "\n", " self.eval_mode = eval_mode\n", " # if we are testing on a dataset, test all sentences of an object;\n", " # o/w, we are validating during training, randomly sample one sentence for efficiency\n", " for r in ref_ids:\n", " ref = self.refer.Refs[r]\n", "\n", " sentences_for_ref = []\n", " attentions_for_ref = []\n", "\n", " for i, (el, sent_id) in enumerate(zip(ref['sentences'], ref['sent_ids'])):\n", " sentence_raw = el['raw']\n", " attention_mask = [0] * self.max_tokens\n", " padded_input_ids = [0] * self.max_tokens\n", "\n", " input_ids = self.tokenizer.encode(text=sentence_raw, add_special_tokens=True)\n", "\n", " # truncation of tokens\n", " input_ids = input_ids[:self.max_tokens]\n", "\n", " padded_input_ids[:len(input_ids)] = input_ids\n", " attention_mask[:len(input_ids)] = [1]*len(input_ids)\n", "\n", " sentences_for_ref.append(torch.tensor(padded_input_ids).unsqueeze(0))\n", " attentions_for_ref.append(torch.tensor(attention_mask).unsqueeze(0))\n", "\n", " self.input_ids.append(sentences_for_ref)\n", " self.attention_masks.append(attentions_for_ref)\n", "\n", "\n", " def _tokenize(self, sentence):\n", " attention_mask = [0] * self.max_tokens\n", " padded_input_ids = [0] * self.max_tokens\n", "\n", " input_ids = self.tokenizer.encode(text=sentence, add_special_tokens=True)\n", " # truncation of tokens\n", " input_ids = input_ids[:self.max_tokens]\n", " padded_input_ids[:len(input_ids)] = input_ids\n", " attention_mask[:len(input_ids)] = [1]*len(input_ids)\n", "\n", " return torch.tensor(padded_input_ids), torch.tensor(attention_mask)\n", " \n", " def _plot(self, img, target):\n", " import matplotlib.pyplot as plt\n", "\n", " # If img is a PyTorch tensor, convert it to a NumPy array and adjust shape\n", " if isinstance(img, torch.Tensor):\n", " img = img.cpu().numpy()\n", " if img.shape[0] == 3: # Shape is (channels, height, width)\n", " img = img.transpose(1, 2, 0) # Now shape is (height, width, channels)\n", "\n", " # Ensure target is a NumPy array\n", " if isinstance(target, torch.Tensor):\n", " target = target.cpu().numpy()\n", " if target.ndim == 3 and target.shape[0] == 1: # Shape is (1, height, width)\n", " target = target.squeeze(0) # Now shape is (height, width)\n", "\n", " plt.imshow(img)\n", " plt.imshow(target, alpha=0.5)\n", " plt.show()\n", "\n", "\n", " def _load_multi_obj_ref_ids(self):\n", " # Load multi-object reference IDs based on configurations\n", " if not self.exclude_multiobj and not self.exclude_position :\n", " return None\n", " elif self.exclude_position:\n", " multiobj_path = os.path.join(self.ROOT, 'multiobj_ov2_nopos.txt')\n", " elif self.exclude_multiobj :\n", " multiobj_path = os.path.join(self.ROOT, 'multiobj_ov3.txt')\n", " with open(multiobj_path, 'r') as f:\n", " return [int(line.strip()) for line in f.readlines()]\n", "\n", " def _load_metadata(self):\n", " # Load metadata for hard positive verb phrases, hard negative queries\n", " if 'op2' in self.metric_mode :\n", " hardpos_path = os.path.join(self.ROOT, 'hardpos_verbphrase_op2_1024upd.json') \n", " else :\n", " hardpos_path = os.path.join(self.ROOT, 'hardpos_verbphrase_0906upd.json')\n", " # do not use hardneg_path\n", " hardneg_path = os.path.join(self.ROOT, 'hardneg_verb.json')\n", "\n", " with open(hardpos_path, 'r', encoding='utf-8') as f:\n", " hardpos_json = json.load(f)\n", " if \"hardpos_only\" in self.metric_mode :\n", " hardneg_json = None\n", " else : \n", " with open(hardneg_path, 'r', encoding='utf-8') as q:\n", " hardneg_json = json.load(q)\n", " return hardpos_json, hardneg_json\n", "\n", " def get_classes(self):\n", " return self.classes\n", "\n", " def __len__(self):\n", " return len(self.ref_ids)\n", "\n", " def __getitem__(self, index):\n", " this_ref_id = self.ref_ids[index]\n", " this_img_id = self.refer.getImgIds(this_ref_id)\n", " this_img = self.refer.Imgs[this_img_id[0]]\n", "\n", " img = Image.open(os.path.join(self.refer.IMAGE_DIR, this_img['file_name'])).convert(\"RGB\")\n", "\n", " ref = self.refer.loadRefs(this_ref_id)\n", " #print(ref)\n", "\n", " ref_mask = np.array(self.refer.getMask(ref[0])['mask'])\n", " annot = np.zeros(ref_mask.shape)\n", " annot[ref_mask == 1] = 1\n", "\n", " annot = Image.fromarray(annot.astype(np.uint8), mode=\"P\")\n", "\n", " if self.image_transforms is not None:\n", " # resize, from PIL to tensor, and mean and std normalization\n", " img, target = self.image_transforms(img, annot)\n", "\n", " pos_sent = None\n", " neg_sent = None\n", " pos_attn_mask = None\n", " neg_attn_mask = None\n", " choice_sent = None\n", "\n", " if self.eval_mode:\n", " embedding = []\n", " att = []\n", " for s in range(len(self.input_ids[index])):\n", " e = self.input_ids[index][s]\n", " a = self.attention_masks[index][s]\n", " embedding.append(e.unsqueeze(-1))\n", " att.append(a.unsqueeze(-1))\n", "\n", " tensor_embeddings = torch.cat(embedding, dim=-1)\n", " attention_mask = torch.cat(att, dim=-1)\n", " else: # train phase\n", " choice_sent = np.random.choice(len(self.input_ids[index]))\n", " tensor_embeddings = self.input_ids[index][choice_sent]\n", " attention_mask = self.attention_masks[index][choice_sent]\n", "\n", " # print(\"object id: \", this_ref_id)\n", " # print(\"sentence ids: \", self.input_ids[index])\n", " # for i in range(len(self.input_ids[index])):\n", " # print(\"object sentences: \", self.tokenizer.decode(self.input_ids[index][i].squeeze(0).tolist()))\n", " # # plot selected refid\n", " # self._plot(img, target)\n", "\n", " pos_sent, neg_sent = None, None\n", " pos_attn_mask, neg_attn_mask = None, None\n", " pos_mask = [[1, ]] # (GT, pos) 초기화\n", " neg_mask = [[0, ]] # (GT, neg) 초기화\n", "\n", " if self.metric_learning:\n", " if self.metric_mode in ['hardpos_only', 'hardpos_only_rev'] or self.hardneg_prob == 0.0:\n", " pos_sent_dict = self.hardpos_meta.get(str(this_ref_id), {})\n", " pos_sents = []\n", " for sent_list in pos_sent_dict.values():\n", " pos_sents.extend(sent_list)\n", " if pos_sents:\n", " pos_sent = random.choice(pos_sents)\n", " pos_sent, pos_attn_mask = self._tokenize(pos_sent)\n", " else:\n", " if random.random() < self.hardneg_prob:\n", " neg_sent_dict = self.hardneg_meta.get(str(this_ref_id), {})\n", " neg_sents = []\n", " for sent_list in neg_sent_dict.values():\n", " neg_sents.extend(sent_list)\n", " if neg_sents:\n", " neg_sent = random.choice(neg_sents)\n", " neg_sent, neg_attn_mask = self._tokenize(neg_sent)\n", " else:\n", " pos_sent_dict = self.hardpos_meta.get(str(this_ref_id), {})\n", " pos_sents = []\n", " for sent_list in pos_sent_dict.values():\n", " pos_sents.extend(sent_list)\n", " if pos_sents:\n", " pos_sent = random.choice(pos_sents)\n", " #print(\"original pos sentence: \", pos_sent)\n", " pos_sent, pos_attn_mask = self._tokenize(pos_sent)\n", " if pos_sent is None and len(self.input_ids[index]) > 1:\n", " to_select = list(range(len(self.input_ids[index])))\n", " to_select.remove(choice_sent)\n", " choice_sent = np.random.choice(to_select)\n", " pos_sent = self.input_ids[index][choice_sent]\n", " pos_attn_mask = self.attention_masks[index][choice_sent]\n", " #print(\"pos sent does not exist, use other sentence : \", self.tokenizer.decode(pos_sent.squeeze(0).tolist()))\n", "\n", " # concat tensors\n", " if img.dim() == 3:\n", " img = img.unsqueeze(0) # [1, C, H, W]\n", " if target.dim() == 2:\n", " target = target.unsqueeze(0) # [1, H, W]\n", " if tensor_embeddings.dim() == 1:\n", " tensor_embeddings = tensor_embeddings.unsqueeze(0) # [1, max_tokens]\n", " if attention_mask.dim() == 1:\n", " attention_mask = attention_mask.unsqueeze(0) # [1, max_tokens]\n", " if pos_sent is not None and pos_sent.dim() == 1:\n", " pos_sent = pos_sent.unsqueeze(0)\n", " if neg_sent is not None and neg_sent.dim() == 1:\n", " neg_sent = neg_sent.unsqueeze(0)\n", " if pos_attn_mask is not None and pos_attn_mask.dim() == 1:\n", " pos_attn_mask = pos_attn_mask.unsqueeze(0)\n", " if neg_attn_mask is not None and neg_attn_mask.dim() == 1:\n", " neg_attn_mask = neg_attn_mask.unsqueeze(0)\n", "\n", "\n", " # print(\"index: \", self.input_ids[index])\n", " # print(\"choice_sent: \", choice_sent)\n", " # print(\"tensor_embeddings: \", tensor_embeddings)\n", " # print(\"original sentence: \", self.tokenizer.decode(tensor_embeddings.squeeze(0).tolist()))\n", " # print(\"pos_sent: \", pos_sent)\n", " # print(\"neg_sent: \", neg_sent)\n", " # print(\"pos_attn_mask: \", pos_attn_mask)\n", " # print(\"neg_attn_mask: \", neg_attn_mask)\n", " # print(img.shape, target.shape, tensor_embeddings.shape, attention_mask.shape, pos_mask, neg_mask)\n", "\n", " if (pos_sent is not None) and (neg_sent is not None):\n", " img = torch.stack([img, img, img], dim=0)\n", " target = torch.stack([target, target, target], dim=0)\n", " tensor_embeddings = torch.stack([tensor_embeddings, pos_sent, neg_sent], dim=0)\n", " attention_mask = torch.stack([attention_mask, pos_attn_mask, neg_attn_mask], dim=0)\n", " pos_mask = [[1, 1, 0]]\n", " neg_mask = [[0, 0, 1]]\n", " elif (pos_sent is not None and not neg_sent) or (neg_sent is not None and not pos_sent):\n", " img = torch.stack([img, img], dim=0)\n", " target = torch.stack([target, target], dim=0)\n", " tensor_embeddings = torch.stack([tensor_embeddings, pos_sent], dim=0) if (pos_sent is not None) \\\n", " else torch.stack([tensor_embeddings, neg_sent], dim=0)\n", " attention_mask = torch.stack([attention_mask, pos_attn_mask], dim=0) if (pos_attn_mask is not None) \\\n", " else torch.stack([attention_mask, neg_attn_mask], dim=0)\n", " pos_mask = [[1, int(pos_sent is not None)]]\n", " neg_mask = [[0, int(neg_sent is not None)]]\n", " else:\n", " pass\n", " return img, target, tensor_embeddings, attention_mask, pos_mask, neg_mask\n" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "def get_dataset(image_set, transform, args):\n", " #from data.dataset_refer_bert import ReferDataset\n", " ds = ReferDataset(args,\n", " split=image_set,\n", " image_transforms=transform,\n", " target_transforms=None\n", " )\n", " num_classes = 2\n", "\n", " return ds, num_classes\n", "\n", "def get_transform(args):\n", " transforms = [T.Resize(args.img_size, args.img_size),\n", " T.ToTensor(),\n", " T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", " ]\n", "\n", " return T.Compose(transforms)\n", "\n", "\n", "def criterion(input, target):\n", " weight = torch.FloatTensor([0.9, 1.1]).cuda()\n", " return nn.functional.cross_entropy(input, target, weight=weight)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "loading dataset refcocog into memory...\n", "Split by umd!\n", "creating index...\n", "index created.\n", "DONE (t=6.64s)\n" ] } ], "source": [ "dataset, num_classes = get_dataset(\"train\",\n", " get_transform(args=args),\n", " args=args)\n", "train_sampler = torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=1, rank=0,\n", " shuffle=True)\n" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([3, 1, 20])\n", "\n", "\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([3, 1, 20])torch.Size([2, 1, 20])\n", "\n", "\n", "\n", "\n", "torch.Size([1, 20])torch.Size([3, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "\n", "\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([1, 20])torch.Size([2, 1, 20])torch.Size([1, 20])\n", "torch.Size([2, 1, 20])\n", "\n", "\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([3, 1, 20])torch.Size([1, 20])\n", "\n", "torch.Size([3, 1, 20])\n", "\n", "\n", "\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "\n", "\n", "\n", "torch.Size([2, 1, 20])torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n", "\n", "\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n", "\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n" ] }, { "ename": "TypeError", "evalue": "Caught TypeError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py\", line 302, in _worker_loop\n data = fetcher.fetch(index)\n File \"/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py\", line 52, in fetch\n return self.collate_fn(data)\n File \"/tmp/ipykernel_2235050/518736739.py\", line 10, in custom_collate\n tensor_embeddings = torch.cat(*tensor_embeddings, dim=0)\nTypeError: cat() received an invalid combination of arguments - got (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, dim=int), but expected one of:\n * (tuple of Tensors tensors, int dim, *, Tensor out)\n * (tuple of Tensors tensors, name dim, *, Tensor out)\n\n", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[118], line 36\u001b[0m\n\u001b[1;32m 30\u001b[0m data_loader \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mdata\u001b[38;5;241m.\u001b[39mDataLoader(\n\u001b[1;32m 31\u001b[0m dataset, batch_size\u001b[38;5;241m=\u001b[39margs\u001b[38;5;241m.\u001b[39mbatch_size,\n\u001b[1;32m 32\u001b[0m sampler\u001b[38;5;241m=\u001b[39mtrain_sampler, num_workers\u001b[38;5;241m=\u001b[39margs\u001b[38;5;241m.\u001b[39mworkers, \n\u001b[1;32m 33\u001b[0m collate_fn\u001b[38;5;241m=\u001b[39mcustom_collate, pin_memory\u001b[38;5;241m=\u001b[39margs\u001b[38;5;241m.\u001b[39mpin_mem, drop_last\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 35\u001b[0m \u001b[38;5;66;03m# single sample from dataloader\u001b[39;00m\n\u001b[0;32m---> 36\u001b[0m img, target, tensor_embeddings, attention_mask, pos_mask, neg_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43miter\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdata_loader\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(img\u001b[38;5;241m.\u001b[39mshape, target\u001b[38;5;241m.\u001b[39mshape, tensor_embeddings\u001b[38;5;241m.\u001b[39mshape, attention_mask\u001b[38;5;241m.\u001b[39mshape, pos_mask, neg_mask)\n", "File \u001b[0;32m~/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/dataloader.py:652\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampler_iter \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 650\u001b[0m \u001b[38;5;66;03m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reset() \u001b[38;5;66;03m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 652\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_next_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 654\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_kind \u001b[38;5;241m==\u001b[39m _DatasetKind\u001b[38;5;241m.\u001b[39mIterable \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 655\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \\\n\u001b[1;32m 656\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_yielded \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_IterableDataset_len_called:\n", "File \u001b[0;32m~/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/dataloader.py:1347\u001b[0m, in \u001b[0;36m_MultiProcessingDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1345\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1346\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_task_info[idx]\n\u001b[0;32m-> 1347\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/dataloader.py:1373\u001b[0m, in \u001b[0;36m_MultiProcessingDataLoaderIter._process_data\u001b[0;34m(self, data)\u001b[0m\n\u001b[1;32m 1371\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_try_put_index()\n\u001b[1;32m 1372\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, ExceptionWrapper):\n\u001b[0;32m-> 1373\u001b[0m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreraise\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1374\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n", "File \u001b[0;32m~/.conda/envs/lavt/lib/python3.9/site-packages/torch/_utils.py:461\u001b[0m, in \u001b[0;36mExceptionWrapper.reraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 458\u001b[0m \u001b[38;5;66;03m# If the exception takes multiple arguments, don't try to\u001b[39;00m\n\u001b[1;32m 459\u001b[0m \u001b[38;5;66;03m# instantiate since we don't know how to\u001b[39;00m\n\u001b[1;32m 460\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m--> 461\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception\n", "\u001b[0;31mTypeError\u001b[0m: Caught TypeError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py\", line 302, in _worker_loop\n data = fetcher.fetch(index)\n File \"/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py\", line 52, in fetch\n return self.collate_fn(data)\n File \"/tmp/ipykernel_2235050/518736739.py\", line 10, in custom_collate\n tensor_embeddings = torch.cat(*tensor_embeddings, dim=0)\nTypeError: cat() received an invalid combination of arguments - got (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, dim=int), but expected one of:\n * (tuple of Tensors tensors, int dim, *, Tensor out)\n * (tuple of Tensors tensors, name dim, *, Tensor out)\n\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([3, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([3, 1, 20])\n", "\n", "torch.Size([3, 1, 20])torch.Size([3, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "\n", "torch.Size([3, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([3, 1, 20])torch.Size([3, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])torch.Size([2, 1, 20])\n", "\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([3, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([1, 20])\n", "torch.Size([2, 1, 20])\n", "torch.Size([3, 1, 20])\n" ] } ], "source": [ "from torch.nn.utils.rnn import pad_sequence\n", "\n", "def custom_collate(batch):\n", " imgs, targets, tensor_embeddings, attention_masks, pos_masks, neg_masks = zip(*batch)\n", " imgs = torch.cat([img for img in imgs], dim=0)\n", " targets = torch.cat([tgt for tgt in targets], dim=0)\n", " \n", " tensor_embeddings = torch.cat([t_e for t_e in tensor_embeddings], dim=0)\n", " attention_masks = torch.cat([a_m for a_m in attention_masks], dim=0)\n", "\n", " # Handle pos_masks\n", " if any(pos_mask is not None for pos_mask in pos_masks):\n", " pos_masks = [mask if mask is not None else torch.zeros_like(tensor_embeddings[0]) for mask in pos_masks]\n", " pos_masks = pad_sequence(pos_masks, batch_first=True, padding_value=0)\n", " else:\n", " pos_masks = None\n", "\n", " # Handle neg_masks\n", " if any(neg_mask is not None for neg_mask in neg_masks):\n", " neg_masks = [mask if mask is not None else torch.zeros_like(tensor_embeddings[0]) for mask in neg_masks]\n", " neg_masks = pad_sequence(neg_masks, batch_first=True, padding_value=0)\n", " else:\n", " neg_masks = None\n", "\n", " return imgs, targets, tensor_embeddings, attention_masks, pos_masks, neg_masks\n", "\n", "\n", "data_loader = torch.utils.data.DataLoader(\n", " dataset, batch_size=args.batch_size,\n", " sampler=train_sampler, num_workers=args.workers, \n", " collate_fn=custom_collate, pin_memory=args.pin_mem, drop_last=True)\n", "\n", "# single sample from dataloader\n", "img, target, tensor_embeddings, attention_mask, pos_mask, neg_mask = next(iter(data_loader))\n", "\n", "print(img.shape, target.shape, tensor_embeddings.shape, attention_mask.shape, pos_mask, neg_mask)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'image_id': 391435, 'split': 'train', 'sentences': [{'tokens': ['the', 'reflection', 'of', 'the', 'man', 'shaving'], 'raw': 'the reflection of the man shaving', 'sent_id': 13437, 'sent': 'the reflection of the man shaving'}, {'tokens': ['image', 'of', 'a', 'man', 'shaving', 'on', 'a', 'laptop', 'screen'], 'raw': 'image of a man shaving on a laptop screen', 'sent_id': 13438, 'sent': 'image of a man shaving on a laptop screen'}], 'file_name': 'COCO_train2014_000000391435_1709050.jpg', 'category_id': 1, 'ann_id': 1709050, 'sent_ids': [13437, 13438], 'ref_id': 45871}][{'image_id': 421848, 'split': 'train', 'sentences': [{'tokens': ['the', 'tallest', 'giraffe', 'among', 'the', 'two'], 'raw': 'The tallest giraffe among the two', 'sent_id': 82708, 'sent': 'the tallest giraffe among the two'}, {'tokens': ['the', 'tallest', 'of', 'two', 'giraffes'], 'raw': 'The tallest of two giraffes.', 'sent_id': 82709, 'sent': 'the tallest of two giraffes'}], 'file_name': 'COCO_train2014_000000421848_596471.jpg', 'category_id': 25, 'ann_id': 596471, 'sent_ids': [82708, 82709], 'ref_id': 36770}]\n", "[{'image_id': 13468, 'split': 'train', 'sentences': [{'tokens': ['a', 'sandwich', 'right', 'of', 'another'], 'raw': 'A sandwich right of another.', 'sent_id': 5866, 'sent': 'a sandwich right of another'}, {'tokens': ['sandwich', 'half', 'furthest', 'to', 'right'], 'raw': 'sandwich half furthest to right', 'sent_id': 5867, 'sent': 'sandwich half furthest to right'}], 'file_name': 'COCO_train2014_000000013468_310040.jpg', 'category_id': 54, 'ann_id': 310040, 'sent_ids': [5866, 5867], 'ref_id': 7280}][{'image_id': 181054, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'in', 'a', 'white', 'shirt', 'with', 'a', 'woman', 'buttoning', 'it', 'up'], 'raw': 'A man in a white shirt with a woman buttoning it up.', 'sent_id': 68075, 'sent': 'a man in a white shirt with a woman buttoning it up'}, {'tokens': ['a', 'man', 'in', 'a', 'white', 'shirt', 'looks', 'nervous', 'as', 'an', 'older', 'woman', 'buttons', 'him', 'up'], 'raw': 'A man in a white shirt looks nervous as an older woman buttons him up.', 'sent_id': 68076, 'sent': 'a man in a white shirt looks nervous as an older woman buttons him up'}], 'file_name': 'COCO_train2014_000000181054_484268.jpg', 'category_id': 1, 'ann_id': 484268, 'sent_ids': [68075, 68076], 'ref_id': 48236}]\n", "\n", "\n", "[{'image_id': 569919, 'split': 'train', 'sentences': [{'tokens': ['the', 'spoon', 'next', 'to', 'the', 'pizza'], 'raw': 'The spoon next to the pizza.', 'sent_id': 97107, 'sent': 'the spoon next to the pizza'}, {'tokens': ['a', 'metal', 'spoon', 'on', 'a', 'plate', 'on', 'a', 'table'], 'raw': 'A metal spoon on a plate on a table.', 'sent_id': 97108, 'sent': 'a metal spoon on a plate on a table'}], 'file_name': 'COCO_train2014_000000569919_703521.jpg', 'category_id': 50, 'ann_id': 703521, 'sent_ids': [97107, 97108], 'ref_id': 42368}][{'image_id': 129359, 'split': 'train', 'sentences': [{'tokens': ['a', 'white', 'dish', 'with', 'some', 'kind', 'of', 'sauce', 'in', 'it', 'along', 'with', 'a', 'silver', 'spoon'], 'raw': 'A white dish with some kind of sauce in it along with a silver spoon', 'sent_id': 97230, 'sent': 'a white dish with some kind of sauce in it along with a silver spoon'}, {'tokens': ['a', 'cup', 'of', 'food', 'with', 'a', 'spoon'], 'raw': 'A cup of food with a spoon.', 'sent_id': 97231, 'sent': 'a cup of food with a spoon'}], 'file_name': 'COCO_train2014_000000129359_1039869.jpg', 'category_id': 51, 'ann_id': 1039869, 'sent_ids': [97230, 97231], 'ref_id': 42420}][{'image_id': 2964, 'split': 'train', 'sentences': [{'tokens': ['bottle', 'of', '14', 'hands', 'wine'], 'raw': 'bottle of 14 Hands wine', 'sent_id': 44379, 'sent': 'bottle of 14 hands wine'}, {'tokens': ['a', 'bottle', 'of', 'wine', 'that', 'says', '14', 'hands', 'and', 'has', 'a', 'purple', 'horse', 'on', 'it'], 'raw': 'A bottle of wine that says 14 hands and has a purple horse on it.', 'sent_id': 44380, 'sent': 'a bottle of wine that says 14 hands and has a purple horse on it'}], 'file_name': 'COCO_train2014_000000002964_91245.jpg', 'category_id': 44, 'ann_id': 91245, 'sent_ids': [44379, 44380], 'ref_id': 22056}]\n", "\n", "\n", "[{'image_id': 330683, 'split': 'train', 'sentences': [{'tokens': ['a', 'black', 'cow', 'alongside', 'a', 'brown', 'cow'], 'raw': 'A black cow alongside a brown cow.', 'sent_id': 78006, 'sent': 'a black cow alongside a brown cow'}, {'tokens': ['a', 'black', 'cow', 'standing', 'between', 'another', 'black', 'cow', 'and', 'a', 'brown', 'cow'], 'raw': 'A black cow standing between another black cow and a brown cow', 'sent_id': 78007, 'sent': 'a black cow standing between another black cow and a brown cow'}], 'file_name': 'COCO_train2014_000000330683_76006.jpg', 'category_id': 21, 'ann_id': 76006, 'sent_ids': [78006, 78007], 'ref_id': 34980}]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n", "/home/seunghoon/.conda/envs/lavt/lib/python3.9/site-packages/torchvision/transforms/functional.py:417: UserWarning: Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. Please use InterpolationMode enum.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[{'image_id': 263823, 'split': 'train', 'sentences': [{'tokens': ['the', 'umpire', 'behind', 'the', 'plate'], 'raw': 'the umpire behind the plate', 'sent_id': 9335, 'sent': 'the umpire behind the plate'}, {'tokens': ['umpire', 'wearing', 'blue'], 'raw': 'umpire wearing blue', 'sent_id': 9336, 'sent': 'umpire wearing blue'}], 'file_name': 'COCO_train2014_000000263823_2160611.jpg', 'category_id': 1, 'ann_id': 2160611, 'sent_ids': [9335, 9336], 'ref_id': 8614}]\n", "[{'image_id': 170366, 'split': 'train', 'sentences': [{'tokens': ['the', 'boy', 'in', 'the', 'suit'], 'raw': 'The boy in the suit.', 'sent_id': 96474, 'sent': 'the boy in the suit'}, {'tokens': ['a', 'young', 'man', 'with', 'brown', 'hair', 'in', 'a', 'black', 'suit', ',', 'with', 'a', 'black', 'hat', 'with', 'sunglasses', 'resting', 'on', 'it'], 'raw': 'A young man with brown hair in a black suit, with a black hat with sunglasses resting on it', 'sent_id': 96475, 'sent': 'a young man with brown hair in a black suit , with a black hat with sunglasses resting on it'}], 'file_name': 'COCO_train2014_000000170366_484717.jpg', 'category_id': 1, 'ann_id': 484717, 'sent_ids': [96474, 96475], 'ref_id': 42104}][{'image_id': 181316, 'split': 'train', 'sentences': [{'tokens': ['the', 'racket', 'held', 'by', 'a', 'girl', 'wearing', 'dark', 'skirt'], 'raw': 'The racket held by a girl wearing dark skirt.', 'sent_id': 14001, 'sent': 'the racket held by a girl wearing dark skirt'}, {'tokens': ['a', 'racket', 'being', 'held', 'by', 'the', 'girl', 'in', 'the', 'black', 'skirt'], 'raw': 'A racket being held by the girl in the black skirt.', 'sent_id': 14002, 'sent': 'a racket being held by the girl in the black skirt'}], 'file_name': 'COCO_train2014_000000181316_655443.jpg', 'category_id': 43, 'ann_id': 655443, 'sent_ids': [14001, 14002], 'ref_id': 45890}]\n", "\n", "[{'image_id': 96723, 'split': 'train', 'sentences': [{'tokens': ['a', 'number', 'of', 'books', 'on', 'a', 'shelf'], 'raw': 'A number of books on a shelf.', 'sent_id': 35543, 'sent': 'a number of books on a shelf'}, {'tokens': ['a', 'bunch', 'of', 'books', 'on', 'a', 'shelf'], 'raw': 'A bunch of books on a shelf.', 'sent_id': 35544, 'sent': 'a bunch of books on a shelf'}], 'file_name': 'COCO_train2014_000000096723_1139765.jpg', 'category_id': 84, 'ann_id': 1139765, 'sent_ids': [35543, 35544], 'ref_id': 18668}]\n", "[{'image_id': 273951, 'split': 'train', 'sentences': [{'tokens': ['a', 'white', 'woman', 'skier', 'with', 'a', 'colorful', 'hat', 'sitting', 'between', 'two', 'men', 'skiers'], 'raw': 'A white woman skier with a colorful hat sitting between two men skiers.', 'sent_id': 34676, 'sent': 'a white woman skier with a colorful hat sitting between two men skiers'}, {'tokens': ['a', 'blonde', 'woman', 'in', 'red'], 'raw': 'A blonde woman in red', 'sent_id': 34677, 'sent': 'a blonde woman in red'}], 'file_name': 'COCO_train2014_000000273951_509586.jpg', 'category_id': 1, 'ann_id': 509586, 'sent_ids': [34676, 34677], 'ref_id': 18328}][{'image_id': 387527, 'split': 'train', 'sentences': [{'tokens': ['a', 'banana', 'to', 'the', 'far', 'left', 'of', 'the', 'fruit', 'bowl'], 'raw': 'A banana to the far left of the fruit bowl.', 'sent_id': 65272, 'sent': 'a banana to the far left of the fruit bowl'}, {'tokens': ['the', 'farthest', 'banana', 'away', 'from', 'the', 'camera'], 'raw': 'The farthest banana away from the camera.', 'sent_id': 65273, 'sent': 'the farthest banana away from the camera'}], 'file_name': 'COCO_train2014_000000387527_1043422.jpg', 'category_id': 52, 'ann_id': 1043422, 'sent_ids': [65272, 65273], 'ref_id': 30094}][{'image_id': 103510, 'split': 'train', 'sentences': [{'tokens': ['the', 'carrots'], 'raw': 'the carrots', 'sent_id': 63484, 'sent': 'the carrots'}, {'tokens': ['a', 'group', 'of', 'fresh', 'baby', 'carrots'], 'raw': 'A group of fresh baby carrots.', 'sent_id': 63485, 'sent': 'a group of fresh baby carrots'}], 'file_name': 'COCO_train2014_000000103510_1063832.jpg', 'category_id': 57, 'ann_id': 1063832, 'sent_ids': [63484, 63485], 'ref_id': 29390}]\n", "\n", "\n", "[{'image_id': 427633, 'split': 'train', 'sentences': [{'tokens': ['the', 'back', 'side', 'of', 'a', 'multi', 'light', 'traffic', 'light'], 'raw': 'The back side of a multi light traffic light.', 'sent_id': 52260, 'sent': 'the back side of a multi light traffic light'}, {'tokens': ['a', 'stoplight', 'faces', 'to', 'the', 'right'], 'raw': 'A stoplight faces to the right.', 'sent_id': 52261, 'sent': 'a stoplight faces to the right'}], 'file_name': 'COCO_train2014_000000427633_2172860.jpg', 'category_id': 10, 'ann_id': 2172860, 'sent_ids': [52260, 52261], 'ref_id': 25098}]\n", "[{'image_id': 350083, 'split': 'train', 'sentences': [{'tokens': ['giraffe', 'to', 'left', 'of', 'other'], 'raw': 'giraffe to left of other', 'sent_id': 33055, 'sent': 'giraffe to left of other'}, {'tokens': ['the', 'slightly', 'shorter', 'giraffe'], 'raw': 'the slightly shorter giraffe', 'sent_id': 33056, 'sent': 'the slightly shorter giraffe'}], 'file_name': 'COCO_train2014_000000350083_599600.jpg', 'category_id': 25, 'ann_id': 599600, 'sent_ids': [33055, 33056], 'ref_id': 46695}]\n", "[{'image_id': 287519, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'with', 'cream', '&', 'black', 'colored', 'shirt', 'and', 'short', 'trousers', 'walking', 'behind', 'a', 'police', 'man'], 'raw': 'a man with cream & black colored shirt and short trousers walking behind a police man', 'sent_id': 99833, 'sent': 'a man with cream & black colored shirt and short trousers walking behind a police man'}, {'tokens': ['a', 'guy', 'wearing', 'a', 'short', 'sleeve', 'shirt', 'with', 'an', 'interesting', 'design'], 'raw': 'a guy wearing a short sleeve shirt with an interesting design', 'sent_id': 99834, 'sent': 'a guy wearing a short sleeve shirt with an interesting design'}], 'file_name': 'COCO_train2014_000000287519_2206490.jpg', 'category_id': 1, 'ann_id': 2206490, 'sent_ids': [99833, 99834], 'ref_id': 43469}]\n", "[{'image_id': 54141, 'split': 'train', 'sentences': [{'tokens': ['a', 'speed', 'boat', 'on', 'a', 'lake'], 'raw': 'A speed boat on a lake.', 'sent_id': 67100, 'sent': 'a speed boat on a lake'}, {'tokens': ['a', 'speedboat', 'pulling', 'a', 'wake', 'boarder', 'behind', 'it'], 'raw': 'A speedboat pulling a wake boarder behind it', 'sent_id': 67101, 'sent': 'a speedboat pulling a wake boarder behind it'}], 'file_name': 'COCO_train2014_000000054141_180852.jpg', 'category_id': 9, 'ann_id': 180852, 'sent_ids': [67100, 67101], 'ref_id': 48196}]\n", "[{'image_id': 156296, 'split': 'train', 'sentences': [{'tokens': ['a', 'woman', 'in', 'rain', 'boots', 'trying', 'to', 'fix', 'her', 'umbrella'], 'raw': 'A woman in rain boots trying to fix her umbrella.', 'sent_id': 9312, 'sent': 'a woman in rain boots trying to fix her umbrella'}, {'tokens': ['the', 'woman', 'with', 'the', 'black', 'umbrella'], 'raw': 'The woman with the black umbrella.', 'sent_id': 9313, 'sent': 'the woman with the black umbrella'}], 'file_name': 'COCO_train2014_000000156296_518143.jpg', 'category_id': 1, 'ann_id': 518143, 'sent_ids': [9312, 9313], 'ref_id': 8608}][{'image_id': 402212, 'split': 'train', 'sentences': [{'tokens': ['a', 'colorful', 'toy', 'van', 'in', 'the', 'street'], 'raw': 'a colorful toy van in the street', 'sent_id': 14596, 'sent': 'a colorful toy van in the street'}, {'tokens': ['colorful', 'truck'], 'raw': 'Colorful truck.', 'sent_id': 14597, 'sent': 'colorful truck'}], 'file_name': 'COCO_train2014_000000402212_396551.jpg', 'category_id': 8, 'ann_id': 396551, 'sent_ids': [14596, 14597], 'ref_id': 45911}]\n", "[{'image_id': 521796, 'split': 'train', 'sentences': [{'tokens': ['shelf', 'in', 'camper'], 'raw': 'Shelf in camper.', 'sent_id': 70798, 'sent': 'shelf in camper'}], 'file_name': 'COCO_train2014_000000521796_1978969.jpg', 'category_id': 78, 'ann_id': 1978969, 'sent_ids': [70798], 'ref_id': 48359}]\n", "[{'image_id': 136953, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'wearing', 'a', 'striped', 'shirt', 'and', 'black', 'pants'], 'raw': 'A man wearing a striped shirt and black pants.', 'sent_id': 21168, 'sent': 'a man wearing a striped shirt and black pants'}, {'tokens': ['a', 'man', 'in', 'a', 'striped', 'shirt'], 'raw': 'A man in a striped shirt.', 'sent_id': 21169, 'sent': 'a man in a striped shirt'}], 'file_name': 'COCO_train2014_000000136953_530897.jpg', 'category_id': 1, 'ann_id': 530897, 'sent_ids': [21168, 21169], 'ref_id': 13173}]\n", "[{'image_id': 179209, 'split': 'train', 'sentences': [{'tokens': ['the', 'large', 'truck', 'in', 'the', 'very', 'back'], 'raw': 'the large truck in the very back', 'sent_id': 45782, 'sent': 'the large truck in the very back'}, {'tokens': ['large', 'multiwheeled', 'truck', 'behind', 'a', 'truck', 'full', 'of', 'bananas'], 'raw': 'Large multiwheeled truck behind a truck full of bananas.', 'sent_id': 45783, 'sent': 'large multiwheeled truck behind a truck full of bananas'}], 'file_name': 'COCO_train2014_000000179209_2054261.jpg', 'category_id': 8, 'ann_id': 2054261, 'sent_ids': [45782, 45783], 'ref_id': 22607}]\n", "\n", "[{'image_id': 175565, 'split': 'train', 'sentences': [{'tokens': ['the', 'couch', ',', 'a', 'lady', 'sitted', 'on', 'it'], 'raw': 'the couch, a lady sitted on it.', 'sent_id': 1112, 'sent': 'the couch , a lady sitted on it'}], 'file_name': 'COCO_train2014_000000175565_99716.jpg', 'category_id': 63, 'ann_id': 99716, 'sent_ids': [1112], 'ref_id': 5449}]\n", "[{'image_id': 168217, 'split': 'train', 'sentences': [{'tokens': ['brown', 'and', 'pink', 'teddy', 'bears'], 'raw': 'Brown and pink teddy bears.', 'sent_id': 4409, 'sent': 'brown and pink teddy bears'}, {'tokens': ['a', 'brown', 'teddy', 'bear', 'with', 'a', 'morose', 'expression', 'sits', 'in', 'front', 'of', 'a', 'pink', 'teddy', 'bear', 'with', 'an', 'identical', 'expression'], 'raw': 'A brown teddy bear with a morose expression sits in front of a pink teddy bear with an identical expression.', 'sent_id': 4410, 'sent': 'a brown teddy bear with a morose expression sits in front of a pink teddy bear with an identical expression'}], 'file_name': 'COCO_train2014_000000168217_1162820.jpg', 'category_id': 88, 'ann_id': 1162820, 'sent_ids': [4409, 4410], 'ref_id': 6702}]\n", "[{'image_id': 413164, 'split': 'train', 'sentences': [{'tokens': ['a', 'table', 'with', 'a', 'white', 'tablecloth', 'on', 'it'], 'raw': 'A table with a white tablecloth on it', 'sent_id': 19497, 'sent': 'a table with a white tablecloth on it'}], 'file_name': 'COCO_train2014_000000413164_1092064.jpg', 'category_id': 67, 'ann_id': 1092064, 'sent_ids': [19497], 'ref_id': 12539}]\n", "[{'image_id': 30387, 'split': 'train', 'sentences': [{'tokens': ['the', 'man', 'all', 'way', 'at', 'the', 'end', 'of', 'the', 'line', 'who', 'is', 'barely', 'visible'], 'raw': 'The man all way at the end of the line who is barely visible.', 'sent_id': 12240, 'sent': 'the man all way at the end of the line who is barely visible'}, {'tokens': ['a', 'man', 'who', 'is', 'mostly', 'covered', 'up', 'by', 'the', 'arms', 'of', 'another', 'man'], 'raw': 'A man who is mostly covered up by the arms of another man.', 'sent_id': 12241, 'sent': 'a man who is mostly covered up by the arms of another man'}], 'file_name': 'COCO_train2014_000000030387_1706152.jpg', 'category_id': 1, 'ann_id': 1706152, 'sent_ids': [12240, 12241], 'ref_id': 9755}]\n", "[{'image_id': 347263, 'split': 'train', 'sentences': [{'tokens': ['woman', 'wearing', 'black', 'showing', 'someone', 'a', 'technique'], 'raw': 'Woman wearing black showing someone a technique', 'sent_id': 68299, 'sent': 'woman wearing black showing someone a technique'}, {'tokens': ['woman', 'with', 'sunglasses', 'on', 'her', 'head', 'is', 'cutting', 'a', 'roast', 'in', 'the', 'kitchen'], 'raw': 'woman with sunglasses on her head is cutting a roast in the kitchen', 'sent_id': 68300, 'sent': 'woman with sunglasses on her head is cutting a roast in the kitchen'}], 'file_name': 'COCO_train2014_000000347263_556785.jpg', 'category_id': 1, 'ann_id': 556785, 'sent_ids': [68299, 68300], 'ref_id': 31255}][{'image_id': 492325, 'split': 'train', 'sentences': [{'tokens': ['a', 'beer', 'which', 'is', 'standing'], 'raw': 'A beer which is standing', 'sent_id': 13351, 'sent': 'a beer which is standing'}, {'tokens': ['a', 'bear', 'that', 'is', 'sitting', 'up'], 'raw': 'A bear that is sitting up.', 'sent_id': 13352, 'sent': 'a bear that is sitting up'}], 'file_name': 'COCO_train2014_000000492325_588117.jpg', 'category_id': 23, 'ann_id': 588117, 'sent_ids': [13351, 13352], 'ref_id': 10175}]\n", "\n", "[{'image_id': 409706, 'split': 'train', 'sentences': [{'tokens': ['paper', 'being', 'drawn', 'on'], 'raw': 'paper being drawn on', 'sent_id': 84661, 'sent': 'paper being drawn on'}, {'tokens': ['drawing', 'book'], 'raw': 'drawing book', 'sent_id': 84662, 'sent': 'drawing book'}], 'file_name': 'COCO_train2014_000000409706_1140323.jpg', 'category_id': 84, 'ann_id': 1140323, 'sent_ids': [84661, 84662], 'ref_id': 37505}]\n", "[{'image_id': 526922, 'split': 'train', 'sentences': [{'tokens': ['a', 'bus', 'that', 'says', 'sebastian', 'el', 'gde'], 'raw': 'A bus that says Sebastian El GDE.', 'sent_id': 77013, 'sent': 'a bus that says sebastian el gde'}, {'tokens': ['the', 'r1', 'bus'], 'raw': 'The R1 bus.', 'sent_id': 77014, 'sent': 'the r1 bus'}], 'file_name': 'COCO_train2014_000000526922_248285.jpg', 'category_id': 6, 'ann_id': 248285, 'sent_ids': [77013, 77014], 'ref_id': 34577}]\n", "[{'image_id': 26274, 'split': 'train', 'sentences': [{'tokens': ['a', 'silver', 'van', 'with', 'people', 'facing', 'it'], 'raw': 'A silver van with people facing it.', 'sent_id': 61777, 'sent': 'a silver van with people facing it'}], 'file_name': 'COCO_train2014_000000026274_136004.jpg', 'category_id': 3, 'ann_id': 136004, 'sent_ids': [61777], 'ref_id': 28715}][{'image_id': 65842, 'split': 'train', 'sentences': [{'tokens': ['orange', 'sitting', 'in', 'boiling', 'wate', 'with', 'a', 'crack', 'on', 'the', 'side', 'and', 'a', 'white', 'spot', 'on', 'top'], 'raw': 'Orange sitting in boiling wate with a crack on the side and a white spot on top.', 'sent_id': 90770, 'sent': 'orange sitting in boiling wate with a crack on the side and a white spot on top'}, {'tokens': ['a', 'single', 'lemon', 'in', 'a', 'boiling', 'pot', 'with', 'five', 'lemons', 'surrounding'], 'raw': 'A single lemon in a boiling pot with five lemons surrounding.', 'sent_id': 90771, 'sent': 'a single lemon in a boiling pot with five lemons surrounding'}], 'file_name': 'COCO_train2014_000000065842_1050797.jpg', 'category_id': 55, 'ann_id': 1050797, 'sent_ids': [90770, 90771], 'ref_id': 49234}]\n", "\n", "[{'image_id': 315831, 'split': 'train', 'sentences': [{'tokens': ['donut', 'with', 'pink', 'frosting', 'and', 'sprinkles'], 'raw': 'Donut with pink frosting and sprinkles.', 'sent_id': 16573, 'sent': 'donut with pink frosting and sprinkles'}, {'tokens': ['a', 'purple', 'donut'], 'raw': 'A purple donut.', 'sent_id': 16574, 'sent': 'a purple donut'}], 'file_name': 'COCO_train2014_000000315831_1573196.jpg', 'category_id': 60, 'ann_id': 1573196, 'sent_ids': [16573, 16574], 'ref_id': 11418}]\n", "[{'image_id': 34404, 'split': 'train', 'sentences': [{'tokens': ['person', 'on', 'left', 'cut', 'off'], 'raw': 'person on left cut off', 'sent_id': 29208, 'sent': 'person on left cut off'}, {'tokens': ['the', 'skiir', 'standing', 'up'], 'raw': 'the skiir standing up', 'sent_id': 29209, 'sent': 'the skiir standing up'}], 'file_name': 'COCO_train2014_000000034404_467515.jpg', 'category_id': 1, 'ann_id': 467515, 'sent_ids': [29208, 29209], 'ref_id': 16214}]\n", "[{'image_id': 427523, 'split': 'train', 'sentences': [{'tokens': ['horse', 'on', 'the', 'left', 'in', 'the', 'right', 'hand', 'picture'], 'raw': 'horse on the left in the right hand picture', 'sent_id': 19348, 'sent': 'horse on the left in the right hand picture'}, {'tokens': ['horse', 'on', 'the', 'left'], 'raw': 'horse on the left', 'sent_id': 19349, 'sent': 'horse on the left'}], 'file_name': 'COCO_train2014_000000427523_54859.jpg', 'category_id': 19, 'ann_id': 54859, 'sent_ids': [19348, 19349], 'ref_id': 12482}]\n", "[{'image_id': 231963, 'split': 'train', 'sentences': [{'tokens': ['a', 'black', ',', 'green', 'and', 'red', 'train', 'car', 'stopped', 'on', 'the', 'tracks', 'to', 'the', 'right', 'of', 'a', 'train', 'engine'], 'raw': 'A black, green and red train car stopped on the tracks to the right of a train engine.', 'sent_id': 91692, 'sent': 'a black , green and red train car stopped on the tracks to the right of a train engine'}, {'tokens': ['the', 'back', 'of', 'the', 'train', 'leaving'], 'raw': 'The back of the train leaving', 'sent_id': 91693, 'sent': 'the back of the train leaving'}], 'file_name': 'COCO_train2014_000000231963_169754.jpg', 'category_id': 7, 'ann_id': 169754, 'sent_ids': [91692, 91693], 'ref_id': 40250}][{'image_id': 399922, 'split': 'train', 'sentences': [{'tokens': ['purse', 'on', 'front', 'mans', 'back'], 'raw': 'purse on front mans back', 'sent_id': 96153, 'sent': 'purse on front mans back'}, {'tokens': ['a', 'beige', 'satchel'], 'raw': 'a beige satchel.', 'sent_id': 96154, 'sent': 'a beige satchel'}], 'file_name': 'COCO_train2014_000000399922_1176815.jpg', 'category_id': 31, 'ann_id': 1176815, 'sent_ids': [96153, 96154], 'ref_id': 41986}]\n", "\n", "[{'image_id': 269045, 'split': 'train', 'sentences': [{'tokens': ['a', 'see', 'through', 'table'], 'raw': 'a see through table', 'sent_id': 36678, 'sent': 'a see through table'}, {'tokens': ['a', 'grilled', 'table', 'near', 'the', 'person', 'with', 'a', 'laptop'], 'raw': 'A grilled table near the person with a laptop', 'sent_id': 36679, 'sent': 'a grilled table near the person with a laptop'}], 'file_name': 'COCO_train2014_000000269045_1612853.jpg', 'category_id': 67, 'ann_id': 1612853, 'sent_ids': [36678, 36679], 'ref_id': 19074}]\n", "[{'image_id': 419297, 'split': 'train', 'sentences': [{'tokens': ['a', 'white', 'truck', 'filled', 'with', 'luggage'], 'raw': 'A white truck filled with luggage', 'sent_id': 50699, 'sent': 'a white truck filled with luggage'}, {'tokens': ['a', 'white', 'color', 'truck'], 'raw': 'A white color truck.', 'sent_id': 50700, 'sent': 'a white color truck'}], 'file_name': 'COCO_train2014_000000419297_1370937.jpg', 'category_id': 8, 'ann_id': 1370937, 'sent_ids': [50699, 50700], 'ref_id': 24467}]\n", "[{'image_id': 153609, 'split': 'train', 'sentences': [{'tokens': ['a', 'gentleman', 'wearing', 'a', 'suit', ',', 'looking', 'down', ',', 'and', 'walking', 'with', 'his', 'hands', 'on', 'his', 'hips'], 'raw': 'A gentleman wearing a suit, looking down, and walking with his hands on his hips.', 'sent_id': 102251, 'sent': 'a gentleman wearing a suit , looking down , and walking with his hands on his hips'}, {'tokens': ['a', 'man', 'in', 'a', 'black', 'suit', 'walking'], 'raw': 'A man in a black suit walking.', 'sent_id': 102252, 'sent': 'a man in a black suit walking'}], 'file_name': 'COCO_train2014_000000153609_462146.jpg', 'category_id': 1, 'ann_id': 462146, 'sent_ids': [102251, 102252], 'ref_id': 44392}][{'image_id': 61498, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'in', 'a', 'blue', 'shirt'], 'raw': 'A man in a blue shirt.', 'sent_id': 72219, 'sent': 'a man in a blue shirt'}, {'tokens': ['blue', 'shirt', 'boy'], 'raw': 'blue shirt boy', 'sent_id': 72220, 'sent': 'blue shirt boy'}], 'file_name': 'COCO_train2014_000000061498_500279.jpg', 'category_id': 1, 'ann_id': 500279, 'sent_ids': [72219, 72220], 'ref_id': 48407}]\n", "\n", "[{'image_id': 334714, 'split': 'train', 'sentences': [{'tokens': ['a', 'long', 'narrow', 'multi', '-', 'colored', 'kite', 'to', 'the', 'left', 'of', 'two', 'other', 'kites', 'of', 'the', 'same', 'colors'], 'raw': 'A long narrow multi-colored kite to the left of two other kites of the same colors.', 'sent_id': 29361, 'sent': 'a long narrow multi - colored kite to the left of two other kites of the same colors'}, {'tokens': ['a', 'side', 'of', 'the', 'kite', 'that', 'is', 'closest', 'to', 'the', 'man', 'in', 'black'], 'raw': 'A side of the kite that is closest to the man in black.', 'sent_id': 29362, 'sent': 'a side of the kite that is closest to the man in black'}], 'file_name': 'COCO_train2014_000000334714_623960.jpg', 'category_id': 38, 'ann_id': 623960, 'sent_ids': [29361, 29362], 'ref_id': 16269}]\n", "[{'image_id': 233111, 'split': 'train', 'sentences': [{'tokens': ['a', 'tennis', 'player', 'with', 'his', 'right', 'foot', 'balanced', 'on', 'its', 'toes'], 'raw': 'A tennis player with his right foot balanced on its toes.', 'sent_id': 94813, 'sent': 'a tennis player with his right foot balanced on its toes'}, {'tokens': ['a', 'man', 'in', 'a', 'blue', 'shirt', 'holding', 'a', 'tennis', 'racket'], 'raw': 'A man in a blue shirt holding a tennis racket.', 'sent_id': 94814, 'sent': 'a man in a blue shirt holding a tennis racket'}], 'file_name': 'COCO_train2014_000000233111_510824.jpg', 'category_id': 1, 'ann_id': 510824, 'sent_ids': [94813, 94814], 'ref_id': 49423}]\n", "[{'image_id': 333324, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'in', 'a', 'red', 'outfit', 'is', 'jumping', 'to', 'catch', 'a', 'frisbee'], 'raw': 'A man in a red outfit is jumping to catch a frisbee.', 'sent_id': 15405, 'sent': 'a man in a red outfit is jumping to catch a frisbee'}, {'tokens': ['a', 'man', 'jumping', 'for', 'a', 'frisbee'], 'raw': 'a man jumping for a frisbee', 'sent_id': 15406, 'sent': 'a man jumping for a frisbee'}], 'file_name': 'COCO_train2014_000000333324_424995.jpg', 'category_id': 1, 'ann_id': 424995, 'sent_ids': [15405, 15406], 'ref_id': 10961}]\n", "[{'image_id': 500390, 'split': 'train', 'sentences': [{'tokens': ['a', 'girl', 'playing', 'football', 'in', 'the', 'ground'], 'raw': 'A girl playing football in the ground.', 'sent_id': 26349, 'sent': 'a girl playing football in the ground'}, {'tokens': ['girl', 'running', 'with', 'hands', 'up'], 'raw': 'Girl running with hands up', 'sent_id': 26350, 'sent': 'girl running with hands up'}], 'file_name': 'COCO_train2014_000000500390_556949.jpg', 'category_id': 1, 'ann_id': 556949, 'sent_ids': [26349, 26350], 'ref_id': 15132}]\n", "[{'image_id': 282359, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'wearing', 'a', 'black', 'shirt', 'holding', 'a', 'tennis', 'racket'], 'raw': 'A man wearing a black shirt holding a tennis racket.', 'sent_id': 81458, 'sent': 'a man wearing a black shirt holding a tennis racket'}, {'tokens': ['a', 'young', 'man', 'in', 'a', 'black', 'shirt', 'and', 'gray', 'shorts'], 'raw': 'A young man in a black shirt and gray shorts.', 'sent_id': 81459, 'sent': 'a young man in a black shirt and gray shorts'}], 'file_name': 'COCO_train2014_000000282359_440734.jpg', 'category_id': 1, 'ann_id': 440734, 'sent_ids': [81458, 81459], 'ref_id': 36299}]\n", "[{'image_id': 377339, 'split': 'train', 'sentences': [{'tokens': ['black', 'suitcase', 'girl', 'is', 'sitting', 'in'], 'raw': 'black suitcase girl is sitting in', 'sent_id': 2283, 'sent': 'black suitcase girl is sitting in'}, {'tokens': ['a', 'suitcase', 'with', 'a', 'girl', 'sitting', 'in', 'it'], 'raw': 'A suitcase with a girl sitting in it', 'sent_id': 2284, 'sent': 'a suitcase with a girl sitting in it'}], 'file_name': 'COCO_train2014_000000377339_1186330.jpg', 'category_id': 33, 'ann_id': 1186330, 'sent_ids': [2283, 2284], 'ref_id': 5899}][{'image_id': 429215, 'split': 'train', 'sentences': [{'tokens': ['a', 'crown', 'of', 'broccoli', 'on', 'a', 'blue', 'plate', 'with', 'carrots', 'and', 'potatoes'], 'raw': 'A crown of broccoli on a blue plate with carrots and potatoes.', 'sent_id': 11008, 'sent': 'a crown of broccoli on a blue plate with carrots and potatoes'}, {'tokens': ['broccoli', 'on', 'a', 'plate'], 'raw': 'Broccoli on a plate.', 'sent_id': 11009, 'sent': 'broccoli on a plate'}], 'file_name': 'COCO_train2014_000000429215_1059094.jpg', 'category_id': 56, 'ann_id': 1059094, 'sent_ids': [11008, 11009], 'ref_id': 45758}]\n", "\n", "[{'image_id': 568492, 'split': 'train', 'sentences': [{'tokens': ['back', 'of', 'chair', 'with', 'women', 'in', 'hooded', 'jacket'], 'raw': 'BACK OF CHAIR WITH WOMEN IN HOODED JACKET', 'sent_id': 62788, 'sent': 'back of chair with women in hooded jacket'}, {'tokens': ['wooden', 'chair', 'with', 'person', 'in', 'red', 'coat', 'getting', 'off', 'of', 'it'], 'raw': 'wooden chair with person in red coat getting off of it', 'sent_id': 62789, 'sent': 'wooden chair with person in red coat getting off of it'}], 'file_name': 'COCO_train2014_000000568492_375813.jpg', 'category_id': 62, 'ann_id': 375813, 'sent_ids': [62788, 62789], 'ref_id': 48006}]\n", "[{'image_id': 183100, 'split': 'train', 'sentences': [{'tokens': ['a', 'red', 'handle', 'for', 'a', 'pair', 'of', 'shears', 'on', 'a', 'table'], 'raw': 'A red handle for a pair of shears on a table.', 'sent_id': 64588, 'sent': 'a red handle for a pair of shears on a table'}, {'tokens': ['scissors', 'with', 'red', 'handle'], 'raw': 'scissors with red handle', 'sent_id': 64589, 'sent': 'scissors with red handle'}], 'file_name': 'COCO_train2014_000000183100_2146842.jpg', 'category_id': 87, 'ann_id': 2146842, 'sent_ids': [64588, 64589], 'ref_id': 29813}]\n", "[{'image_id': 159109, 'split': 'train', 'sentences': [{'tokens': ['pizza', 'with', 'only', 'sausage', 'and', 'cheese', 'on', 'it'], 'raw': 'Pizza with only sausage and cheese on it.', 'sent_id': 20557, 'sent': 'pizza with only sausage and cheese on it'}, {'tokens': ['a', 'frozen', 'pepperoni', 'pizza', 'on', 'a', 'baking', 'sheet'], 'raw': 'A frozen pepperoni pizza on a baking sheet.', 'sent_id': 20558, 'sent': 'a frozen pepperoni pizza on a baking sheet'}], 'file_name': 'COCO_train2014_000000159109_1072198.jpg', 'category_id': 59, 'ann_id': 1072198, 'sent_ids': [20557, 20558], 'ref_id': 12943}][{'image_id': 99599, 'split': 'train', 'sentences': [{'tokens': ['a', 'woman', 'wearing', 'a', 'bracelet', 'helps', 'put', 'candles', 'on', 'a', 'cake'], 'raw': 'A woman wearing a bracelet helps put candles on a cake.', 'sent_id': 6577, 'sent': 'a woman wearing a bracelet helps put candles on a cake'}, {'tokens': ['person', 'wearing', 'a', 'silver', 'bracelet', 'and', 'a', 'silver', 'ring'], 'raw': 'Person wearing a silver bracelet and a silver ring.', 'sent_id': 6578, 'sent': 'person wearing a silver bracelet and a silver ring'}], 'file_name': 'COCO_train2014_000000099599_191383.jpg', 'category_id': 1, 'ann_id': 191383, 'sent_ids': [6577, 6578], 'ref_id': 7554}]\n", "\n", "[{'image_id': 198406, 'split': 'train', 'sentences': [{'tokens': ['a', 'bird', 'stands', 'on', 'the', 'road'], 'raw': 'A bird stands on the road.', 'sent_id': 59907, 'sent': 'a bird stands on the road'}, {'tokens': ['a', 'bird', 'is', 'standing', 'on', 'the', 'road', 'and', 'watching', 'the', 'other', 'bird', 'sitting', 'near'], 'raw': 'A bird is standing on the road and watching the other bird sitting near', 'sent_id': 59908, 'sent': 'a bird is standing on the road and watching the other bird sitting near'}], 'file_name': 'COCO_train2014_000000198406_43102.jpg', 'category_id': 16, 'ann_id': 43102, 'sent_ids': [59907, 59908], 'ref_id': 47881}]\n", "[{'image_id': 321173, 'split': 'train', 'sentences': [{'tokens': ['a', 'part', 'of', 'the', 'table', 'to', 'the', 'right', 'of', 'the', 'plate'], 'raw': 'A part of the table to the right of the plate.', 'sent_id': 83151, 'sent': 'a part of the table to the right of the plate'}], 'file_name': 'COCO_train2014_000000321173_390269.jpg', 'category_id': 67, 'ann_id': 390269, 'sent_ids': [83151], 'ref_id': 36932}]\n", "[{'image_id': 40901, 'split': 'train', 'sentences': [{'tokens': ['man', 'washing', 'his', 'hands', 'in', 'a', 'public', 'restroom'], 'raw': 'Man washing his hands in a public restroom.', 'sent_id': 57105, 'sent': 'man washing his hands in a public restroom'}, {'tokens': ['man', 'watching', 'his', 'hands', ',', 'not', 'his', 'reflection'], 'raw': 'Man watching his hands, not his reflection.', 'sent_id': 57106, 'sent': 'man watching his hands , not his reflection'}], 'file_name': 'COCO_train2014_000000040901_512279.jpg', 'category_id': 1, 'ann_id': 512279, 'sent_ids': [57105, 57106], 'ref_id': 26951}]\n", "[{'image_id': 286482, 'split': 'train', 'sentences': [{'tokens': ['green', 'suitcase'], 'raw': 'green suitcase', 'sent_id': 37373, 'sent': 'green suitcase'}, {'tokens': ['a', 'green', 'suitcase', 'with', 'two', 'other', 'luggage', 'pieces', 'stacked', 'atop', 'it'], 'raw': 'A green suitcase with two other luggage pieces stacked atop it.', 'sent_id': 37374, 'sent': 'a green suitcase with two other luggage pieces stacked atop it'}], 'file_name': 'COCO_train2014_000000286482_1184941.jpg', 'category_id': 33, 'ann_id': 1184941, 'sent_ids': [37373, 37374], 'ref_id': 19349}][{'image_id': 40094, 'split': 'train', 'sentences': [{'tokens': ['the', 'cat', 'that', 'is', 'the', 'lowest', 'on', 'the', 'couch'], 'raw': 'The cat that is the lowest on the couch.', 'sent_id': 54410, 'sent': 'the cat that is the lowest on the couch'}, {'tokens': ['the', 'cat', 'on', 'the', 'left'], 'raw': 'the cat on the left', 'sent_id': 54411, 'sent': 'the cat on the left'}], 'file_name': 'COCO_train2014_000000040094_52880.jpg', 'category_id': 17, 'ann_id': 52880, 'sent_ids': [54410, 54411], 'ref_id': 25939}]\n", "\n", "[{'image_id': 61209, 'split': 'train', 'sentences': [{'tokens': ['an', 'elephant', 'with', '2', 'white', 'tusks'], 'raw': 'An elephant with 2 white tusks.', 'sent_id': 14679, 'sent': 'an elephant with 2 white tusks'}, {'tokens': ['the', 'elephant', 'on', 'the', 'right', 'side', 'of', 'the', 'picture', 'that', 'seems', 'to', 'be', 'looking', 'into', 'the', 'camera'], 'raw': 'The elephant on the right side of the picture that seems to be looking into the camera.', 'sent_id': 14680, 'sent': 'the elephant on the right side of the picture that seems to be looking into the camera'}], 'file_name': 'COCO_train2014_000000061209_584430.jpg', 'category_id': 22, 'ann_id': 584430, 'sent_ids': [14679, 14680], 'ref_id': 45915}][{'image_id': 276806, 'split': 'train', 'sentences': [{'tokens': ['a', 'woman', 'that', 'is', 'waiting', 'for', 'a', 'slice', 'of', 'cake'], 'raw': 'A woman that is waiting for a slice of cake.', 'sent_id': 87859, 'sent': 'a woman that is waiting for a slice of cake'}, {'tokens': ['a', 'woman', 'wearing', 'a', 'brown', 'jacket', 'sitting', 'beside', 'a', 'younger', 'woman'], 'raw': 'A woman wearing a brown jacket sitting beside a younger woman.', 'sent_id': 87860, 'sent': 'a woman wearing a brown jacket sitting beside a younger woman'}], 'file_name': 'COCO_train2014_000000276806_487475.jpg', 'category_id': 1, 'ann_id': 487475, 'sent_ids': [87859, 87860], 'ref_id': 38745}]\n", "\n", "[{'image_id': 412101, 'split': 'train', 'sentences': [{'tokens': ['the', 'chair', 'at', 'the', 'bottom', 'right'], 'raw': 'the chair at the bottom right', 'sent_id': 42865, 'sent': 'the chair at the bottom right'}, {'tokens': ['the', 'black', 'plastic', 'chair', 'that', 'is', 'farthest', 'away', 'from', 'the', 'kitchen'], 'raw': 'The black plastic chair that is farthest away from the kitchen.', 'sent_id': 42866, 'sent': 'the black plastic chair that is farthest away from the kitchen'}], 'file_name': 'COCO_train2014_000000412101_103736.jpg', 'category_id': 62, 'ann_id': 103736, 'sent_ids': [42865, 42866], 'ref_id': 21474}]\n", "[{'image_id': 397132, 'split': 'train', 'sentences': [{'tokens': ['a', 'blue', 'minivan', 'beside', 'a', 'garbage', 'truck', 'on', 'a', 'street'], 'raw': 'a blue minivan beside a garbage truck on a street', 'sent_id': 82000, 'sent': 'a blue minivan beside a garbage truck on a street'}, {'tokens': ['blue', 'minivan', 'moving', 'on', 'the', 'street', 'with', 'other', 'vehicles'], 'raw': 'Blue minivan moving on the street with other vehicles.', 'sent_id': 82001, 'sent': 'blue minivan moving on the street with other vehicles'}], 'file_name': 'COCO_train2014_000000397132_353337.jpg', 'category_id': 3, 'ann_id': 353337, 'sent_ids': [82000, 82001], 'ref_id': 36513}]\n", "[{'image_id': 168879, 'split': 'train', 'sentences': [{'tokens': ['a', 'giraffe', 'on', 'right', 'side'], 'raw': 'a giraffe on right side', 'sent_id': 63328, 'sent': 'a giraffe on right side'}], 'file_name': 'COCO_train2014_000000168879_595571.jpg', 'category_id': 25, 'ann_id': 595571, 'sent_ids': [63328], 'ref_id': 29326}]\n", "[{'image_id': 287239, 'split': 'train', 'sentences': [{'tokens': ['a', 'toddler', 'with', 'blonde', 'hair', 'and', 'blue', 'eyes', 'who', 'has', 'a', 'brush', 'upon', 'their', 'head'], 'raw': 'a toddler with blonde hair and blue eyes who has a brush upon their head.', 'sent_id': 25368, 'sent': 'a toddler with blonde hair and blue eyes who has a brush upon their head'}, {'tokens': ['a', 'blonde', '-', 'haired', 'small', 'child'], 'raw': 'A blonde-haired small child.', 'sent_id': 25369, 'sent': 'a blonde - haired small child'}], 'file_name': 'COCO_train2014_000000287239_469385.jpg', 'category_id': 1, 'ann_id': 469385, 'sent_ids': [25368, 25369], 'ref_id': 14777}][{'image_id': 180559, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'in', 'blue', 'shirt', 'and', 'blue', 'hat', 'who', 'just', 'threw', 'a', 'frisbee'], 'raw': 'A man in blue shirt and blue hat who just threw a frisbee', 'sent_id': 93684, 'sent': 'a man in blue shirt and blue hat who just threw a frisbee'}, {'tokens': ['a', 'man', 'with', 'a', 'cap', 'is', 'trying', 'to', 'catch', 'a', 'frisbee'], 'raw': 'a man with a cap is trying to catch a Frisbee', 'sent_id': 93685, 'sent': 'a man with a cap is trying to catch a frisbee'}], 'file_name': 'COCO_train2014_000000180559_521030.jpg', 'category_id': 1, 'ann_id': 521030, 'sent_ids': [93684, 93685], 'ref_id': 41030}][{'image_id': 204792, 'split': 'train', 'sentences': [{'tokens': ['a', 'sheep', 'being', 'help', 'by', 'the', 'lady', 'in', 'the', 'black', 'shit', ';', 'left', 'side', 'of', 'the', 'picture'], 'raw': 'a sheep being help by the lady in the black shit; left side of the picture', 'sent_id': 1907, 'sent': 'a sheep being help by the lady in the black shit ; left side of the picture'}, {'tokens': ['the', 'animal', 'being', 'petted', 'by', 'the', 'blonde', 'lady'], 'raw': 'The animal being petted by the blonde lady.', 'sent_id': 1908, 'sent': 'the animal being petted by the blonde lady'}], 'file_name': 'COCO_train2014_000000204792_61249.jpg', 'category_id': 20, 'ann_id': 61249, 'sent_ids': [1907, 1908], 'ref_id': 5743}]\n", "\n", "\n", "[{'image_id': 516596, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'wearing', 'glasses', 'eating', 'a', 'huge', 'hero'], 'raw': 'a man wearing glasses eating a huge hero', 'sent_id': 38413, 'sent': 'a man wearing glasses eating a huge hero'}, {'tokens': ['a', 'man', 'wearing', 'glasses', 'eating', 'a', 'sandwich'], 'raw': 'a man wearing glasses eating a sandwich', 'sent_id': 38414, 'sent': 'a man wearing glasses eating a sandwich'}], 'file_name': 'COCO_train2014_000000516596_424086.jpg', 'category_id': 1, 'ann_id': 424086, 'sent_ids': [38413, 38414], 'ref_id': 19743}]\n", "[{'image_id': 28085, 'split': 'train', 'sentences': [{'tokens': ['pink', 'color', 'donuts', 'in', 'front', 'of', 'a', 'brown', 'colored', 'one'], 'raw': 'pink color donuts in front of a brown colored one', 'sent_id': 56205, 'sent': 'pink color donuts in front of a brown colored one'}, {'tokens': ['a', 'pink', 'color', 'ring', 'chocolate', 'iced', 'doughnut', 'in', 'front', 'of', 'the', 'brown', 'color'], 'raw': 'A PINK COLOR RING CHOCOLATE ICED DOUGHNUT IN FRONT OF THE BROWN COLOR', 'sent_id': 56206, 'sent': 'a pink color ring chocolate iced doughnut in front of the brown color'}], 'file_name': 'COCO_train2014_000000028085_1078993.jpg', 'category_id': 60, 'ann_id': 1078993, 'sent_ids': [56205, 56206], 'ref_id': 26602}]\n", "[{'image_id': 219633, 'split': 'train', 'sentences': [{'tokens': ['a', 'light', 'brown', 'dog', 'standing', 'next', 'to', 'a', 'dark', 'brown', 'dog'], 'raw': 'A light brown dog standing next to a dark brown dog.', 'sent_id': 79238, 'sent': 'a light brown dog standing next to a dark brown dog'}, {'tokens': ['the', 'lightest', 'color', 'dog', 'that', 'is', 'on', 'the', 'right'], 'raw': 'The lightest color dog that is on the right.', 'sent_id': 79239, 'sent': 'the lightest color dog that is on the right'}], 'file_name': 'COCO_train2014_000000219633_15879.jpg', 'category_id': 18, 'ann_id': 15879, 'sent_ids': [79238, 79239], 'ref_id': 35444}]\n", "[{'image_id': 110002, 'split': 'train', 'sentences': [{'tokens': ['large', 'zebra', 'left', 'of', 'screen'], 'raw': 'large zebra left of screen', 'sent_id': 100858, 'sent': 'large zebra left of screen'}, {'tokens': ['zebra', 'in', 'front', 'of', 'all', 'the', 'others'], 'raw': 'Zebra in front of all the others.', 'sent_id': 100859, 'sent': 'zebra in front of all the others'}], 'file_name': 'COCO_train2014_000000110002_589620.jpg', 'category_id': 24, 'ann_id': 589620, 'sent_ids': [100858, 100859], 'ref_id': 43849}]\n", "[{'image_id': 217460, 'split': 'train', 'sentences': [{'tokens': ['large', 'pizza', 'with', 'all', 'toppings'], 'raw': 'large pizza with all toppings', 'sent_id': 14009, 'sent': 'large pizza with all toppings'}, {'tokens': ['a', 'pizza', 'with', 'olives', ',', 'mushrooms', ',', 'artichokes', ',', 'and', 'ham'], 'raw': 'A pizza with olives, mushrooms, artichokes, and ham.', 'sent_id': 14010, 'sent': 'a pizza with olives , mushrooms , artichokes , and ham'}], 'file_name': 'COCO_train2014_000000217460_1077159.jpg', 'category_id': 59, 'ann_id': 1077159, 'sent_ids': [14009, 14010], 'ref_id': 10427}][{'image_id': 324336, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'standing', 'outside', 'with', 'an', 'umbrella'], 'raw': 'A man standing outside with an umbrella.', 'sent_id': 61801, 'sent': 'a man standing outside with an umbrella'}, {'tokens': ['a', 'lady', 'wearing', 'specs', 'and', 'holding', 'black', 'umbrella'], 'raw': 'A lady wearing specs and holding black umbrella', 'sent_id': 61802, 'sent': 'a lady wearing specs and holding black umbrella'}], 'file_name': 'COCO_train2014_000000324336_1734772.jpg', 'category_id': 1, 'ann_id': 1734772, 'sent_ids': [61801, 61802], 'ref_id': 28726}]\n", "[{'image_id': 10495, 'split': 'train', 'sentences': [{'tokens': ['a', 'baby', 'elephant', 'under', 'a', 'big', 'elephant'], 'raw': 'A baby elephant under a big elephant.', 'sent_id': 2010, 'sent': 'a baby elephant under a big elephant'}, {'tokens': ['a', 'baby', 'elephant', 'walking', 'with', 'its', 'mother'], 'raw': 'A baby elephant walking with its mother.', 'sent_id': 2011, 'sent': 'a baby elephant walking with its mother'}], 'file_name': 'COCO_train2014_000000010495_1821437.jpg', 'category_id': 22, 'ann_id': 1821437, 'sent_ids': [2010, 2011], 'ref_id': 5786}]\n", "\n", "[{'image_id': 279753, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'in', 'a', 'red', 'shirt', 'and', 'black', 'pants'], 'raw': 'A man in a red shirt and black pants', 'sent_id': 97009, 'sent': 'a man in a red shirt and black pants'}], 'file_name': 'COCO_train2014_000000279753_535460.jpg', 'category_id': 1, 'ann_id': 535460, 'sent_ids': [97009], 'ref_id': 42327}]\n", "[{'image_id': 427395, 'split': 'train', 'sentences': [{'tokens': ['a', 'purple', 'toothbrush'], 'raw': 'A purple toothbrush.', 'sent_id': 10965, 'sent': 'a purple toothbrush'}, {'tokens': ['purple', 'toothbrush', 'with', 'green', 'bristles'], 'raw': 'Purple toothbrush with green bristles.', 'sent_id': 10966, 'sent': 'purple toothbrush with green bristles'}], 'file_name': 'COCO_train2014_000000427395_342229.jpg', 'category_id': 90, 'ann_id': 342229, 'sent_ids': [10965, 10966], 'ref_id': 9269}][{'image_id': 63217, 'split': 'train', 'sentences': [{'tokens': ['a', 'boat', 'in', 'the', 'water', 'with', 'the', 'words', 'u', '.', 's', '.', 'coast', 'guard', 'on', 'the', 'side'], 'raw': 'a boat in the water with the words U.S. Coast Guard on the side', 'sent_id': 78927, 'sent': 'a boat in the water with the words u . s . coast guard on the side'}, {'tokens': ['a', 'red', 'and', 'white', 'us', 'coast', 'guard', 'ship'], 'raw': 'A red and white US Coast Guard Ship.', 'sent_id': 78928, 'sent': 'a red and white us coast guard ship'}], 'file_name': 'COCO_train2014_000000063217_179804.jpg', 'category_id': 9, 'ann_id': 179804, 'sent_ids': [78927, 78928], 'ref_id': 35324}]\n", "\n", "[{'image_id': 17520, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'wearing', 'a', 'checkered', 'shirt', 'walking', 'and', 'talking', 'on', 'the', 'phone'], 'raw': 'A man wearing a checkered shirt walking and talking on the phone.', 'sent_id': 56923, 'sent': 'a man wearing a checkered shirt walking and talking on the phone'}, {'tokens': ['man', 'wearing', 'a', 'plaid', 'shirt'], 'raw': 'man wearing a plaid shirt.', 'sent_id': 56924, 'sent': 'man wearing a plaid shirt'}], 'file_name': 'COCO_train2014_000000017520_196780.jpg', 'category_id': 1, 'ann_id': 196780, 'sent_ids': [56923, 56924], 'ref_id': 26881}]\n", "[{'image_id': 137203, 'split': 'train', 'sentences': [{'tokens': ['a', 'woman', 'in', 'red', 'is', 'trying', 'to', 'catch', 'a', 'pink', 'frisbee'], 'raw': 'A woman in red is trying to catch a pink frisbee', 'sent_id': 100490, 'sent': 'a woman in red is trying to catch a pink frisbee'}, {'tokens': ['a', 'woman', 'in', 'a', 'red', 'shirt', 'and', 'jean', 'shorts', 'about', 'to', 'catch', 'a', 'frisbee'], 'raw': 'A woman in a red shirt and jean shorts about to catch a Frisbee.', 'sent_id': 100491, 'sent': 'a woman in a red shirt and jean shorts about to catch a frisbee'}], 'file_name': 'COCO_train2014_000000137203_459492.jpg', 'category_id': 1, 'ann_id': 459492, 'sent_ids': [100490, 100491], 'ref_id': 49657}]\n", "[{'image_id': 34674, 'split': 'train', 'sentences': [{'tokens': ['the', 'horse', 'of', 'the', 'man', 'without', 'a', 'hat'], 'raw': 'The horse of the man without a hat', 'sent_id': 47889, 'sent': 'the horse of the man without a hat'}, {'tokens': ['horse', 'being', 'ridden', 'by', 'the', 'man', 'without', 'a', 'hat'], 'raw': 'Horse being ridden by the man without a hat.', 'sent_id': 47890, 'sent': 'horse being ridden by the man without a hat'}], 'file_name': 'COCO_train2014_000000034674_56042.jpg', 'category_id': 19, 'ann_id': 56042, 'sent_ids': [47889, 47890], 'ref_id': 23399}]\n", "[{'image_id': 293975, 'split': 'train', 'sentences': [{'tokens': ['a', 'white', 'laptop', 'comuter'], 'raw': 'A white laptop comuter.', 'sent_id': 48293, 'sent': 'a white laptop comuter'}, {'tokens': ['white', 'laptop'], 'raw': 'white laptop', 'sent_id': 48294, 'sent': 'white laptop'}], 'file_name': 'COCO_train2014_000000293975_1099887.jpg', 'category_id': 73, 'ann_id': 1099887, 'sent_ids': [48293, 48294], 'ref_id': 23543}]\n", "[{'image_id': 323705, 'split': 'train', 'sentences': [{'tokens': ['a', 'clock', 'face', 'where', 'all', 'the', 'numbers', 'are', 'displayed'], 'raw': 'A clock face where all the numbers are displayed.', 'sent_id': 8682, 'sent': 'a clock face where all the numbers are displayed'}, {'tokens': ['clock', 'facing', 'the', 'front'], 'raw': 'clock facing the front.', 'sent_id': 8683, 'sent': 'clock facing the front'}], 'file_name': 'COCO_train2014_000000323705_335093.jpg', 'category_id': 85, 'ann_id': 335093, 'sent_ids': [8682, 8683], 'ref_id': 8358}]\n", "[{'image_id': 416819, 'split': 'train', 'sentences': [{'tokens': ['a', 'zebra', 'with', 'his', 'back', 'to', 'the', 'camera'], 'raw': 'A zebra with his back to the camera', 'sent_id': 56637, 'sent': 'a zebra with his back to the camera'}, {'tokens': ['zebra', 'turn', 'the', 'head', 'left', 'hand', 'side'], 'raw': 'Zebra turn the head left hand side', 'sent_id': 56638, 'sent': 'zebra turn the head left hand side'}], 'file_name': 'COCO_train2014_000000416819_591965.jpg', 'category_id': 24, 'ann_id': 591965, 'sent_ids': [56637, 56638], 'ref_id': 26766}]\n", "[{'image_id': 522298, 'split': 'train', 'sentences': [{'tokens': ['a', 'pink', 'umbrella'], 'raw': 'A pink umbrella.', 'sent_id': 54683, 'sent': 'a pink umbrella'}, {'tokens': ['the', 'red', 'umbrella'], 'raw': 'the red umbrella', 'sent_id': 54684, 'sent': 'the red umbrella'}], 'file_name': 'COCO_train2014_000000522298_283547.jpg', 'category_id': 28, 'ann_id': 283547, 'sent_ids': [54683, 54684], 'ref_id': 26042}]\n", "[{'image_id': 427756, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'standing', 'with', 'blue', 'striped', 'shirt'], 'raw': 'A man standing with blue striped shirt.', 'sent_id': 62696, 'sent': 'a man standing with blue striped shirt'}, {'tokens': ['a', 'man', 'in', 'black', 'jeans', 'and', 'blue', 'and', 'black', 'striped', 'shirt', 'holding', 'wii', 'in', 'hand', 'standing', 'in', 'front', 'of', 'tv'], 'raw': 'A man in black jeans and blue and black striped shirt holding wii in hand standing in front of TV.', 'sent_id': 62697, 'sent': 'a man in black jeans and blue and black striped shirt holding wii in hand standing in front of tv'}], 'file_name': 'COCO_train2014_000000427756_490450.jpg', 'category_id': 1, 'ann_id': 490450, 'sent_ids': [62696, 62697], 'ref_id': 29075}][{'image_id': 405136, 'split': 'train', 'sentences': [{'tokens': ['a', 'woman', 'in', 'a', 'sleeveless', 'shirt', 'is', 'sitting', 'in', 'the', 'passenger', 'seat', 'watching', 'a', 'horse'], 'raw': 'A woman in a sleeveless shirt is sitting in the passenger seat watching a horse', 'sent_id': 12514, 'sent': 'a woman in a sleeveless shirt is sitting in the passenger seat watching a horse'}, {'tokens': ['a', 'person', 'sitting', 'next', 'to', 'the', 'driver'], 'raw': 'A person sitting next to the driver', 'sent_id': 12515, 'sent': 'a person sitting next to the driver'}], 'file_name': 'COCO_train2014_000000405136_188388.jpg', 'category_id': 1, 'ann_id': 188388, 'sent_ids': [12514, 12515], 'ref_id': 9851}]\n", "\n", "[{'image_id': 16465, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'in', 'a', 'white', 'soccer', 'uniform'], 'raw': 'A man in a white soccer uniform.', 'sent_id': 31029, 'sent': 'a man in a white soccer uniform'}, {'tokens': ['the', 'player', 'wearing', 'the', 'white', 'clothes'], 'raw': 'The player wearing the white clothes.', 'sent_id': 31030, 'sent': 'the player wearing the white clothes'}], 'file_name': 'COCO_train2014_000000016465_477891.jpg', 'category_id': 1, 'ann_id': 477891, 'sent_ids': [31029, 31030], 'ref_id': 16909}]\n", "[{'image_id': 326685, 'split': 'train', 'sentences': [{'tokens': ['a', 'blurry', 'shot', 'of', 'people', 'riding', 'a', 'scooter', 'in', 'the', 'rain'], 'raw': 'a blurry shot of people riding a scooter in the rain', 'sent_id': 19396, 'sent': 'a blurry shot of people riding a scooter in the rain'}, {'tokens': ['top', 'right', 'blurry', 'motorcyclist', 'going', 'out', 'of', 'frame'], 'raw': 'top right blurry motorcyclist going out of frame.', 'sent_id': 19397, 'sent': 'top right blurry motorcyclist going out of frame'}], 'file_name': 'COCO_train2014_000000326685_1713145.jpg', 'category_id': 1, 'ann_id': 1713145, 'sent_ids': [19396, 19397], 'ref_id': 12499}]\n", "[{'image_id': 326685, 'split': 'train', 'sentences': [{'tokens': ['green', '&', 'white', 'scooter', 'that', 'women', 'are', 'riding', 'in', 'rain'], 'raw': 'Green & white scooter that women are riding in rain', 'sent_id': 98188, 'sent': 'green & white scooter that women are riding in rain'}, {'tokens': ['white', 'color', 'motor', 'cycle'], 'raw': 'white color motor cycle', 'sent_id': 98189, 'sent': 'white color motor cycle'}], 'file_name': 'COCO_train2014_000000326685_147911.jpg', 'category_id': 4, 'ann_id': 147911, 'sent_ids': [98188, 98189], 'ref_id': 42804}][{'image_id': 316667, 'split': 'train', 'sentences': [{'tokens': ['a', 'bench', 'that', 'is', 'laying', 'on', 'the', 'ground'], 'raw': 'A bench that is laying on the ground', 'sent_id': 15264, 'sent': 'a bench that is laying on the ground'}, {'tokens': ['a', 'bench', 'on', 'which', 'the', 'guy', 'is', 'operating', 'the', 'skate', 'board'], 'raw': 'A bench on which the guy is operating the skate board', 'sent_id': 15265, 'sent': 'a bench on which the guy is operating the skate board'}], 'file_name': 'COCO_train2014_000000316667_1394952.jpg', 'category_id': 15, 'ann_id': 1394952, 'sent_ids': [15264, 15265], 'ref_id': 10907}]\n", "\n", "[{'image_id': 60170, 'split': 'train', 'sentences': [{'tokens': ['a', 'baby', 'elephant'], 'raw': 'A baby elephant', 'sent_id': 50475, 'sent': 'a baby elephant'}, {'tokens': ['an', 'elephant', 'that', 'is', 'relatively', 'small'], 'raw': 'An elephant that is relatively small.', 'sent_id': 50476, 'sent': 'an elephant that is relatively small'}], 'file_name': 'COCO_train2014_000000060170_582132.jpg', 'category_id': 22, 'ann_id': 582132, 'sent_ids': [50475, 50476], 'ref_id': 24381}]\n", "[{'image_id': 546366, 'split': 'train', 'sentences': [{'tokens': ['tennis', 'player', 'holding', 'racquet'], 'raw': 'tennis player holding racquet', 'sent_id': 1541, 'sent': 'tennis player holding racquet'}, {'tokens': ['a', 'woman', 'wearing', 'white'], 'raw': 'a woman wearing white.', 'sent_id': 1542, 'sent': 'a woman wearing white'}], 'file_name': 'COCO_train2014_000000546366_2150776.jpg', 'category_id': 1, 'ann_id': 2150776, 'sent_ids': [1541, 1542], 'ref_id': 45385}][{'image_id': 191994, 'split': 'train', 'sentences': [{'tokens': ['pizza', 'in', 'a', 'tray', 'ready', 'to', 'eat'], 'raw': 'pizza in a tray ready to eat', 'sent_id': 67556, 'sent': 'pizza in a tray ready to eat'}, {'tokens': ['a', 'sandwich', 'with', 'vegetables', 'on', 'a', 'white', 'bread', 'in', 'a', 'carrier'], 'raw': 'A sandwich with vegetables on a white bread in a carrier.', 'sent_id': 67557, 'sent': 'a sandwich with vegetables on a white bread in a carrier'}], 'file_name': 'COCO_train2014_000000191994_1539809.jpg', 'category_id': 51, 'ann_id': 1539809, 'sent_ids': [67556, 67557], 'ref_id': 30964}]\n", "\n", "[{'image_id': 239803, 'split': 'train', 'sentences': [{'tokens': ['a', 'teen', 'in', 'a', 'black', 'coat', 'to', 'the', 'right', 'of', 'two', 'other', 'teens'], 'raw': 'A teen in a black coat to the right of two other teens.', 'sent_id': 64376, 'sent': 'a teen in a black coat to the right of two other teens'}, {'tokens': ['a', 'young', 'gentleman', 'wearing', 'a', 'black', 'leather', 'jacket'], 'raw': 'A young gentleman wearing a black leather jacket', 'sent_id': 64377, 'sent': 'a young gentleman wearing a black leather jacket'}], 'file_name': 'COCO_train2014_000000239803_2166462.jpg', 'category_id': 1, 'ann_id': 2166462, 'sent_ids': [64376, 64377], 'ref_id': 29734}]\n", "[{'image_id': 235646, 'split': 'train', 'sentences': [{'tokens': ['the', 'giraffe', 'whose', 'head', 'is', 'not', 'visible'], 'raw': 'The giraffe whose head is not visible', 'sent_id': 50286, 'sent': 'the giraffe whose head is not visible'}, {'tokens': ['body', 'of', 'a', 'giraffe', 'stading', 'to', 'the', 'upper', 'right', 'of', 'the', 'group', 'against', 'the', 'fence'], 'raw': 'Body of a giraffe stading to the upper right of the group against the fence', 'sent_id': 50287, 'sent': 'body of a giraffe stading to the upper right of the group against the fence'}], 'file_name': 'COCO_train2014_000000235646_1414611.jpg', 'category_id': 25, 'ann_id': 1414611, 'sent_ids': [50286, 50287], 'ref_id': 24303}][{'image_id': 176385, 'split': 'train', 'sentences': [{'tokens': ['there', 'is', 'nobody', 'riding', 'this', 'skateboard'], 'raw': 'There is nobody riding this skateboard.', 'sent_id': 24280, 'sent': 'there is nobody riding this skateboard'}, {'tokens': ['a', 'skateboard', 'alone', 'on', 'the', 'ground'], 'raw': 'A skateboard alone on the ground.', 'sent_id': 24281, 'sent': 'a skateboard alone on the ground'}], 'file_name': 'COCO_train2014_000000176385_645613.jpg', 'category_id': 41, 'ann_id': 645613, 'sent_ids': [24280, 24281], 'ref_id': 14373}]\n", "\n", "[{'image_id': 131007, 'split': 'train', 'sentences': [{'tokens': ['black', 'chair', 'in', 'corner'], 'raw': 'black chair in corner', 'sent_id': 98162, 'sent': 'black chair in corner'}, {'tokens': ['a', 'black', 'recliner', 'chair'], 'raw': 'a black recliner chair', 'sent_id': 98163, 'sent': 'a black recliner chair'}], 'file_name': 'COCO_train2014_000000131007_115747.jpg', 'category_id': 63, 'ann_id': 115747, 'sent_ids': [98162, 98163], 'ref_id': 42794}]\n", "[{'image_id': 155995, 'split': 'train', 'sentences': [{'tokens': ['a', 'child', 'baseball', 'player', 'throwing', 'a', 'pitch', 'to', 'a', 'batter'], 'raw': 'A child baseball player throwing a pitch to a batter.', 'sent_id': 15351, 'sent': 'a child baseball player throwing a pitch to a batter'}, {'tokens': ['the', 'pitcher'], 'raw': 'the pitcher', 'sent_id': 15352, 'sent': 'the pitcher'}], 'file_name': 'COCO_train2014_000000155995_525361.jpg', 'category_id': 1, 'ann_id': 525361, 'sent_ids': [15351, 15352], 'ref_id': 10939}][{'image_id': 514025, 'split': 'train', 'sentences': [{'tokens': ['a', 'large', 'blue', 'and', 'white', 'crane', 'standing', 'on', 'the', 'dock'], 'raw': 'a large blue and white crane standing on the dock', 'sent_id': 43870, 'sent': 'a large blue and white crane standing on the dock'}, {'tokens': ['a', 'bird', 'that', 'is', 'standing', 'on', 'the', 'dock', 'with', 'long', 'legs', 'and', 'a', 'scrunched', 'up', 'neck'], 'raw': 'A bird that is standing on the dock with long legs and a scrunched up neck.', 'sent_id': 43871, 'sent': 'a bird that is standing on the dock with long legs and a scrunched up neck'}], 'file_name': 'COCO_train2014_000000514025_36534.jpg', 'category_id': 16, 'ann_id': 36534, 'sent_ids': [43870, 43871], 'ref_id': 21856}]\n", "[{'image_id': 485705, 'split': 'train', 'sentences': [{'tokens': ['middle', 'banana', 'in', 'the', 'bunch'], 'raw': 'middle banana in the bunch', 'sent_id': 30403, 'sent': 'middle banana in the bunch'}, {'tokens': ['the', 'bottom', 'banana', 'in', 'the', 'right', 'hand', 'picture'], 'raw': 'the bottom banana in the right hand picture', 'sent_id': 30404, 'sent': 'the bottom banana in the right hand picture'}], 'file_name': 'COCO_train2014_000000485705_1043190.jpg', 'category_id': 52, 'ann_id': 1043190, 'sent_ids': [30403, 30404], 'ref_id': 16660}]\n", "\n", "[{'image_id': 308758, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'wearing', 'a', 'chef', 'jacket'], 'raw': 'a man wearing a chef jacket', 'sent_id': 30897, 'sent': 'a man wearing a chef jacket'}, {'tokens': ['man', 'preparing', 'a', 'dish'], 'raw': 'Man preparing a dish', 'sent_id': 30898, 'sent': 'man preparing a dish'}], 'file_name': 'COCO_train2014_000000308758_196341.jpg', 'category_id': 1, 'ann_id': 196341, 'sent_ids': [30897, 30898], 'ref_id': 16860}][{'image_id': 54194, 'split': 'train', 'sentences': [{'tokens': ['a', 'lady', 'with', 'black', 'long', 'hair', 'in', 'a', 'yellow', 'shirt', ',', 'putting', 'butter', 'on', 'a', 'bread'], 'raw': 'a lady with black long hair in a yellow shirt, putting butter on a bread', 'sent_id': 52248, 'sent': 'a lady with black long hair in a yellow shirt , putting butter on a bread'}, {'tokens': ['a', 'woman', 'in', 'yellow', 'with', 'a', 'knife', 'in', 'her', 'hand', 'buttering', 'her', 'sub', 'sandwich'], 'raw': 'A woman in yellow with a knife in her hand buttering her sub sandwich.', 'sent_id': 52249, 'sent': 'a woman in yellow with a knife in her hand buttering her sub sandwich'}], 'file_name': 'COCO_train2014_000000054194_233992.jpg', 'category_id': 1, 'ann_id': 233992, 'sent_ids': [52248, 52249], 'ref_id': 25093}]\n", "\n", "[{'image_id': 563447, 'split': 'train', 'sentences': [{'tokens': ['the', 'kid', 'wearing', 'glasses'], 'raw': 'the kid wearing glasses', 'sent_id': 46806, 'sent': 'the kid wearing glasses'}, {'tokens': ['a', 'short', 'girl', 'standing', 'next', 'to', 'a', 'short', 'horse', 'wearing', 'a', 'belt', 'buckle', 'and', 'glasses'], 'raw': 'A short girl standing next to a short horse wearing a belt buckle and glasses', 'sent_id': 46807, 'sent': 'a short girl standing next to a short horse wearing a belt buckle and glasses'}], 'file_name': 'COCO_train2014_000000563447_186920.jpg', 'category_id': 1, 'ann_id': 186920, 'sent_ids': [46806, 46807], 'ref_id': 22985}]\n", "[{'image_id': 404592, 'split': 'train', 'sentences': [{'tokens': ['a', 'man', 'sitting', 'on', 'a', 'couch', 'between', 'two', 'other', 'people'], 'raw': 'A man sitting on a couch between two other people.', 'sent_id': 37227, 'sent': 'a man sitting on a couch between two other people'}, {'tokens': ['a', 'man', 'with', 'black', 'hair', 'wearing', 'a', 'black', 'shirt', 'and', 'holding', 'an', 'apple', 'laptop', 'between', 'a', 'man', 'and', 'a', 'woman'], 'raw': 'A man with black hair wearing a black shirt and holding an apple laptop between a man and a woman.', 'sent_id': 37228, 'sent': 'a man with black hair wearing a black shirt and holding an apple laptop between a man and a woman'}], 'file_name': 'COCO_train2014_000000404592_203428.jpg', 'category_id': 1, 'ann_id': 203428, 'sent_ids': [37227, 37228], 'ref_id': 19287}]\n", "[{'image_id': 36041, 'split': 'train', 'sentences': [{'tokens': ['a', 'girl', 'uitting', 'the', 'bike', 'with', 'boy', 'friend'], 'raw': 'A GIRL UITTING THE BIKE WITH BOY FRIEND', 'sent_id': 75019, 'sent': 'a girl uitting the bike with boy friend'}, {'tokens': ['the', 'girl', 'on', 'the', 'red', 'scooter'], 'raw': 'The girl on the red scooter', 'sent_id': 75020, 'sent': 'the girl on the red scooter'}], 'file_name': 'COCO_train2014_000000036041_199362.jpg', 'category_id': 1, 'ann_id': 199362, 'sent_ids': [75019, 75020], 'ref_id': 33798}]\n", "[{'image_id': 58105, 'split': 'train', 'sentences': [{'tokens': ['upside', 'down', 'chair'], 'raw': 'upside down chair', 'sent_id': 15047, 'sent': 'upside down chair'}, {'tokens': ['the', 'upside', 'down', 'chair'], 'raw': 'The upside down chair.', 'sent_id': 15048, 'sent': 'the upside down chair'}], 'file_name': 'COCO_train2014_000000058105_1587145.jpg', 'category_id': 62, 'ann_id': 1587145, 'sent_ids': [15047, 15048], 'ref_id': 10822}]\n", "[{'image_id': 309386, 'split': 'train', 'sentences': [{'tokens': ['a', 'food', 'on', 'tabule'], 'raw': 'a food on tabule', 'sent_id': 62106, 'sent': 'a food on tabule'}, {'tokens': ['a', 'table', 'with', 'pizza', 'slices', 'and', 'beer', 'on', 'it'], 'raw': 'A table with pizza slices and beer on it.', 'sent_id': 62107, 'sent': 'a table with pizza slices and beer on it'}], 'file_name': 'COCO_train2014_000000309386_1091316.jpg', 'category_id': 67, 'ann_id': 1091316, 'sent_ids': [62106, 62107], 'ref_id': 28845}][{'image_id': 419062, 'split': 'train', 'sentences': [{'tokens': ['a', 'medium', 'elephant', 'on', 'the', 'left'], 'raw': 'a medium elephant on the left', 'sent_id': 73909, 'sent': 'a medium elephant on the left'}, {'tokens': ['elephant', 'on', 'shore'], 'raw': 'elephant on shore', 'sent_id': 73910, 'sent': 'elephant on shore'}], 'file_name': 'COCO_train2014_000000419062_580921.jpg', 'category_id': 22, 'ann_id': 580921, 'sent_ids': [73909, 73910], 'ref_id': 33380}]\n", "\n", "[{'image_id': 325837, 'split': 'train', 'sentences': [{'tokens': ['a', 'glass', 'window', 'pain', 'behind', 'a', 'man', \"'\", 's'], 'raw': \"a glass window pain behind a man's\", 'sent_id': 68994, 'sent': \"a glass window pain behind a man ' s\"}, {'tokens': ['a', 'window', 'right', 'behind', 'the', 'man', \"'\", 's', 'head'], 'raw': \"a window right behind the man's head\", 'sent_id': 68995, 'sent': \"a window right behind the man ' s head\"}], 'file_name': 'COCO_train2014_000000325837_1732077.jpg', 'category_id': 1, 'ann_id': 1732077, 'sent_ids': [68994, 68995], 'ref_id': 31514}]\n", "[{'image_id': 258727, 'split': 'train', 'sentences': [{'tokens': ['a', 'sheep', 'eating', 'grass', 'facing', 'away', 'from', 'the', 'camera', 'and', 'closer', 'to', 'the', 'building'], 'raw': 'A sheep eating grass facing away from the camera and closer to the building.', 'sent_id': 95715, 'sent': 'a sheep eating grass facing away from the camera and closer to the building'}, {'tokens': ['there', 'is', 'one', 'sheep', 'is', 'eating', 'grass', 'infront', 'of', 'a', 'home'], 'raw': 'There is one sheep is eating grass infront of a home', 'sent_id': 95716, 'sent': 'there is one sheep is eating grass infront of a home'}], 'file_name': 'COCO_train2014_000000258727_62432.jpg', 'category_id': 20, 'ann_id': 62432, 'sent_ids': [95715, 95716], 'ref_id': 41806}]\n", "[{'image_id': 15262, 'split': 'train', 'sentences': [{'tokens': ['a', 'fork', 'on', 'a', 'plate'], 'raw': 'A fork on a plate', 'sent_id': 21138, 'sent': 'a fork on a plate'}, {'tokens': ['a', 'silver', 'fork'], 'raw': 'a silver fork', 'sent_id': 21139, 'sent': 'a silver fork'}], 'file_name': 'COCO_train2014_000000015262_1889611.jpg', 'category_id': 48, 'ann_id': 1889611, 'sent_ids': [21138, 21139], 'ref_id': 13163}]\n", "[{'image_id': 62336, 'split': 'train', 'sentences': [{'tokens': ['the', 'man', 'in', 'the', 'black', 'pullover', 'jacket', 'sitting', 'on', 'the', 'right'], 'raw': 'the man in the black pullover jacket sitting on the right', 'sent_id': 36534, 'sent': 'the man in the black pullover jacket sitting on the right'}, {'tokens': ['a', 'man', 'in', 'a', 'black', 'jacket', 'with', 'his', 'eyes', 'closed', ',', 'drinking', 'from', 'a', 'glass', 'of', 'wine'], 'raw': 'A man in a black jacket with his eyes closed, drinking from a glass of wine', 'sent_id': 36535, 'sent': 'a man in a black jacket with his eyes closed , drinking from a glass of wine'}], 'file_name': 'COCO_train2014_000000062336_1716597.jpg', 'category_id': 1, 'ann_id': 1716597, 'sent_ids': [36534, 36535], 'ref_id': 19021}]\n", "torch.Size([8, 3, 480, 480])\n", "torch.Size([8, 480, 480])\n", "torch.Size([8, 1, 20])\n", "torch.Size([8, 1, 20])\n", "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])\n" ] } ], "source": [ "# sample datas\n", "for i, (img, target, tensor_embeddings, attention_mask) in enumerate(data_loader):\n", " print(img.shape)\n", " print(target.shape)\n", " print(tensor_embeddings.shape)\n", " print(attention_mask.shape)\n", " print(attention_mask[0])\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "from matplotlib.ticker import MaxNLocator, FormatStrFormatter\n", "\n", "# Data from the table\n", "models = ['eqV2-S', 'eqV2-M', 'eqV2-L'] \n", "layers = [2, 4, 8, 3, 6, 12, 5, 10, 20]\n", "original_layers = [8, 12, 20]\n", "original_throughput = [9.4, 7.4, 4.9]\n", "ours_throughput = [40.4, 28.7, 16.8, 31.6, 22.3, 13.9, 24.1, 15.8, 9.4]\n", "\n", "# Create the plot\n", "fig, ax = plt.subplots(figsize=(8, 6))\n", "ax.scatter(original_layers, original_throughput, label='Original')\n", "ax.scatter(layers, ours_throughput, label='Ours')\n", "ax.set_xscale('log', base=2)\n", "ax.set_yscale('log',base=2)\n", "ax.set_title('Throughput Comparison', fontsize=16)\n", "ax.set_xlabel('Number of Layers', fontsize=13)\n", "ax.set_ylabel('Throughput (samples/sec)', fontsize=13)\n", "ax.legend(fontsize=12)\n", "\n", "# Set the tick locator and formatter to show integer values\n", "ax.xaxis.set_major_locator(MaxNLocator(integer=True))\n", "ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))\n", "ax.yaxis.set_major_locator(MaxNLocator(integer=True))\n", "ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))\n", "ax.grid(True)\n", "\n", "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "lavt", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 2 }