## data structure

* imagenet 1k

```
data = {
    'input_sample_list': [
        {
            'data': torch.rand(bs, 3, 224, 224, dtype=torch.float32),
            'invalid_mask': None,
            'modality': 'image',
            'data_type': 'input',
            'sample_info': {
                'id': list(range(bs)),
                'path': ['hah' for _ in range(bs)]
            }
        },
    ],
    'target_sample_list': [],
    'target_idx_list': [torch.randint(0, 1000, (bs, ))],
    'target_set_list': ['ImageNet22k'],
    'shared_target_sets': {
        'ImageNet22k': [{
            'data': torch.randint(0, 49411, (1000, 11)),
            'invalid_mask': torch.zeros(1000, 11, dtype=torch.bool),
            'modality': 'text',
            'data_type': 'target',
            'sample_info': {
                'distributed': True,
                'total_num': 1000,
            }
        }]
    },
    'task_info': {
        'task_name': 'imagenet',
        'task_type': 'image_classification',
        'dataset_name': 'ImageNet22k',
        'batchsize': None,
        'sampling_ratio': None
    }
}
```

* mscoco caption

```
data = {
    'input_sample_list': [
        {
            'data': torch.rand(bs, 3, 224, 224, dtype=torch.float32),
            'invalid_mask': None,
            'modality': 'image',
            'data_type': 'input',
            'sample_info': [{
                'id': id,
                'path': 'hahah',
                'bs': bs
            } for _ in range(bs)]
        },
        {
            'data': torch.randint(0, 49411, (bs, 31 * 2)),
            'invalid_mask': torch.zeros(bs, 31 * 2, dtype=torch.bool),
            'modality': 'text',
            'data_type': 'input',
            'sample_info': [{
                'pe_index': torch.cat([torch.arange(31), torch.arange(31)], dim=0)
            } for _ in range(bs)]
        },
    ],
    'target_sample_list': [],
    'target_idx_list': [torch.randint(0, 49411, (bs, 31))],
    'target_set_list': ['Vocab_Word'],
    'shared_target_sets': {
        'Vocab_Word': [{
            'data': torch.randint(0, 49411, (49411, 2)),
            'invalid_mask': None,
            'modality': 'text',
            'data_type': 'target',
            'sample_info': {
                'distributed': True,
                'total_num': 49411,
            }
        }]
    },
    'task_info': {
        'task_name': 'mscoco_caption',
        'task_type': 'image_caption',
        'dataset_name': 'MSCOCO',
        'batchsize': None,
        'sampling_ratio': None
    }
}
```

* text_mlm

```
data = {
    'input_sample_list': [
        {
            'data': torch.randint(0, 49411, (bs, 128)),
            'invalid_mask': torch.zeros(bs, 128, dtype=torch.bool),
            'modality': 'text',
            'data_type': 'input',
            'sample_info': {
                'seq_length': 128
            }
        },
    ],
    'target_sample_list': [],
    'target_idx_list': [torch.randint(0, 49411, (bs, 128))],  # most are -1
    'target_set_list': ['Vocab_Word'],
    'shared_target_sets': {
        'Vocab_Word': [{
            'data': torch.randint(0, 49411, (49411, 2)),
            'invalid_mask': None,
            'modality': 'text',
            'data_type': 'target',
            'sample_info': {
                'distributed': True,
                'total_num': 49411,
            }
        }]
    },
    'task_info': {
        'task_name': 'bookswiki_pretrain',
        'task_type': 'text_mlm',
        'dataset_name': 'BooksWiki',
        'batchsize': None,
        'sampling_ratio': None
    }
}
```

* mscoco retrieval

```
data = {
    'input_sample_list': [
        {
            'data': torch.rand(bs, 3, 224, 224, dtype=torch.float32),
            'invalid_mask': None,
            'modality': 'image',
            'sample_info': {
                'id': list(range(bs)),
                'path': ['hah' for _ in range(bs)]
            }
        },
    ],
    'target_sample_list': [
        {
            'data': torch.randint(0, 49411, (bs, 30)),
            'invalid_mask': torch.zeros(bs, 30, dtype=torch.bool),
            'modality': 'text',
            'sample_info': {}
        },
    ],
    'target_idx_list': [],
    'target_set_list': [],
    'shared_target_sets': {
        'ImageNet22k': [{
            'data': torch.randint(0, 49411, (1000, 11)),
            'invalid_mask': torch.zeros(1000, 11, dtype=torch.bool),
            'modality': 'text',
            'sample_info': {
                'distributed': True,
                'total_num': 1000,
            }
        }]
    },
    'task_info': {
        'task_name': 'mscoco_retrieve',
        'task_type': 'image_retrieval',
        'dataset_name': 'MSCOCO',
        'batchsize': None,
        'sampling_ratio': None
    }
}
```