unit_test / data /preprocess /process_flickr_caption_json.py
herrius's picture
Upload 259 files
32b542e
import json
from collections import defaultdict
import jsonlines
# Dataset splits to convert, in processing order.
subsets = ["train", "val", "test"]

# Directory that the script makes sure exists before converting.
savepath = "flickr30k/annotations"

# Input jsonlines file for each split.
set2jsonline = dict(
    train="flickr30k/all_data_final_train_2014.jsonline",
    val="flickr30k/all_data_final_val_set0_2014.jsonline",
    test="flickr30k/all_data_final_test_set0_2014.jsonline",
)
import os
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there and avoids the race in a
# separate "exists?" check followed by "create".
os.makedirs(savepath, exist_ok=True)
# Output caption JSON file written for each split.
savename = dict(
    train="flickr30k/captions_train.json",
    val="flickr30k/captions_val.json",
    test="flickr30k/captions_test.json",
)
# imagefields = defaultdict(list)
# annotationsfields = defaultdict(list)
# Convert each split's jsonlines file into one JSON file holding two lists:
#   "images":      one {"filename", "id"} entry per input record
#   "annotations": one {"image_id", "id", "caption"} entry per sentence
for subset in subsets:
    image_entries = []
    caption_entries = []
    caption_id = 0  # caption ids restart at 0 for every split

    with jsonlines.open(set2jsonline[subset]) as reader:
        for record in reader:
            sentences = record["sentences"]
            image_entries.append({
                "filename": record["img_path"],
                "id": record["id"],
            })
            # Every sentence becomes its own annotation, linked back to the
            # record through the record's "id".
            for sentence in sentences:
                caption_entries.append({
                    "image_id": record["id"],
                    "id": caption_id,
                    "caption": sentence,
                })
                caption_id += 1

    data = {
        "images": image_entries,
        "annotations": caption_entries,
    }
    # Use a context manager so the output file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(savename[subset], "w") as out_file:
        json.dump(data, out_file)