"""Turn the raw VQAv2 question/annotation jsons plus the matching COCO
image folders into one Arrow table per split."""

import json
import os
import random
from collections import Counter, defaultdict
from glob import glob

import pandas as pd
import pyarrow as pa
from tqdm import tqdm

from .glossary import normalize_word


def get_score(occurrences):
    """Soft VQA target for an answer seen `occurrences` times among the
    ten human answers; a step-wise version of min(occurrences / 3, 1)."""
    if occurrences == 0:
        return 0.0
    elif occurrences == 1:
        return 0.3
    elif occurrences == 2:
        return 0.6
    elif occurrences == 3:
        return 0.9
    else:
        return 1.0

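# Example (hypothetical tally): if 3 of the 10 annotators answered "yes",
# get_score(3) == 0.9; any answer given four or more times earns full credit.
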
def path2rest(path, split, annotations, label2ans):
    """Pack one image and all of its QA annotations into a flat row."""
    # COCO files are named like COCO_<split>_<12-digit id>.jpg: take the
    # last underscore-separated chunk and strip the ".jpg" suffix.
    iid = int(path.split("/")[-1].split("_")[-1][:-4])

    with open(path, "rb") as fp:
        binary = fp.read()

    _annot = annotations[split][iid]
    _annot = list(_annot.items())
    qids, qas = [a[0] for a in _annot], [a[1] for a in _annot]
    questions = [qa[0] for qa in qas]

    # The test splits ship without ground-truth answers, so their answer
    # fields stay empty.
    answers = [qa[1] for qa in qas] if "test" not in split else []
    answer_labels = (
        [a["labels"] for a in answers] if "test" not in split else []
    )
    answer_scores = (
        [a["scores"] for a in answers] if "test" not in split else []
    )
    answers = (
        [[label2ans[l] for l in al] for al in answer_labels]
        if "test" not in split
        else []
    )

    return [binary, questions, answers, answer_labels, answer_scores, iid, qids, split]

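# Example (hypothetical path): "root/val2014/COCO_val2014_000000262148.jpg"
# parses to iid == 262148, and the returned row carries every question and
# answer recorded for that image.
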
def make_arrow(root, dataset_root):
    """Read the VQAv2 jsons and images under `root` and write one Arrow
    file per split into `dataset_root`."""
    # Questions exist for all four splits; human answers only for train/val.
    with open(f"{root}/v2_OpenEnded_mscoco_train2014_questions.json", "r") as fp:
        questions_train2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "r") as fp:
        questions_val2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test2015_questions.json", "r") as fp:
        questions_test2015 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json", "r") as fp:
        questions_test_dev2015 = json.load(fp)["questions"]

    with open(f"{root}/v2_mscoco_train2014_annotations.json", "r") as fp:
        annotations_train2014 = json.load(fp)["annotations"]
    with open(f"{root}/v2_mscoco_val2014_annotations.json", "r") as fp:
        annotations_val2014 = json.load(fp)["annotations"]

    annotations = dict()

    # Index every question as annotations[split][image_id][question_id],
    # starting each entry as a one-element list holding the question text.
    for split, questions in zip(
        ["train", "val", "test", "test-dev"],
        [
            questions_train2014,
            questions_val2014,
            questions_test2015,
            questions_test_dev2015,
        ],
    ):
        _annot = defaultdict(dict)
        for q in tqdm(questions):
            _annot[q["image_id"]][q["question_id"]] = [q["question"]]

        annotations[split] = _annot

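    # Shape of the index at this point (illustrative, not real data):
    #   annotations["train"][image_id][question_id] == ["What color is ...?"]
    # For train/val, a labels/scores dict is appended to that list below.
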
    # Build the answer vocabulary from the most common ("multiple choice")
    # answer of every train/val question, keeping normalized answers that
    # occur at least 9 times.
    all_major_answers = list()

    for split, annots in zip(
        ["train", "val"], [annotations_train2014, annotations_val2014],
    ):
        for q in tqdm(annots):
            all_major_answers.append(q["multiple_choice_answer"])

    all_major_answers = [normalize_word(word) for word in tqdm(all_major_answers)]
    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())

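    # ans2label and label2ans are inverse views of the same vocabulary:
    # label2ans[ans2label[a]] == a for every kept answer a.
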
    # Attach soft labels to every train/val question: tally the ten human
    # answers, keep the in-vocabulary ones, and score each with get_score.
    for split, annots in zip(
        ["train", "val"], [annotations_train2014, annotations_val2014],
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            answers = q["answers"]
            answer_count = {}
            for answer in answers:
                answer_ = answer["answer"]
                answer_count[answer_] = answer_count.get(answer_, 0) + 1

            labels = []
            scores = []
            for answer in answer_count:
                if answer not in ans2label:
                    continue
                labels.append(ans2label[answer])
                score = get_score(answer_count[answer])
                scores.append(score)

            _annot[q["image_id"]][q["question_id"]].append(
                {"labels": labels, "scores": scores}
            )

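    # Example (hypothetical tally): {"yes": 7, "no": 3} yields
    # labels [ans2label["yes"], ans2label["no"]] and scores [1.0, 0.9].
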
    # Drop train/val questions whose answers all fell outside the
    # vocabulary, then drop images left with no questions at all.
    for split in ["train", "val"]:
        filtered_annot = dict()
        for ik, iv in annotations[split].items():
            new_q = dict()
            for qk, qv in iv.items():
                if len(qv[1]["labels"]) != 0:
                    new_q[qk] = qv
            if len(new_q) != 0:
                filtered_annot[ik] = new_q
        annotations[split] = filtered_annot

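    # After filtering, every remaining train/val question has at least one
    # in-vocabulary answer, so its soft-target vector is never all zeros.
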
    for split in [
        "train",
        "val",
        "test",
        "test-dev",
    ]:
        annot = annotations[split]
        # Map each split to its COCO image folder; "test" and "test-dev"
        # both draw from test2015.
        split_name = {
            "train": "train2014",
            "val": "val2014",
            "test": "test2015",
            "test-dev": "test2015",
        }[split]
        paths = list(glob(f"{root}/{split_name}/*.jpg"))
        # Shuffle so the written rows are not ordered by filename.
        random.shuffle(paths)
        # Keep only images that have questions indexed for this split.
        annot_paths = [
            path
            for path in paths
            if int(path.split("/")[-1].split("_")[-1][:-4]) in annot
        ]

        if len(paths) == len(annot_paths):
            print("all images have QA annotations")
        else:
            print("not all images have QA annotations")
        print(len(paths), len(annot_paths), len(annot))

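        # Each row packs one image with all of its questions:
        # [binary, questions, answers, answer_labels, answer_scores,
        #  image_id, question_ids, split].
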
        bs = [
            path2rest(path, split, annotations, label2ans) for path in tqdm(annot_paths)
        ]

        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "split",
            ],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        # Write the split as an Arrow IPC file.
        with pa.OSFile(f"{dataset_root}/vqav2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)

    # Re-read the freshly written val split and cut it in two: all but the
    # last 1,000 images become "trainable_val", the last 1,000 "rest_val".
    table = pa.ipc.RecordBatchFileReader(
        pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r")
    ).read_all()

    pdtable = table.to_pandas()

    df1 = pdtable[:-1000]
    df2 = pdtable[-1000:]

    df1 = pa.Table.from_pandas(df1)
    df2 = pa.Table.from_pandas(df2)

    with pa.OSFile(f"{dataset_root}/vqav2_trainable_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df1.schema) as writer:
            writer.write_table(df1)

    with pa.OSFile(f"{dataset_root}/vqav2_rest_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df2.schema) as writer:
            writer.write_table(df2)
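

# A minimal invocation sketch (module and directory names are assumptions;
# adjust to your package layout, since the relative import above requires
# importing this file as part of a package):
#
#     from yourpkg.write_vqa import make_arrow
#     make_arrow(root="/data/vqav2", dataset_root="/data/vqav2_arrows")
#
# `root` must hold the four question jsons, the two annotation jsons, and
# the train2014/val2014/test2015 image folders from the VQAv2 download.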