import json
import os
import random
from collections import Counter, defaultdict
from glob import glob

import pandas as pd
import pyarrow as pa
from tqdm import tqdm

from .glossary import normalize_word
def get_score(occurrences):
    """Map the number of annotators who gave an answer to a soft score.

    Approximates the VQAv2 accuracy rule min(n / 3, 1.0): an answer given
    by 3 or more of the 10 annotators counts as (almost) fully correct.
    """
    if occurrences == 0:
        return 0.0
    elif occurrences == 1:
        return 0.3
    elif occurrences == 2:
        return 0.6
    elif occurrences == 3:
        return 0.9
    else:
        return 1.0
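
# Illustrative check of the scoring rule (not part of the conversion itself):
#   get_score(2) -> 0.6   # two of ten annotators agreed
#   get_score(7) -> 1.0   # anything above three saturates at 1.0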
def path2rest(path, split, annotations, label2ans):
    """Turn one image file into a row of the output table."""
    # COCO filenames end in "..._000000123456.jpg"; the digits after the
    # last underscore (minus the ".jpg" suffix) are the image id.
    iid = int(path.split("/")[-1].split("_")[-1][:-4])

    with open(path, "rb") as fp:
        binary = fp.read()

    _annot = annotations[split][iid]
    _annot = list(_annot.items())
    qids, qas = [a[0] for a in _annot], [a[1] for a in _annot]
    questions = [qa[0] for qa in qas]

    # The test splits ship without ground-truth answers, so those columns
    # stay empty there.
    answers = [qa[1] for qa in qas] if "test" not in split else []
    answer_labels = [a["labels"] for a in answers] if "test" not in split else []
    answer_scores = [a["scores"] for a in answers] if "test" not in split else []
    answers = (
        [[label2ans[l] for l in al] for al in answer_labels]
        if "test" not in split
        else []
    )

    return [binary, questions, answers, answer_labels, answer_scores, iid, qids, split]
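
# A row therefore looks schematically like (all values illustrative):
#   [b"<jpeg bytes>", ["What color is the bus?"], [["blue"]],
#    [[7]], [[1.0]], 42, [420], "train"]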
def make_arrow(root, dataset_root):
    """Convert the raw VQAv2 release under `root` into Arrow files in `dataset_root`."""
    with open(f"{root}/v2_OpenEnded_mscoco_train2014_questions.json", "r") as fp:
        questions_train2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_val2014_questions.json", "r") as fp:
        questions_val2014 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test2015_questions.json", "r") as fp:
        questions_test2015 = json.load(fp)["questions"]
    with open(f"{root}/v2_OpenEnded_mscoco_test-dev2015_questions.json", "r") as fp:
        questions_test_dev2015 = json.load(fp)["questions"]

    with open(f"{root}/v2_mscoco_train2014_annotations.json", "r") as fp:
        annotations_train2014 = json.load(fp)["annotations"]
    with open(f"{root}/v2_mscoco_val2014_annotations.json", "r") as fp:
        annotations_val2014 = json.load(fp)["annotations"]
    # Index every question by image id, then question id:
    # annotations[split][image_id][question_id] = [question, {labels, scores}]
    annotations = dict()

    for split, questions in zip(
        ["train", "val", "test", "test-dev"],
        [
            questions_train2014,
            questions_val2014,
            questions_test2015,
            questions_test_dev2015,
        ],
    ):
        _annot = defaultdict(dict)
        for q in tqdm(questions):
            _annot[q["image_id"]][q["question_id"]] = [q["question"]]
        annotations[split] = _annot
    # Build the answer vocabulary from the normalized majority answers of
    # train and val; answers seen fewer than 9 times are dropped.
    all_major_answers = list()
    for split, annots in zip(
        ["train", "val"], [annotations_train2014, annotations_val2014]
    ):
        for q in tqdm(annots):
            all_major_answers.append(q["multiple_choice_answer"])

    all_major_answers = [normalize_word(word) for word in tqdm(all_major_answers)]
    counter = {k: v for k, v in Counter(all_major_answers).items() if v >= 9}
    ans2label = {k: i for i, k in enumerate(counter.keys())}
    label2ans = list(counter.keys())
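
    # For intuition (entries illustrative, they depend on answer frequencies):
    # ans2label might look like {"yes": 0, "no": 1, "blue": 2, ...} while
    # label2ans is simply the inverse lookup list ["yes", "no", "blue", ...].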
    # Attach soft labels/scores to every train/val question: count how many
    # of the 10 annotators gave each answer, keep in-vocabulary answers only.
    for split, annots in zip(
        ["train", "val"], [annotations_train2014, annotations_val2014]
    ):
        _annot = annotations[split]
        for q in tqdm(annots):
            answers = q["answers"]
            answer_count = {}
            for answer in answers:
                answer_ = answer["answer"]
                answer_count[answer_] = answer_count.get(answer_, 0) + 1

            labels = []
            scores = []
            for answer in answer_count:
                if answer not in ans2label:
                    continue
                labels.append(ans2label[answer])
                scores.append(get_score(answer_count[answer]))

            _annot[q["image_id"]][q["question_id"]].append(
                {"labels": labels, "scores": scores}
            )
    # Drop questions whose answers all fell outside the vocabulary, then
    # drop images left with no questions at all.
    for split in ["train", "val"]:
        filtered_annot = dict()
        for ik, iv in annotations[split].items():
            new_q = dict()
            for qk, qv in iv.items():
                if len(qv[1]["labels"]) != 0:
                    new_q[qk] = qv
            if len(new_q) != 0:
                filtered_annot[ik] = new_q
        annotations[split] = filtered_annot
    for split in ["train", "val", "test", "test-dev"]:
        annot = annotations[split]
        split_name = {
            "train": "train2014",
            "val": "val2014",
            "test": "test2015",
            "test-dev": "test2015",
        }[split]
        paths = list(glob(f"{root}/{split_name}/*.jpg"))
        random.shuffle(paths)
        annot_paths = [
            path
            for path in paths
            if int(path.split("/")[-1].split("_")[-1][:-4]) in annot
        ]

        if len(paths) == len(annot_paths):
            print("all images have question annotations")
        else:
            print("not all images have question annotations")
        print(len(paths), len(annot_paths), len(annot))

        bs = [
            path2rest(path, split, annotations, label2ans)
            for path in tqdm(annot_paths)
        ]
        dataframe = pd.DataFrame(
            bs,
            columns=[
                "image",
                "questions",
                "answers",
                "answer_labels",
                "answer_scores",
                "image_id",
                "question_id",
                "split",
            ],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/vqav2_{split}.arrow", "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
    # Carve the last 1000 val rows off into a held-out "rest" file and keep
    # the remainder as extra training data ("trainable" val).
    table = pa.ipc.RecordBatchFileReader(
        pa.memory_map(f"{dataset_root}/vqav2_val.arrow", "r")
    ).read_all()

    pdtable = table.to_pandas()

    df1 = pdtable[:-1000]
    df2 = pdtable[-1000:]

    df1 = pa.Table.from_pandas(df1)
    df2 = pa.Table.from_pandas(df2)

    with pa.OSFile(f"{dataset_root}/vqav2_trainable_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df1.schema) as writer:
            writer.write_table(df1)

    with pa.OSFile(f"{dataset_root}/vqav2_rest_val.arrow", "wb") as sink:
        with pa.RecordBatchFileWriter(sink, df2.schema) as writer:
            writer.write_table(df2)
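

# Usage sketch (paths are hypothetical; adjust to your local VQAv2 layout).
# Because of the relative import above, run this as a package module
# (e.g. `python -m <package>.write_vqa`) rather than as a standalone script.
if __name__ == "__main__":
    # Expects the raw VQAv2 question/annotation JSONs plus the train2014/,
    # val2014/ and test2015/ image folders under the first path; writes
    # vqav2_{train,val,test,test-dev,trainable_val,rest_val}.arrow to the second.
    make_arrow("/path/to/vqav2", "/path/to/arrow")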