# LLaVA-Med: llava/eval/summarize_gpt_review.py
import argparse
from collections import defaultdict

import pandas as pd

import util

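# `util` is the sibling helper module in llava/eval. Only two of its helpers are
# used here; they are assumed to behave roughly as sketched below (a minimal
# sketch, not necessarily the repo's exact implementation):
#
#     def load_file_jsonl(path):
#         with open(path) as f:
#             return [json.loads(line) for line in f]   # one JSON object per line
#
#     def get_avg(x):
#         return sum(float(y) for y in x) / len(x)      # mean of numeric strings
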
def get_domain(x):
    """Return the first imaging domain flagged in the record's one-hot 'domain' dict."""
    for domain in ['chest_xray', 'mri', 'histology', 'gross', 'ct_scan']:
        if x['domain'][domain]:
            return domain

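# get_domain() above and main() below assume each input record looks roughly
# like the following (a hypothetical example inferred from the field accesses
# in this file, not taken from actual data):
#
#     {"question_id": 0,
#      "type": "conversation",
#      "domain": {"chest_xray": true, "mri": false, "histology": false,
#                 "gross": false, "ct_scan": false},
#      "gpt_eval": "8 7\n<free-text justification ...>"}
#
# The first line of 'gpt_eval' carries two space-separated scores: the GPT-4
# reference answer's score first, the evaluated model's score second.
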
def main(args):
    scores_data = util.load_file_jsonl(args.scores_file)
    # The first line of 'gpt_eval' holds two space-separated scores:
    # the GPT-4 reference answer's score and the model prediction's score.
    predictions = [(x['question_id'], x['type'], get_domain(x),
                    x['gpt_eval'].split('\n')[0].split(' '))
                   for x in scores_data]

    # Bucket score pairs by question type, by imaging domain, and overall.
    score_type_dict = defaultdict(lambda: defaultdict(list))
    for q_id, q_type, domain, (a1_score, a2_score) in predictions:
        score_type_dict[q_type][1].append(a1_score)
        score_type_dict[q_type][2].append(a2_score)
        score_type_dict['overall'][1].append(a1_score)
        score_type_dict['overall'][2].append(a2_score)
        score_type_dict[domain][1].append(a1_score)
        score_type_dict[domain][2].append(a2_score)

    result = defaultdict(dict)
    for q_type, score_dict in score_type_dict.items():
        result[q_type]['gpt4_score'] = util.get_avg(score_dict[1])
        result[q_type]['pred_score'] = util.get_avg(score_dict[2])
        # Relative score: per-question prediction/GPT-4 ratio, averaged,
        # expressed as a percentage.
        result[q_type]['pred_relative_score'] = util.get_avg(
            [float(s2) / float(s1) for s1, s2 in zip(score_dict[1], score_dict[2])]) * 100
        result[q_type]['data_size'] = len(score_dict[1])

    # Report columns in a fixed order: question types, then domains, then overall.
    df = pd.DataFrame.from_dict(result).filter(
        ['conversation', 'detailed_description', 'chest_xray', 'mri',
         'histology', 'gross', 'ct_scan', 'overall'])
    print(df)

if __name__ == '__main__':
    parser = argparse.ArgumentParser("GPT-4 Multimodal Chat Eval Postprocessing", add_help=True)
    parser.add_argument("--scores-file", default="", metavar="FILE", help="input path to gpt-4 score file")
    args = parser.parse_args()
    main(args)
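
# Example invocation (the file name below is hypothetical; point --scores-file
# at the JSONL produced by the GPT-4 review step of the eval pipeline):
#
#     python llava/eval/summarize_gpt_review.py --scores-file eval_scores.jsonl
#
# The printed DataFrame has one column per question type / domain plus
# 'overall', and rows gpt4_score, pred_score, pred_relative_score, data_size.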