import os.path as osp
import pandas as pd
from tqdm import tqdm
from vlmeval.evaluate.misc import build_judge
from vlmeval.utils import can_infer, track_progress_rich, TSVDataset
from vlmeval.smp import *
import numpy as np

INTERNAL = os.environ.get('INTERNAL', 0)

abbrs = {
    'coarse_perception': 'CP',
    'finegrained_perception (instance-level)': 'FP-S',
    'finegrained_perception (cross-instance)': 'FP-C',
    'logic_reasoning': 'LR',
    'relation_reasoning': 'RR',
    'attribute_reasoning': 'AR'
}
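
# NOTE: besides the explicit imports above, this module relies on the star import from
# vlmeval.smp, which (in the vlmeval codebase) is expected to re-export the helpers used
# below without further imports: os, string, argparse, random as `rd`,
# collections.defaultdict, plus utilities such as load, dump, listinstr, cn_string,
# build_option_str, get_logger, gpt_key_set, timestr and load_env. Treat the exact
# re-export list as an assumption about vlmeval.smp; import these names explicitly if
# you lift functions out of this file.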

def MMMU_preproc(data):
    logger = get_logger('Evaluation')
    cnt = 0
    As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer'])
    lt = len(data)
    for i in range(lt):
        if pd.isna(As[i]):
            As[i] = Ans[i]
            Bs[i] = 'Other Answers'
            cnt += 1
    logger.info(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ')
    data['A'] = As
    data['B'] = Bs
    return data
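
# Illustration of the MMMU reformulation above (row values are placeholders, not real
# MMMU data): a row with A = NaN and answer = '<free-form answer>' becomes a two-option
# item with A = '<free-form answer>' and B = 'Other Answers', so its correct letter is
# always 'A'. multiple_choice_eval below relies on this when it remaps non-letter
# entries of answer_map to 'A' for MMMU.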

def report_acc(df):
    # assert group in [None, 'category', 'l2-category']
    res = defaultdict(list)

    if 'split' in df:
        splits = list(set(df['split']))
        res['split'] = splits
    else:
        df['split'] = ['none'] * len(df)
        res['split'] = ['none']

    for group in [None, 'l2-category', 'category']:
        if group is None:
            res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']]
        elif group not in df:
            continue
        else:
            abilities = list(set(df[group]))
            abilities.sort()
            for ab in abilities:
                ab_name = abbrs[ab] if ab in abbrs else ab
                sub_df = df[df[group] == ab]
                res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]
    return pd.DataFrame(res)
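
# Shape of the accuracy table returned by report_acc (values and split names below are
# placeholders): one row per split, an 'Overall' column, then one column per category /
# l2-category present in the data, abbreviated via `abbrs` when possible, e.g.
#   split  Overall    CP  FP-S  FP-C    LR    RR    AR
#   dev       0.75  0.80  0.71  0.68  0.60  0.74  0.77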

def build_prompt(question, options, prediction):
    tmpl = (
        'You are an AI assistant who will help me to match '
        'an answer with several options of a single-choice question. '
        'You are provided with a question, several options, and an answer, '
        'and you need to find which option is most similar to the answer. '
        'If the meanings of all options are significantly different from the answer, output Z. '
        'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
        'Example 1: \n'
        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
        'Answer: a cute teddy bear\nYour output: A\n'
        'Example 2: \n'
        'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
        'Answer: Spider\nYour output: Z\n'
        'Example 3: \n'
        'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
    )
    return tmpl.format(question, options, prediction)

# Chinese counterpart of build_prompt, used when cn_string() detects a Chinese question.
def build_prompt_cn(question, options, prediction):
    tmpl = (
        '你是一个帮助我匹配答案与单选题中多个选项的 AI 助手。'
        '你会被提供:一个问题,多个选项,一个答案。你的任务是找到与答案意义最相近的选项。'
        '如果所有选项的意义都与答案显著不同,则输出 Z。'
        '你应该输出一个单个的大写字母,例如 A, B, C, D(如果它们是有效选项),或 Z。'
        '例 1:'
        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 一只可爱的泰迪熊\n输出: A\n'
        '例 2: \n'
        '问题: 图中最主要的物体是什么?\n选项: A. 泰迪熊 B. 兔子 C. 猫 D. 狗\n答案: 蜘蛛\n输出: Z\n'
        '例 3: \n'
        '问题: {}?\n选项: {}\n答案: {}\n输出: '
    )
    return tmpl.format(question, options, prediction)

def build_choices(item):
    ret = {}
    for ch in string.ascii_uppercase:
        if ch in item and (not pd.isna(item[ch])):
            ret[ch] = item[ch]
    return ret


def prefetch_answer(item):
    choices = build_choices(item)
    return can_infer(item['prediction'], choices)

def extract_answer_from_item(model, item):
    logger = get_logger('Evaluation')
    # Returns a dict with the matched option letter ('opt') and a matching log ('log').
    choices = build_choices(item)
    option_str = build_option_str(choices)

    if cn_string(item['question']):
        prompt = build_prompt_cn(item['question'], option_str, item['prediction'])
    else:
        prompt = build_prompt(item['question'], option_str, item['prediction'])
    retry = 3

    ret = can_infer(item['prediction'], choices)
    if ret:
        return dict(opt=ret, log=item['prediction'])

    while retry:
        ans = model.generate(prompt)
        if 'Failed to obtain answer via API' in ans:
            logger.warning('GPT API failed to answer. ')
        else:
            ret = can_infer(ans, choices)
            if ret:
                return dict(opt=ret, log=ans)
            else:
                logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
        retry -= 1

        if retry == 0:
            options = list(choices) + (['Z'] if 'Z' not in choices else [])
            return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ')

def prefetch_sub_data(sub_data, answer_map, verbose=False):
    """Try to grade a group of rolling copies of one question without calling the judge.

    Returns dict(hit=0, log=...) as soon as a pre-fetched choice contradicts the ground
    truth, dict(hit=1, ...) if every rolling copy already matches, and None when some
    predictions could not be pre-fetched. With verbose=True the result is returned as a
    tuple (res, GT, PRED) so the caller can reuse the parsed answers.
    """
    lt = len(sub_data)
    GT, PRED = [], []
    for i in range(lt):
        item = sub_data.iloc[i]
        idx = item['index']
        GT.append(answer_map[idx])
        PRED.append(prefetch_answer(item))
        if PRED[-1] and (GT[-1] != PRED[-1]):
            log = (
                f'Failed in Prefetching Rolling {i}: Answer is {GT[-1]}, '
                f"Prediction is {item['prediction']}, Pre-fetched is {PRED[-1]}. "
            )
            res = dict(hit=0, log=log)
            # Keep the return shape consistent with the verbose flag, so callers that
            # unpack (res, GT, PRED) never receive a bare dict.
            return (res, GT, PRED) if verbose else res
    flag = True
    for g, p in zip(GT, PRED):
        if g != p:
            flag = False
    ret = (dict(hit=1, log='Succeed During Pre-fetching'), ) if flag else (None, )
    ret = ret + (GT, PRED) if verbose else ret
    return ret if len(ret) > 1 else ret[0]
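
# Grouping convention assumed by the evaluation below: all rolling copies of a question
# share the same value of `index % 1e6` (MMBench-style circular evaluation, where the
# options of one question are presented in several shifted orders). A question counts as
# a hit only if every rolling copy is answered correctly, which eval_sub_data enforces.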

def eval_sub_data(model, sub_data, answer_map):
    res, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True)
    if res is not None:
        return res

    lt = len(sub_data)
    log = ''
    for i in range(lt):
        if PRED[i]:
            log += f'Rolling {i} Matched.\n'
        else:
            res = extract_answer_from_item(model, sub_data.iloc[i])
            opt, match_log = res['opt'], res['log']
            PRED[i] = opt
            if PRED[i] != GT[i]:
                log += (
                    f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; "
                    f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n'
                )
                return dict(hit=0, log=log)
            else:
                log += (
                    f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, "
                    f'Pre-fetched is {PRED[i]}.\n'
                )
    return dict(hit=1, log=log)

def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16):
    prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups]
    remain = []
    for dg, pf in zip(data_groups, prefetched):
        if pf:
            # Key results by the base question index (rolling copies share `index % 1e6`).
            result[int(dg.iloc[0]['index'] % 1e6)] = pf
        else:
            remain.append(dg)
    dump(result, result_file)

    tups = [(model, x, answer_map) for x in remain]
    keys = [int(x.iloc[0]['index'] % 1e6) for x in remain]
    if len(tups) == 0:
        return

    if model is None:
        logger = get_logger('Evaluation')
        logger.warning('Exact Matching mode, will not do GPT-based answer matching. ')
        for k in keys:
            result[k] = dict(
                hit=0, log='Failed in Prefetch, no GPT-based answer matching under `exact_matching` policy.')
        dump(result, result_file)
        return

    res = track_progress_rich(
        eval_sub_data,
        tups,
        nproc=nproc,
        chunksize=nproc,
        save=result_file,
        keys=keys)
    result = load(result_file)
    for k, v in zip(keys, res):
        if k in result:
            assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log']
        else:
            result[k] = v
    dump(result, result_file)

def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs):
    logger = get_logger('Evaluation')
    # assert dataset is not None
    dataset_map = {
        'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
        'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
    }
    if dataset in dataset_map:
        dataset = dataset_map[dataset]
    nproc = judge_kwargs.pop('nproc', 4)

    if listinstr(['mmbench', 'ccbench'], dataset.lower()):
        data = load(eval_file)
        data['index'] = [int(x) for x in data['index']]
        dump(data, eval_file)

    rd.seed(2680)
    suffix = eval_file.split('.')[-1]
    model = judge_kwargs['model']
    assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125']
    name_str_map = {
        'chatgpt-0613': 'openai',
        'gpt-4-0125': 'gpt4'
    }
    name_str = name_str_map[model] if model in name_str_map else model

    if model == 'exact_matching':
        model = None
    else:
        if INTERNAL or gpt_key_set():
            model = build_judge(**judge_kwargs)
        else:
            logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

    logger.info(f'Evaluating {eval_file}')
    result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
    result = {}
    if osp.exists(result_file):
        result = load(result_file)

    data = load(eval_file)
    data = data.sort_values(by='index')
    data['prediction'] = [str(x) for x in data['prediction']]
    for k in data.keys():
        data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

    if dataset != 'default':
        meta = TSVDataset(dataset).data
    else:
        logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. ')
        meta = load(eval_file)
        assert 'index' in meta and 'answer' in meta, 'Essential columns missing in the eval_file.'

    answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
    cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None
    l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None
    split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None

    if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]):
        cate_map = None
    if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]):
        l2_cate_map = None
    if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]):
        split_map = None

    if listinstr(['MMMU'], dataset):
        data = MMMU_preproc(data)
        answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}

    data = data[data['index'].isin(answer_map)]
    data_main = data[data['index'] < int(1e6)]
    meta_idx_set = set(meta['index'])
    data_main = data_main[data_main['index'].isin(meta_idx_set)]

    lt = len(data_main)
    hit, tot = 0, 0

    data_groups = []
    for i in tqdm(range(lt)):
        # Dealing with the normal part
        item_main = data_main.iloc[i]
        idx = item_main['index']

        if idx in result:
            correct = result[idx]['hit']
            assert correct in [0, 1]
            hit += correct
            tot += 1
            continue

        sub_data = data[data['index'] % int(1e6) == idx]
        data_groups.append(sub_data)

    if len(data_groups):
        eval_data_groups(
            model=model,
            data_groups=data_groups,
            answer_map=answer_map,
            nproc=nproc,
            result=result,
            result_file=result_file)

    tmp_pth = f'/tmp/{timestr()}.xlsx'
    dump(data_main, tmp_pth)
    data_main = load(tmp_pth)

    res = load(result_file)
    indices = data_main['index']

    data_main['hit'] = [res[i]['hit'] for i in indices]
    data_main['log'] = [res[i]['log'] for i in indices]

    main_idx = data_main['index']
    if cate_map is not None:
        data_main['category'] = [cate_map[i] for i in main_idx]
    if l2_cate_map is not None:
        data_main['l2-category'] = [l2_cate_map[i] for i in main_idx]
    if split_map is not None:
        data_main['split'] = [split_map[i] for i in indices]

    dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
    data_main = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

    acc = report_acc(data_main)
    score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
    dump(acc, score_file)
    logger.info(f'multiple_choice_eval successfully finished evaluating {eval_file}, results saved in {score_file}')
    logger.info('Score: ')
    logger.info(acc)
    return acc

def parse_args():
    parser = argparse.ArgumentParser(description='Evaluate multiple-choice predictions with an (optional) LLM judge. ')
    parser.add_argument('data', type=str, help='The prediction file to evaluate, in excel / tsv / json format. ')
    parser.add_argument(
        '--model',
        type=str,
        help='The LLM (GPT) used as the judge for answer matching. ',
        default='chatgpt-0613',
        choices=['chatgpt-0613', 'exact_matching', 'gpt-4-0125'])
    parser.add_argument(
        '--dataset',
        type=str,
        default='default',
        help='The dataset to evaluate')
    parser.add_argument('--nproc', type=int, default=6)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    load_env()
    args = parse_args()
    judge_kwargs = dict(model=args.model, nproc=args.nproc, verbose=args.verbose)
    if 'OPENAI_API_KEY_JUDGE' in os.environ and os.environ['OPENAI_API_KEY_JUDGE']:
        judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
    if 'OPENAI_API_BASE_JUDGE' in os.environ and os.environ['OPENAI_API_BASE_JUDGE']:
        judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']
    acc = multiple_choice_eval(eval_file=args.data, dataset=args.dataset, **judge_kwargs)
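
# Example invocation, assuming this module is run directly as a script (the prediction
# file name below is a placeholder for illustration):
#   python multiple_choice.py MMBench_DEV_EN_predictions.xlsx \
#       --dataset MMBench_DEV_EN --model exact_matching --nproc 8
# To use an LLM judge (--model chatgpt-0613 or gpt-4-0125), provide an API key that
# gpt_key_set() recognizes (typically OPENAI_API_KEY), or the judge-specific
# OPENAI_API_KEY_JUDGE / OPENAI_API_BASE_JUDGE variables handled above; otherwise the
# script falls back to exact matching.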