import copy as cp
import json, sys
from collections import defaultdict
from urllib.request import urlopen
import gradio as gr
import numpy as np
import pandas as pd
from meta_data import DEFAULT_TASK
def listinstr(lst, s):
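    """Return True if any string in lst occurs as a substring of s."""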
assert isinstance(lst, list)
for item in lst:
if item in s:
return True
return False
def load_results():
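    """Load the leaderboard results from the local results.json file."""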
#data = json.loads(urlopen(URL).read())
with open('results.json', 'r') as file:
data = json.load(file)
return data
def nth_large(val, vals):
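    """Return the 1-based rank of val among vals (rank 1 = largest value)."""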
return sum([1 for v in vals if v > val]) + 1
def format_timestamp(timestamp):
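    """Insert dots into a compact date string, e.g. '20250101' -> '2025.01.01'."""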
date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
return date
def BUILD_L1_DF(results, fields):
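    """Build the main (L1) leaderboard DataFrame over all tasks, plus its column check-box config."""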
check_box = {}
check_box['essential'] = ['Model']
    # edit here to change which datasets are shown by default
check_box['required'] = DEFAULT_TASK
check_box['all'] = DEFAULT_TASK
type_map = defaultdict(lambda: 'number')
check_box['type_map'] = type_map
df = generate_table(results, fields)
return df, check_box
def BUILD_L2_DF(results, task):
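    """Build the per-task (L2) DataFrame with one row per model and one column per benchmark."""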
results=results[task]
model_list=[]
benchmark_list=[]
all_fields=[]
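    # Collect benchmark and model names; CRUXEval and AutoPenBench report separate
    # autonomous / assisted scores and therefore get two display columns each.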
for benchmark in results:
if benchmark!='category':
benchmark_list+=[benchmark]
if benchmark not in ["CRUXEval","AutoPenBench"]:
all_fields+=[benchmark]
else:
all_fields+=[benchmark+' (autonomous)', benchmark+' (assisted)']
model_list+=list(results[benchmark].keys())
model_list=list(set(model_list))
res = defaultdict(list)
res['Model']=model_list
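    # Fill one column per benchmark, keeping every column the same length as model_list.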
for benchmark in benchmark_list:
if benchmark not in ["CRUXEval","AutoPenBench"]:
for model in model_list:
if model in results[benchmark]:
res[benchmark].append(results[benchmark][model])
else:
res[benchmark].append(None)
        else:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark + ' (autonomous)'].append(results[benchmark][model]['autonomous'])
                    res[benchmark + ' (assisted)'].append(results[benchmark][model]['assisted'])
                else:
                    res[benchmark + ' (autonomous)'].append(None)
                    res[benchmark + ' (assisted)'].append(None)
df = pd.DataFrame(res)
required_fields = all_fields
check_box = {}
check_box['essential'] = ['Model']
check_box['required'] = required_fields
check_box['all'] = all_fields
type_map = defaultdict(lambda: 'number')
check_box['type_map'] = type_map
return df, check_box
def generate_table(results, fields):
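    """Build a table with one row per model and one column per task.

    A task score is the mean of the model's available benchmark scores for that task.
    """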
model_list=[]
task_list=fields
benchmark_list=[]
for task in results:
for benchmark in results[task]:
if benchmark!='category':
benchmark_list+=[benchmark]
model_list+=list(results[task][benchmark].keys())
model_list=list(set(model_list))
res = defaultdict(list)
res['Model']=model_list
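    # average_score / cnt keep a running mean of each model's task scores
    # (only consumed by the commented-out 'Avg Score' / 'Avg Rank' columns below).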
average_score={}
cnt={}
for task in task_list:
task_score=[]
for model in model_list:
score=[]
for benchmark in results[task]:
if benchmark != 'category':
if model not in results[task][benchmark]:
score.append(None)
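                    # Dict-valued entries hold separate 'autonomous' and 'assisted' scores; use their mean.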
elif not isinstance(results[task][benchmark][model], (int, float)):
score.append((results[task][benchmark][model]["autonomous"]+results[task][benchmark][model]["assisted"])/2)
else:
score.append(results[task][benchmark][model])
            # A model's task score is the mean over the benchmarks it has results for,
            # or None if it has no results for this task at all.
            if all(item is None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            # Keep a running average across tasks, skipping tasks without a score.
            if score is not None:
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
                    cnt[model] += 1
task_score.append(score)
res[task]=task_score
#res['Avg Score']=[average_score[model] for model in model_list]
#res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]
df = pd.DataFrame(res)
# valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
# valid = valid.sort_values('Avg Score')
# valid = valid.iloc[::-1]
# if len(fields):
# missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
# missing = missing.iloc[::-1]
# df = pd.concat([valid, missing])
return df
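

# Minimal usage sketch (illustrative assumption: the Gradio app that actually wires
# these helpers into the leaderboard UI lives elsewhere in this Space).
if __name__ == '__main__':
    results = load_results()
    # Assumes DEFAULT_TASK holds task keys that exist in results.json.
    main_df, main_check_box = BUILD_L1_DF(results, DEFAULT_TASK)
    task_df, task_check_box = BUILD_L2_DF(results, DEFAULT_TASK[0])
    print(main_df.head())
    print(task_df.head())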