import copy as cp
import json
import sys
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_TASK


def listinstr(lst, s):
    # Return True if any item in lst occurs as a substring of s.
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results():
    # data = json.loads(urlopen(URL).read())
    with open('results.json', 'r') as file:
        data = json.load(file)
    return data


def nth_large(val, vals):
    # 1-based rank of val among vals (1 = largest).
    return sum([1 for v in vals if v > val]) + 1


def format_timestamp(timestamp):
    # Convert 'YYYYMMDD' into 'YYYY.MM.DD'.
    date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
    return date


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Model']
    # revise here to set the default datasets
    check_box['required'] = DEFAULT_TASK
    check_box['all'] = DEFAULT_TASK
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map
    df = generate_table(results, fields)
    return df, check_box


def BUILD_L2_DF(results, task):
    results = results[task]
    model_list = []
    benchmark_list = []
    all_fields = []
    for benchmark in results:
        if benchmark != 'category':
            benchmark_list += [benchmark]
            if benchmark not in ['CRUXEval', 'AutoPenBench']:
                all_fields += [benchmark]
            else:
                # These benchmarks report separate autonomous / assisted scores.
                all_fields += [benchmark + ' (autonomous)', benchmark + ' (assisted)']
            model_list += list(results[benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list
    for benchmark in benchmark_list:
        if benchmark not in ['CRUXEval', 'AutoPenBench']:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark].append(results[benchmark][model])
                else:
                    res[benchmark].append(None)
        else:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark + ' (autonomous)'].append(results[benchmark][model]['autonomous'])
                    res[benchmark + ' (assisted)'].append(results[benchmark][model]['assisted'])
                else:
                    res[benchmark + ' (autonomous)'].append(None)
                    res[benchmark + ' (assisted)'].append(None)
    df = pd.DataFrame(res)

    required_fields = all_fields
    check_box = {}
    check_box['essential'] = ['Model']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):
    model_list = []
    task_list = fields
    benchmark_list = []
    for task in results:
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list
    average_score = {}
    cnt = {}
    for task in task_list:
        task_score = []
        for model in model_list:
            score = []
            for benchmark in results[task]:
                if benchmark != 'category':
                    if model not in results[task][benchmark]:
                        score.append(None)
                    elif not isinstance(results[task][benchmark][model], (int, float)):
                        # Benchmarks with autonomous / assisted sub-scores: use their mean.
                        score.append((results[task][benchmark][model]['autonomous']
                                      + results[task][benchmark][model]['assisted']) / 2)
                    else:
                        score.append(results[task][benchmark][model])
            if all(item is None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            # Maintain a running per-model average; skip None so a missing task
            # does not break the update (the averages are currently unused below).
            if score is not None:
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
                    cnt[model] += 1
            task_score.append(score)
        res[task] = task_score

    # res['Avg Score'] = [average_score[model] for model in model_list]
    # res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]
    df = pd.DataFrame(res)
    # valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    # valid = valid.sort_values('Avg Score')
    # valid = valid.iloc[::-1]
    # if len(fields):
    #     missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
    #     missing = missing.iloc[::-1]
    # df = pd.concat([valid, missing])
    return df
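

# Minimal usage sketch (an assumption, not part of the leaderboard app itself):
# it presumes 'results.json' exists locally and follows the nested
# {task: {benchmark: {model: score | {'autonomous': ..., 'assisted': ...}}}}
# layout read above, and that DEFAULT_TASK from meta_data lists the task names.
if __name__ == '__main__':
    results = load_results()
    # Level-1 table: one column per task, averaged over that task's benchmarks.
    l1_df, l1_check_box = BUILD_L1_DF(results, DEFAULT_TASK)
    print(l1_df.head())
    # Level-2 table: one column per benchmark for a single task.
    l2_df, l2_check_box = BUILD_L2_DF(results, DEFAULT_TASK[0])
    print(l2_df.head())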