Spaces:
Runtime error
Runtime error
File size: 6,047 Bytes
e90fa51 ce50391 e90fa51 ce50391 e90fa51 ce50391 e90fa51 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
from datasets import load_dataset, Dataset
import os
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar
from constants import column_names, all_task_types
from utils_display import make_clickable_model
import random
import json
# Call the helper imported from `datasets.utils.logging`; by its name it
# presumably turns off the `datasets` progress bars for this process.
disable_progress_bar()
# Module-level cache; it is rebound at the bottom of this file to the dict
# returned by load_benchdata_dict(), which maps each record's "idx" to the record.
id_to_data = None
# Initialised to None and not reassigned in the visible part of this file.
# NOTE(review): post_processing() takes a parameter of the same name, which
# shadows this global inside that function.
model_len_info = None
def estimated_win_rate(elo_a, elo_b):
    """
    Estimate how often player B beats player A, as a percentage, from Elo ratings.

    The standard Elo expected score of A against B is
    ``1 / (1 + 10 ** ((elo_b - elo_a) / 400))``; this function returns the
    complement of that value scaled to 0-100, i.e. B's expected win percentage.
    Callers therefore pass the *reference* model as ``elo_a`` and the model
    being scored as ``elo_b``.

    :param elo_a: Elo rating of player A (the reference opponent)
    :param elo_b: Elo rating of player B (the player whose win rate is returned)
    :return: Estimated win percentage (0-100) of player B against player A
    """
    exponent = (elo_b - elo_a) / 400
    expected_score_a = 1 / (1 + 10 ** exponent)
    # The complement of A's expected score is B's expected score.
    return (1 - expected_score_a) * 100
# Formats the columns
def formatter(x):
    """
    Format a single table cell: strings pass through, numbers are rounded.

    ``isinstance`` is used rather than an exact type comparison so that ``str``
    subclasses (for example ``numpy.str_``) are also treated as text instead of
    being handed to ``round`` and raising ``TypeError``.

    :param x: cell value; either text or something ``round`` accepts
    :return: ``x`` unchanged when it is a string, otherwise ``round(x, 2)``
    """
    if isinstance(x, str):
        return x
    return round(x, 2)
def add_winrates(current_df):
    """
    Return a copy of ``current_df`` with win-rate columns against two reference models.

    The reference rating for each column is the "Overall Elo" of the *first*
    row whose "Model" value contains the literal substring "gpt-4"
    (respectively "gpt-3.5"). Matching is done with ``regex=False`` so the
    "." in "gpt-3.5" is a literal dot and not a regex wildcard.

    :param current_df: DataFrame with at least the columns "Model",
        "Overall Elo", "# battles" and "Length"
    :return: new DataFrame with "Win% vs GPT-4" and "Win% vs GPT-3.5T" added
        and with "# battles" and "Length" moved to the last two positions
    :raises IndexError: if no row matches one of the reference substrings
    :raises ValueError: if "# battles" or "Length" is not a column
    """
    df = current_df.copy()
    elo_column = "Overall Elo"

    models = df["Model"]
    gpt4_elo = df[models.str.contains("gpt-4", regex=False)][elo_column].iloc[0]
    gpt35_elo = df[models.str.contains("gpt-3.5", regex=False)][elo_column].iloc[0]

    # estimated_win_rate(ref, x) yields x's win percentage against ref, so each
    # row gets that row's own chance of beating the reference model.
    df["Win% vs GPT-4"] = (
        df[elo_column].apply(lambda x: estimated_win_rate(gpt4_elo, x)).apply(formatter)
    )
    df["Win% vs GPT-3.5T"] = (
        df[elo_column].apply(lambda x: estimated_win_rate(gpt35_elo, x)).apply(formatter)
    )

    # Keep the bookkeeping columns at the far right, after the new win-rate columns.
    cols = list(df.columns)
    for trailing in ("# battles", "Length"):
        cols.remove(trailing)
        cols.append(trailing)
    return df[cols]
def add_winrates_tasks(current_df, ref="gpt-4"):
    """
    Replace every per-task Elo column with a win percentage against a reference model.

    For each task type in ``all_task_types`` the matching column (looked up in
    ``column_names``) is rewritten as the win percentage of each row against
    the first row whose "Model" value matches ``ref``.

    :param current_df: DataFrame with a "Model" column and one column per task type
    :param ref: pattern passed to ``Series.str.contains`` to find the reference model
    :return: a copy of ``current_df`` with the task columns converted; the input
        frame is left unmodified
    """
    result = current_df.copy()
    for task in all_task_types:
        task_col = column_names[task]
        is_ref = current_df["Model"].str.contains(ref)
        ref_elo = current_df.loc[is_ref, task_col].iloc[0]
        win_rates = current_df[task_col].apply(
            lambda elo: estimated_win_rate(ref_elo, elo)
        )
        result[task_col] = win_rates.apply(formatter)
    return result
def post_processing(df, model_len_info):
    """
    Turn a raw results frame into the display-ready leaderboard frame.

    Steps, in order: optionally add a "Length" column, make the "model name "
    cells clickable, run ``formatter`` over every other column, rename columns
    via ``column_names``, sort by "Overall Elo" descending, and move "Model",
    "Overall Elo" and "Task-Avg Elo" to the front.

    NOTE: ``df`` is modified in place (new column, rename, sort). Only the
    final column reordering produces a new object, so callers must use the
    return value to get the reordered frame.

    :param df: DataFrame to process; mutated in place
    :param model_len_info: mapping from model name to length, or a falsy value
        (``None`` / empty) to skip adding the "Length" column
    :return: DataFrame with the three leading columns first
    :raises KeyError: if ``model_len_info`` is given but lacks a model name, or
        if an expected column is missing
    """
    raw_name_col = "model name "  # trailing space is part of the raw column name

    if model_len_info:
        df["Length"] = df[raw_name_col].apply(lambda x: model_len_info[x])

    for col in df.columns:
        if col == raw_name_col:
            # Swap the plain model name for its clickable representation.
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # For numerical values

    # presumably column_names maps "model name " to "Model" — verify in constants
    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="Overall Elo", inplace=True, ascending=False)

    # put the "Overall Elo" and "Task-Avg Elo" column to the front
    leading = ["Model", "Overall Elo", "Task-Avg Elo"]
    remaining = [col for col in df.columns if col not in leading]
    return df[leading + remaining]
def apply_length_penalty(original_df, ablation_df, length_penalty=0.2):
    """
    Subtract a scaled ablation score from every score cell of ``original_df``.

    Each cell becomes ``z = x - y * length_penalty`` where ``x`` is the value
    in ``original_df`` and ``y`` is the value in ``ablation_df`` for the same
    model and column. Rows are paired by the "Model" value, not by position,
    so the two frames may be ordered differently. The columns "Model",
    "# battles" and "Length" are left untouched. The result is then passed
    through ``post_processing``.

    :param original_df: DataFrame with the scores to penalise (not mutated)
    :param ablation_df: DataFrame with the same models and score columns
        (only read)
    :param length_penalty: multiplier applied to the ablation value
    :return: the penalised, post-processed DataFrame
    :raises IndexError: if a model in ``original_df`` is absent from ``ablation_df``
    """
    original_df = original_df.copy()

    untouched = ("Model", "# battles", "Length")
    score_cols = [col for col in original_df.columns if col not in untouched]

    # Cast once up front so the per-cell writes below never hit an integer column.
    for col in score_cols:
        original_df[col] = original_df[col].astype(float)

    for i, model in original_df["Model"].items():
        # Look the model up once per row instead of once per cell.
        matches = ablation_df[ablation_df["Model"] == model]
        if matches.empty:
            raise IndexError(f"model {model!r} not found in ablation_df")
        for col in score_cols:
            penalty = matches[col].iloc[0] * length_penalty
            original_df.at[i, col] = original_df.at[i, col] - penalty

    # post_processing
    return post_processing(original_df, model_len_info=None)
def load_benchdata_dict(path="data_dir/predictions_logs.jsonl"):
    """
    Load a JSON Lines prediction log and index its records by their "idx" field.

    Blank lines are skipped. If several records share an "idx", the last one
    in the file wins.

    :param path: location of the JSONL file; defaults to the bundled log
    :return: dict mapping each record's "idx" to the record dict
    :raises KeyError: if a record has no "idx" key
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        bench_data = [json.loads(line) for line in f if line.strip()]
    return {item["idx"]: item for item in bench_data}
def load_eval_results(path="data_dir/predictions_logs.jsonl"):
    """
    Load a JSON Lines prediction log as a list of records, in file order.

    Blank lines are skipped.

    :param path: location of the JSONL file; defaults to the bundled log
    :return: list with one parsed object per non-blank line
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def sample_an_eval_result(eval_results, model_list=()):
    """
    Pick one random evaluation record produced by a model in ``model_list``.

    The records are shuffled in a private copy and the first one whose
    "model" is in ``model_list`` is converted to the display dictionary.

    :param eval_results: iterable of record dicts; a selected record must have
        the keys "model", "task_type", "plan_prompts", "ground_prompts",
        "question", "idx", "pred", "answer" and "correctness", and may have
        "image"
    :param model_list: collection of eligible model names; the empty default
        matches nothing
    :return: dict with the keys "session_id", "task", "task_type",
        "plan_history", "ground_history", "pred", "answer", "correctness" and
        "image", or ``None`` when no record belongs to an eligible model
    """
    # Shuffle a copy so the caller's sequence keeps its order.
    candidates = list(eval_results)
    random.shuffle(candidates)

    for eval_item in candidates:
        if eval_item["model"] not in model_list:
            continue

        image = eval_item.get("image")
        if image is not None:
            # Rewrite the evaluation-time image location to the path served by the app.
            image = image.replace(
                "eval/aokvqa/images/val2017/", "file/data_dir/test_images/"
            )

        return {
            "session_id": eval_item["idx"],
            "task": eval_item["question"],
            "task_type": eval_item["task_type"],  # primary task type
            "plan_history": eval_item["plan_prompts"],
            "ground_history": eval_item["ground_prompts"],
            "pred": eval_item["pred"],
            "answer": eval_item["answer"],
            "correctness": eval_item["correctness"],
            "image": image,
        }

    # No record matched: report that explicitly instead of failing on an
    # unbound local variable.
    return None
# Fill the module-level cache once, at import time, from the default prediction log.
id_to_data = load_benchdata_dict()