import gradio as gr
import pandas as pd
from glob import glob
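# Each results directory is expected to hold one pickled DataFrame per model
# (<model name>.pkl) plus a matching per-model heatmap image (<model name>.jpg)
# used by the leaderboard tabs below.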
# Text-only benchmark result files
text_results = glob("results/*.pkl")
# Vision benchmark result files
vision_results = glob("results-vision/*.pkl")
# CoT text benchmark result files
cot_text_results = glob("results-cot/*.pkl")
# CoT vision benchmark result files
cot_vision_results = glob("results-vision-CoT/*.pkl")
# Function to load data, add model type and name
def load_data(files, model_type):
data = []
for file in files:
df = pd.read_pickle(file)
df["Model Type"] = model_type
df["Model Name"] = file.split("/")[-1].replace(".pkl", "")
data.append(df)
return pd.concat(data, ignore_index=True)
# Load and label all data
data = load_data(text_results, "Text Only")
vision_data = load_data(vision_results, "Vision")
cot_text_data = load_data(cot_text_results, "CoT Text Only")
cot_vision_data = load_data(cot_vision_results, "CoT Vision")
# Combine all data into a single DataFrame
all_data = pd.concat(
[data, vision_data, cot_text_data, cot_vision_data], ignore_index=True
)
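# all_data now holds every judged response from the four benchmark settings,
# tagged with "Model Type" and "Model Name" for the filtering done below.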
all_model_names = all_data['Model Name'].unique()
all_text_only_model_names = list(all_data[all_data['Model Type'] == 'Text Only']['Model Name'].unique())
print("Text-only models:", all_text_only_model_names)
## Continue with the old code --
# TODO: Update this to read from all_data instead of re-reading the pickle files
# Load the text-only result files into a dict keyed by file path
data = {file: pd.read_pickle(file) for file in text_results}
# Load the vision files into a dict
vision_data = {file: pd.read_pickle(file) for file in vision_results}
# Load the CoT text files into a dict
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
# Load the CoT vision files into a dict
cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
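# Each pickled DataFrame is expected to provide at least parsed_judge_response
# (a 0/1 judge verdict), difficulty_level, query_id, fsm_id and, for the
# text-only runs, substring_index; all of these are used below.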
def calculate_accuracy(df):
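    # Mean of the 0/1 judge verdicts, expressed as a percentage.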
return df["parsed_judge_response"].mean() * 100
def accuracy_breakdown(df):
    # Accuracy per difficulty level (4 levels), as percentages
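    # groupby sorts difficulty_level in ascending order, so the four values
    # line up with the Level 1-4 columns defined below.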
return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
# Define the column names with icons
headers_with_icons = [
"πŸ€– Model Name",
"⭐ Overall",
"πŸ“ˆ Level 1",
"πŸ” Level 2",
"πŸ“˜ Level 3",
"πŸ”¬ Level 4",
]
column_names = [
"Model Name",
"Overall Accuracy",
"Level 1 Accuracy",
"Level 2 Accuracy",
"Level 3 Accuracy",
"Level 4 Accuracy",
]
# Function to process data
def process_data(data):
data_for_df = []
for file, df in data.items():
overall_accuracy = round(calculate_accuracy(df), 2)
breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
model_name = file.split("/")[-1].replace(".pkl", "")
data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
return data_for_df
# Process all data
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
cot_vision_data_for_df = process_data(cot_vision_data)
# Create DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
# Function to finalize DataFrame
def finalize_df(df):
    df = df.round(1)  # Round to one decimal place
    # Sort numerically by overall accuracy before the values are formatted as strings
    df.sort_values(by="Overall Accuracy", ascending=False, inplace=True)
    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
    df.columns = headers_with_icons
    return df
# Finalize all DataFrames
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
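# Row-click handlers: evt.value is the value of the clicked cell (the model
# name when the name cell is clicked) and is used to look up the pre-rendered
# heatmap image for that model in the corresponding results directory.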
def load_heatmap(evt: gr.SelectData):
heatmap_image = gr.Image(f"results/{evt.value}.jpg")
return heatmap_image
def load_vision_heatmap(evt: gr.SelectData):
heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
return heatmap_image
def load_cot_heatmap(evt: gr.SelectData):
heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
return heatmap_image
def load_cot_vision_heatmap(evt: gr.SelectData):
heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
return heatmap_image
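# Constrained leaderboard: keep only the FSMs whose first substring
# (substring_index == 1) was answered correctly by every selected text-only
# model, then recompute each text-only model's accuracy on that subset of FSMs.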
def calculate_order_by_first_substring(selected_models):
    # Rows for the first substring of each query, restricted to text-only runs
    first_substring = all_data[all_data['substring_index'] == 1]
    query_ids_df = first_substring[first_substring['Model Type'] == 'Text Only']
# Filter to include only the selected models
query_ids_df = query_ids_df[query_ids_df['Model Name'].isin(selected_models)]
    print("Rows before filtering to universally solved queries:", len(query_ids_df))
    # Keep only the queries answered correctly (judge verdict == 1) by every selected model
    query_ids_df = query_ids_df.groupby('query_id').filter(
        lambda x: x['parsed_judge_response'].eq(1).all()
    )
    print("Rows after filtering:", len(query_ids_df))
query_ids = query_ids_df.query_id.unique()
# print('query_ids', len(query_ids))
    # FSM ids whose first substring was solved correctly by every selected model
    fsm_ids = query_ids_df.fsm_id.unique()
    print("fsm_ids:", len(fsm_ids), "- a total of 25 FSMs are solvable by every model on the first substring")
    # Now restrict all_data to these FSMs (text-only runs) and compute each
    # model's accuracy from parsed_judge_response
text_only = all_data[all_data['Model Type'] == 'Text Only']
text_only_filtered = text_only[text_only['fsm_id'].isin(fsm_ids)]
# print # of query_ids from text_only_filtered
print(f"Number of query_ids from text_only_filtered: {len(text_only_filtered.query_id.unique())}")
text_only_filtered = text_only_filtered.groupby(['Model Name'])['parsed_judge_response'].mean().reset_index()
text_only_filtered['Accuracy'] = text_only_filtered['parsed_judge_response'] * 100
text_only_filtered.drop('parsed_judge_response', axis=1, inplace=True)
    # Sort descending by accuracy (sort_values returns a new DataFrame, so reassign)
    text_only_filtered = text_only_filtered.sort_values('Accuracy', ascending=False)
# round to two decimal places
text_only_filtered['Accuracy'] = text_only_filtered['Accuracy'].apply(lambda x: round(x, 2))
return text_only_filtered
with gr.Blocks() as demo:
gr.Markdown("# FSM Benchmark Leaderboard")
with gr.Tab("Text-only Benchmark"):
gr.Markdown("# Text-only Leaderboard")
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
gr.Markdown("## Heatmap")
heatmap_image = gr.Image(label="", show_label=False)
leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
with gr.Tab("Vision Benchmark"):
gr.Markdown("# Vision Benchmark Leaderboard")
leader_board_vision = gr.Dataframe(
vision_accuracy_df, headers=headers_with_icons
)
gr.Markdown("## Heatmap")
heatmap_image_vision = gr.Image(label="", show_label=False)
leader_board_vision.select(
fn=load_vision_heatmap, outputs=[heatmap_image_vision]
)
with gr.Tab("CoT Text-only Benchmark"):
gr.Markdown("# CoT Text-only Leaderboard")
cot_leader_board_text = gr.Dataframe(
cot_text_accuracy_df, headers=headers_with_icons
)
gr.Markdown("## Heatmap")
cot_heatmap_image_text = gr.Image(label="", show_label=False)
cot_leader_board_text.select(
fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
)
with gr.Tab("CoT Vision Benchmark"):
gr.Markdown("# CoT Vision Benchmark Leaderboard")
cot_leader_board_vision = gr.Dataframe(
cot_vision_accuracy_df, headers=headers_with_icons
)
gr.Markdown("## Heatmap")
cot_heatmap_image_vision = gr.Image(label="", show_label=False)
cot_leader_board_vision.select(
fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
)
with gr.Tab("Constraint Text-only Results"):
gr.Markdown("## Constraint Text-only Leaderboard by first substring")
included_models = gr.CheckboxGroup(
label="Models to include", choices=all_text_only_model_names, value=all_text_only_model_names
)
constrained_leader_board_text = gr.Dataframe()
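        # The table stays empty until the user changes the model selection,
        # since the .input event fires only on user interaction.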
        included_models.input(
            fn=calculate_order_by_first_substring,
            inputs=[included_models],
            outputs=[constrained_leader_board_text],
        )
demo.launch()