Spaces:

FSMBench
/

Leaderboard

Sleeping

App Files Files Community

Leaderboard / app.py

taesiri

update

856df6f about 1 year ago

raw

history blame

2.6 kB

	from glob import glob
	from pathlib import Path

	import gradio as gr
	import pandas as pd


	# Paths of every pickled results file under results/ (the variable keeps its
	# historical "csv" name even though the files are pickles).
	csv_results = glob("results/*.pkl")
	# Map each results path to the object pandas unpickles from it.
	# NOTE(review): unpickling can execute arbitrary code, so results/ must only
	# ever contain trusted files.
	data = dict(zip(csv_results, map(pd.read_pickle, csv_results)))


	def calculate_accuracy(df: pd.DataFrame) -> float:
	    """Return the overall accuracy of one results table as a percentage.

	    Args:
	        df: Table with a ``parsed_judge_response`` column. The column is
	            averaged, so it is presumably a 0/1 (or boolean) correctness
	            flag per question -- confirm against the evaluation pipeline.

	    Returns:
	        The mean of ``parsed_judge_response`` multiplied by 100 (NaN when
	        the column is empty).

	    Raises:
	        KeyError: If ``df`` has no ``parsed_judge_response`` column.
	    """
	    return df["parsed_judge_response"].mean() * 100


	def accuracy_breakdown(df: pd.DataFrame):
	    """Return the accuracy percentage for each difficulty level in ``df``.

	    Rows are grouped by ``difficulty_level`` and ``parsed_judge_response``
	    is averaged within each group. ``groupby`` sorts the group keys, so the
	    values come back in ascending level order.

	    Args:
	        df: Table with ``difficulty_level`` and ``parsed_judge_response``
	            columns.

	    Returns:
	        A NumPy array with one percentage per difficulty level that occurs
	        in ``df``. A level with no rows is simply absent, so the array has
	        fewer entries than the leaderboard's four level columns whenever a
	        results file does not cover every level.
	    """
	    per_level_mean = df.groupby("difficulty_level")["parsed_judge_response"].mean()
	    return (per_level_mean * 100).values


	# Leaderboard column captions, each prefixed with an emoji icon: the model
	# name, the overall score, then one caption per difficulty level (1-4).
	headers_with_icons = ["🤖 Model Name", "⭐ Overall"] + [
	    f"{icon} Level {level}"
	    for level, icon in enumerate(("📈", "🔍", "📘", "🔬"), start=1)
	]


	# Overall accuracy (percent, unrounded) per results file. Computed once here
	# and reused when the leaderboard rows are assembled below.
	accuracy = {file: calculate_accuracy(df) for file, df in data.items()}

	# One row per results file: [model name, overall accuracy, level accuracies...].
	data_for_df = []
	for file, df in data.items():
	    # The model name is the results file name without directory or extension.
	    # Path.stem drops only the final suffix and copes with the platform's path
	    # separator, unlike splitting on "/" and replacing ".pkl" anywhere.
	    model_name = Path(file).stem
	    overall_accuracy = round(accuracy[file], 2)
	    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
	    # NOTE(review): the row only lines up with the six columns below when the
	    # results file covers exactly four difficulty levels -- confirm that every
	    # results file does.
	    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)

	# Plain-text column names used to build the table; they are replaced by the
	# icon-decorated captions immediately afterwards.
	column_names = [
	    "Model Name",
	    "Overall Accuracy",
	    "Level 1 Accuracy",
	    "Level 2 Accuracy",
	    "Level 3 Accuracy",
	    "Level 4 Accuracy",
	]

	# Build the leaderboard table and rank models by overall accuracy, best first.
	accuracy_df = pd.DataFrame(data_for_df, columns=column_names)
	accuracy_df.columns = headers_with_icons
	# headers_with_icons[1] is the overall-accuracy caption.
	accuracy_df.sort_values(by=headers_with_icons[1], ascending=False, inplace=True)


	def load_heatmap(evt: gr.SelectData):
	    """Return the heatmap image for the leaderboard cell that was selected.

	    Registered as the ``select`` handler of the leaderboard table. The value
	    of the selected cell is treated as a model name and mapped to
	    ``results/<value>.jpg``.

	    Args:
	        evt: Gradio selection event; only ``evt.value`` is read.

	    Returns:
	        A ``gr.Image`` showing the matching file, or an empty ``gr.Image``
	        when no such file exists (for example when the selected cell holds a
	        score rather than a model name).
	    """
	    import os

	    heatmap_path = f"results/{evt.value}.jpg"
	    # Only hand Gradio a path that exists, so selecting a cell without a
	    # matching heatmap clears the image instead of failing on a missing file.
	    if not os.path.isfile(heatmap_path):
	        return gr.Image(None)
	    return gr.Image(heatmap_path)


	# Leaderboard UI: one tab for the text-only benchmark (table plus a heatmap
	# that follows the selected cell) and a placeholder tab for the vision one.
	with gr.Blocks() as demo:
	    gr.Markdown("# FSM Benchmark Leaderboard")
	    # TODO: add link to home page and dataset
	    with gr.Tab("Text-only Benchmark"):
	        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)

	        gr.Markdown("## Heatmap")

	        # Variable name spelling ("heatamp") is kept as in the original.
	        heatamp_image = gr.Image(label="", show_label=False)

	        # Selecting a cell in the table loads the corresponding heatmap.
	        leader_board.select(fn=load_heatmap, outputs=[heatamp_image])

	    with gr.Tab("Vision Benchmark"):
	        # Placeholder content: empty table and image until results exist.
	        gr.Markdown("# TBA")
	        leader_board_vision = gr.Dataframe()
	        gr.Markdown("## Heatmap")
	        heatamp_image_vision = gr.Image(label="", show_label=False)

	# Start the Gradio server as soon as the script runs.
	demo.launch()