import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = """
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""
def get_leaderboard_df():
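    """Build the leaderboard DataFrame from the JSON files under `eval_results/`.

    Assumes (per the path parsing below) a layout like
    eval_results/<org>/<model>/<revision>/<task>/*_<timestamp>.json, where the three
    path components after `eval_results` jointly identify a model revision.
    """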
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        task = path_parts[4].capitalize()

        # Extract timestamp from filepath
        timestamp = filepath.stem.split("_")[-1][:-3]
        df.loc[model_revision, "Timestamp"] = timestamp

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics but we report just the prompt-loose-acc one
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            # MMLU has several metrics but we report just the average one
            elif task.lower() == "mmlu":
                value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
            # HellaSwag and ARC report acc_norm
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            else:
                first_metric_key = next(
                    iter(data["results"][first_result_key])
                )  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
            df.loc[model_revision, task] = value

    # Put IFEval first among the benchmark columns
    ifeval_col = df.pop("Ifeval")
    df.insert(1, "Ifeval", ifeval_col)
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all values to percentages
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)

    return df
def refresh():
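    """Re-read the eval results from disk; wired to the Refresh button below."""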
    return get_leaderboard_df()
# Function to update the table based on search query
def update_table(search_query):
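    """Filter the leaderboard by model name.

    Multiple search terms can be separated with `;`; they are combined into a
    single regex and matched against the `Model` column.
    """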
    df = get_leaderboard_df()
    if search_query:
        search_terms = search_query.split(";")
        search_terms = [term.strip() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.contains(pattern, regex=True)]
    return df
leaderboard_df = get_leaderboard_df()
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
demo.launch()