Spaces:
Restarting
Restarting
import json | |
from pathlib import Path | |
import gradio as gr | |
import pandas as pd | |
# HTML heading rendered at the top of the page.
TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

# Markdown blurb shown under the heading. A plain string: it has no
# placeholders, so an f-string prefix would add nothing.
DESCRIPTION = """
Evaluation of H4 models across a diverse range of benchmarks from Eleuther's [LLM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
"""
def get_leaderboard_df(results_dir="eval_results"):
    """Build the leaderboard table from the JSON result files under *results_dir*.

    Every ``*.json`` file found recursively contributes one cell:

    * row: the first three path components below *results_dir*, joined
      with ``_``;
    * column: the last ``_``-separated token of the file stem, capitalised
      (``..._mmlu.json`` -> ``Mmlu``);
    * value: the first metric of the first entry in the file's ``"results"``
      mapping.

    Args:
        results_dir: Root directory holding the result files. Defaults to
            ``"eval_results"``.

    Returns:
        A DataFrame with a ``Model`` column, an ``Average`` column (row-wise
        mean of the task columns) and one column per task, sorted by
        ``Average`` in descending order and rounded to 3 decimals.
    """
    root = Path(results_dir)

    # model name -> {task name -> score}
    scores = {}
    # Sorted so that column order does not depend on filesystem listing order.
    for filepath in sorted(root.rglob("*.json")):
        # Name the model relative to the root so the result does not depend
        # on how many components the root path itself has.
        model_revision = "_".join(filepath.relative_to(root).parts[:3])
        task = filepath.stem.split("_")[-1].capitalize()

        with open(filepath, encoding="utf-8") as file:
            results = json.load(file)["results"]

        # Only the first metric of the first result entry is reported.
        first_result = next(iter(results.values()))
        value = next(iter(first_result.values()))

        scores.setdefault(model_revision, {})[task] = value

    # Rows are models, columns are tasks; missing combinations become NaN.
    df = pd.DataFrame.from_dict(scores, orient="index")
    df.insert(loc=0, column="Average", value=df.mean(axis=1))
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(3)
    return df
def refresh():
    """Recompute the leaderboard table and return the fresh DataFrame."""
    refreshed_df = get_leaderboard_df()
    return refreshed_df
# Table contents shown when the page first loads.
leaderboard_df = get_leaderboard_df()

# NOTE(review): the original indentation was lost, so the nesting of the
# layout containers below is a reconstruction — confirm against the live app.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True)
        with gr.Row():
            refresh_button = gr.Button("Refresh")
            # Clicking the button replaces the table with a freshly built one.
            refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()