open-r1-eval-leaderboard

Running

File size: 2,251 Bytes

aff26b6

import json
from pathlib import Path

import gradio as gr
import pandas as pd

# Page heading, passed to the UI as raw HTML.
TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

# Markdown blurb shown below the heading. This is a plain string (not an
# f-string) because the text contains nothing to interpolate.
DESCRIPTION = """
Evaluation of H4 models across a diverse range of benchmarks from Eleuther's [LLM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
"""


def get_leaderboard_df(results_dir="eval_results") -> pd.DataFrame:
    """Build the leaderboard table from the JSON result files in *results_dir*.

    Every ``*.json`` file found recursively below *results_dir* contributes
    one score:

    * the row label is the first three path components below *results_dir*
      joined with ``_``;
    * the column label is the capitalized last ``_``-separated token of the
      file stem (``..._mmlu.json`` -> ``Mmlu``);
    * the cell value is the first metric of the first entry in the file's
      ``"results"`` mapping.

    Args:
        results_dir: Directory (``str`` or ``Path``) to scan. Defaults to
            ``"eval_results"``.

    Returns:
        A DataFrame with a ``Model`` column, an ``Average`` column holding the
        row-wise mean of the task columns, and one column per task. Rows are
        sorted by ``Average`` in descending order and numeric values are
        rounded to 3 decimals. Tasks a model has no file for are left as NaN.

    Errors from unreadable or malformed files (invalid JSON, missing
    ``"results"`` key, empty ``"results"``) are not caught here.
    """
    root = Path(results_dir)

    # Parse every path exactly once. Sorting makes row/column creation order
    # independent of filesystem enumeration order.
    entries = []
    for filepath in sorted(root.rglob("*.json")):
        # Labels are taken relative to the root so that the root itself may
        # consist of any number of path components.
        model_revision = "_".join(filepath.relative_to(root).parts[:3])
        task = filepath.stem.split("_")[-1].capitalize()
        entries.append((model_revision, task, filepath))

    # One row per unique model; task columns are added as they are encountered.
    df = pd.DataFrame(index=sorted({model for model, _, _ in entries}))

    for model_revision, task, filepath in entries:
        with open(filepath, encoding="utf-8") as file:
            data = json.load(file)
        results = data["results"]
        first_result = results[next(iter(results))]  # first entry in 'results'
        value = first_result[next(iter(first_result))]  # first metric of that entry
        df.loc[model_revision, task] = value

    df.insert(loc=0, column="Average", value=df.mean(axis=1))
    df = df.sort_values(by=["Average"], ascending=False)
    # The unnamed index becomes a column called "index"; expose it as "Model".
    return df.reset_index().rename(columns={"index": "Model"}).round(3)


def refresh():
    """Return a freshly built leaderboard table (used as the Refresh click callback)."""
    refreshed_df = get_leaderboard_df()
    return refreshed_df


# Build the table once up front so the Dataframe component has an initial value.
leaderboard_df = get_leaderboard_df()

# Page layout: heading, description, the results table, then a refresh control.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True)
        with gr.Row():
            refresh_button = gr.Button("Refresh")
            # A click calls refresh() and writes its return value into the table.
            refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()