import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = """
Evaluation of H4 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval).
"""


def get_leaderboard_df():
    """Build the leaderboard DataFrame from the JSON results under `eval_results/`."""
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
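    # (Filepaths are assumed to follow the layout
    #  eval_results/<org>/<model>/<revision>/<task>/<results file>.json,
    #  so parts[1:4] identify a unique model revision and parts[4] the task.)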
    models = set()
    for filepath in filepaths:
        path_parts = filepath.parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = filepath.parts
        model_revision = "_".join(path_parts[1:4])
        task = path_parts[4].capitalize()
        # Extract the timestamp from the filename: the last underscore-separated
        # chunk of the stem, with the final three characters trimmed
        timestamp = filepath.stem.split("_")[-1][:-3]
        df.loc[model_revision, "Timestamp"] = timestamp

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard.
            # Note that `task` was capitalized above, so we compare against the capitalized task names.
            if task == "Truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics, but we report just the prompt-level loose accuracy
            elif task == "Ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            else:
                first_metric_key = next(
                    iter(data["results"][first_result_key])
                )  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
            df.loc[model_revision, task] = value

    # Move IFEval to the front of the task columns, then insert the overall average
    ifeval_col = df.pop("Ifeval")
    df.insert(1, "Ifeval", ifeval_col)
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(3)
    return df
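
# For reference, a minimal sketch of the payload each results file is assumed
# to contain (the task key and score here are illustrative; only the metric
# names used in the parsing above come from the code):
#
# {
#     "results": {
#         "<task key>": {
#             "prompt_level_loose_acc": 0.42
#         }
#     }
# }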


def refresh():
    return get_leaderboard_df()


# Function to update the table based on search query
def update_table(search_query):
    df = get_leaderboard_df()
    if search_query:
        search_terms = search_query.split(";")
        search_terms = [term.strip() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.contains(pattern, regex=True)]
    return df
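
# Example: submitting "zephyr; mistral" (hypothetical model names) keeps only
# rows whose Model name matches either term, since the terms are joined into
# the regex alternation "zephyr|mistral".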


leaderboard_df = get_leaderboard_df()

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model (separate multiple queries with ;)...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()
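
# To run the app locally (assuming this file is saved as app.py and an
# eval_results/ directory with LightEval JSON outputs sits next to it):
#
#   python app.py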