import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = """
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""
def get_leaderboard_df():
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get the set of unique model revisions
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame with one row per model revision
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        task = path_parts[4].capitalize()
        # Extract timestamp from the filename
        timestamp = filepath.stem.split("_")[-1][:-3]
        df.loc[model_revision, "Timestamp"] = timestamp

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # TruthfulQA has two metrics, so we pick the `mc2` one that's reported on the leaderboard
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics, but we report just the prompt-level loose accuracy
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            # MMLU has several metrics, but we report just the average one
            elif task.lower() == "mmlu":
                value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
            # HellaSwag and ARC report acc_norm
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            # Otherwise fall back to the first metric of the first result
            else:
                first_metric_key = next(iter(data["results"][first_result_key]))
                value = data["results"][first_result_key][first_metric_key]
            df.loc[model_revision, task] = value

    # Move IFEval to the front of the benchmark columns (it ends up right after
    # the "Average" column inserted below)
    ifeval_col = df.pop("Ifeval")
    df.insert(1, "Ifeval", ifeval_col)
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    # Convert all scores to percentages
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    return df


def refresh():
    return get_leaderboard_df()


# Update the table based on the search query; multiple terms can be separated with ";"
def update_table(search_query):
    df = get_leaderboard_df()
    if search_query:
        search_terms = [term.strip() for term in search_query.split(";")]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.contains(pattern, regex=True)]
    return df


leaderboard_df = get_leaderboard_df()

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
        with gr.Group():
            leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()
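
# Run this script directly (e.g. `python app.py`, filename assumed) to serve the
# leaderboard locally; the table can be filtered via the search box and rebuilt
# with the Refresh button.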