import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = """
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math"]


def get_leaderboard_df(agg: str = "max"):
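    """Build the leaderboard DataFrame from the JSON result files under `eval_results`.

    The directory layout is assumed to follow
    `eval_results/<org>/<model>/<revision>/<task>/..._<timestamp>.json`: the second
    through fourth path components identify the model revision and the fifth the task.
    Results for the same model are aggregated with `agg` ("min", "max" or "mean").
    """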
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
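        # The run date is parsed from the end of the filename stem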
        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4]
        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # Skip benchmarks that we don't want to include in the leaderboard
            if task.lower() in BENCHMARKS_TO_SKIP:
                continue
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics but we report just the prompt-level loose accuracy
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            # MMLU has several metrics but we report just the average one
            elif task.lower() == "mmlu":
                value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
            # HellaSwag and ARC report acc_norm
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            # BBH has several metrics but we report just the average one
            elif task.lower() == "bbh":
                if "all" in data["results"]:
                    value = data["results"]["all"]["acc"]
                else:
                    value = -100
            # AGIEval reports acc_norm
            elif task.lower() == "agieval":
                value = data["results"]["all"]["acc_norm"]
            # MATH reports qem
            elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
                value = data["results"]["all"]["qem"]
            else:
                first_metric_key = next(
                    iter(data["results"][first_result_key])
                )  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric

            # For mini_math we report 5 metrics, one per level, and store each one as a separate column in the dataframe
            if task.lower() in ["mini_math_v2"]:
                for k, v in data["results"].items():
                    if k != "all":
                        level = k.split("|")[1].split(":")[-1]
                        value = v["qem"]
                        df.loc[model_revision, f"{task}_{level}"] = value
            # For kaggle_pot we report N metrics, one per prompt, and store each one as a separate column in the dataframe
            elif task.lower() in ["aimo_kaggle_medium_pot"]:
                for k, v in data["results"].items():
                    if k != "all" and "_average" not in k:
                        version = k.split("|")[1].split(":")[-1]
                        value = v["qem"] if "qem" in v else v["score"]
                        df.loc[model_revision, f"{task}_{version}"] = value
            # Same treatment for the hard kaggle_pot split: one column per prompt
            elif task.lower() in ["aimo_kaggle_hard_pot"]:
                for k, v in data["results"].items():
                    if k != "all" and "_average" not in k:
                        version = k.split("|")[1].split(":")[-1]
                        value = v["qem"] if "qem" in v else v["score"]
                        df.loc[model_revision, f"{task}_{version}"] = value
            # For kaggle_tora we report accuracy, so we need to divide by 100
            elif task.lower() in [
                "aimo_tora_eval_kaggle_medium",
                "aimo_tora_eval_kaggle_hard",
                "aimo_kaggle_fast_eval_hard",
                "aimo_kaggle_tora_medium",
                "aimo_kaggle_tora_hard",
                "aimo_kaggle_tora_medium_extended",
                "aimo_kaggle_tora_hard_extended",
            ]:
                for k, v in data["results"].items():
                    value = float(v["qem"]) / 100.0
                    df.loc[model_revision, f"{task}"] = value
            # For AlpacaEval we report the base win rate and the length-controlled one
            elif task.lower() == "alpaca_eval":
                value = data["results"][first_result_key]["win_rate"]
                df.loc[model_revision, "Alpaca_eval"] = value / 100.0
                value = data["results"][first_result_key]["length_controlled_winrate"]
                df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
            else:
                df.loc[model_revision, task] = float(value)

    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])

    # Trim minimath column names
    df.columns = [c.replace("_level_", "_l") for c in df.columns]

    # Trim AIMO column names
    df.columns = [c.replace("aimo_", "") for c in df.columns]

    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    # Strip off date from model name
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

    # Drop date and aggregate results by model name
    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()

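    # Insert an overall average across the numeric benchmark columns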
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)

    return df


leaderboard_df = get_leaderboard_df()


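# Rebuild the leaderboard from the result files on disk, using the selected aggregation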
def refresh(agg: str = "max"):
    return get_leaderboard_df(agg=agg)


# Function to update the table based on the selected columns and search query
def filter_and_search(cols: list[str], search_query: str):
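    """Filter the cached leaderboard by model name and restrict it to the selected columns.

    The search box accepts several terms separated by ";"; models matching any term
    (case-insensitively) are kept. When specific columns are selected, only those
    columns are shown and the Average is recomputed over them.
    """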
    df = leaderboard_df
    if len(search_query) > 0:
        search_terms = search_query.split(";")
        search_terms = [term.strip().lower() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
        # Drop any columns which are all NaN
        df = df.dropna(how="all", axis=1)
    if len(cols) > 0:
        index_cols = list(leaderboard_df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        # Drop rows with NaN values
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        # Recompute average
        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    return df


demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            # leaderboard_df = get_leaderboard_df()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                column_widths=[400, 110] + [(220 + len(c)) for c in leaderboard_df.columns[1:]],
            )
        with gr.Row():
            refresh_button = gr.Button("Refresh")

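    # Wire UI events: column selection and search filter the cached table; aggregation and refresh rebuild it from disk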
    cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
    agg.change(refresh, inputs=[agg], outputs=[leaderboard_table])
    search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])

demo.launch()