In [34]:
import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for H4 Models</h1>"""

DESCRIPTION = f"""
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math"]


def get_leaderboard_df(agg : str = "max"):
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4]
        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # Skip benchmarks that we don't want to include in the leaderboard
            if task.lower() in BENCHMARKS_TO_SKIP:
                continue
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
            if task.lower() == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            # IFEval has several metrics but we report just the prompt-loose-acc one
            elif task.lower() == "ifeval":
                value = data["results"][first_result_key]["prompt_level_loose_acc"]
            # MMLU has several metrics but we report just the average one
            elif task.lower() == "mmlu":
                value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
            # HellaSwag and ARC reports acc_norm
            elif task.lower() in ["hellaswag", "arc"]:
                value = data["results"][first_result_key]["acc_norm"]
            # BBH has several metrics but we report just the average one
            elif task.lower() == "bbh":
                if "all" in data["results"]:
                    value = data["results"]["all"]["acc"]
                else:
                    value = -100
            # AGIEval reports acc_norm
            elif task.lower() == "agieval":
                value = data["results"]["all"]["acc_norm"]
            # MATH reports qem
            elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
                value = data["results"]["all"]["qem"]
            else:
                first_metric_key = next(
                    iter(data["results"][first_result_key])
                )  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric

            # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
            if task.lower() in ["mini_math_v2"]:
                for k, v in data["results"].items():
                    if k != "all":
                        level = k.split("|")[1].split(":")[-1]
                        value = v["qem"]
                        df.loc[model_revision, f"{task}_{level}"] = value
            # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
            elif task.lower() in ["aimo_kaggle_medium_pot"]:
                for k, v in data["results"].items():
                    if k != "all" and "_average" not in k:
                        version = k.split("|")[1].split(":")[-1]
                        value = v["qem"] if "qem" in v else v["score"]
                        df.loc[model_revision, f"{task}_{version}"] = value
            # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
            elif task.lower() in ["aimo_kaggle_hard_pot"]:
                for k, v in data["results"].items():
                    if k != "all" and "_average" not in k:
                        version = k.split("|")[1].split(":")[-1]
                        value = v["qem"] if "qem" in v else v["score"]
                        df.loc[model_revision, f"{task}_{version}"] = value
            # For kaggle_tora we report accuracy, so need  to divide by 100
            elif task.lower() in [
                "aimo_tora_eval_kaggle_medium",
                "aimo_tora_eval_kaggle_hard",
                "aimo_kaggle_fast_eval_hard",
                "aimo_kaggle_tora_medium",
                "aimo_kaggle_tora_hard",
                "aimo_kaggle_tora_medium_extended",
                "aimo_kaggle_tora_hard_extended",
            ]:
                for k, v in data["results"].items():
                    value = float(v["qem"]) / 100.0
                    df.loc[model_revision, f"{task}"] = value
            # For AlpacaEval we report base winrate and lenght corrected one
            elif task.lower() == "alpaca_eval":
                value = data["results"][first_result_key]["win_rate"]
                df.loc[model_revision, "Alpaca_eval"] = value / 100.0
                value = data["results"][first_result_key]["length_controlled_winrate"]
                df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
            else:
                df.loc[model_revision, task] = float(value)

    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])

    # Trim minimath column names
    df.columns = [c.replace("_level_", "_l") for c in df.columns]

    # Trim AIMO column names
    df.columns = [c.replace("aimo_", "") for c in df.columns]

    df.insert(loc=0, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    # Strip off date from model name
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

    # Drop date and aggregate results by model name
    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()

    return df

In [41]:
df = get_leaderboard_df(agg='mean')

In [37]:
# df

In [40]:
df.query("Model == 'AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits'").dropna(axis=1, how="all")

Unnamed: 0,Model,Average,kaggle_tora_medium_extended,kaggle_tora_hard_extended
1741,AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits,28.89,61.45,28.89


In [38]:
df.query("Model == 'AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits'").dropna(axis=1, how="all")

Unnamed: 0,Model,Average,kaggle_tora_medium_extended,kaggle_tora_hard_extended
1741,AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits,65.06,65.06,32.22


In [16]:
df[["Model", "Date"]].merge(new_df, on="Model", how="left")

Unnamed: 0,Model,Date,Ifeval,Truthfulqa,Winogrande,Gsm8k,Mmlu,Hellaswag,Arc
0,NousResearch_Nous-Hermes-2-Yi-34B_main,2024-03-04,39.00,61.44,80.58,67.93,76.24,83.79,68.00
1,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-05,55.27,57.78,79.16,76.12,71.18,83.94,64.16
2,NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main,2024-03-02,59.33,64.76,78.53,62.17,71.96,85.42,70.82
3,mistralai_Mixtral-8x7B-Instruct-v0.1_main,2024-03-02,55.08,70.79,73.56,59.89,70.60,86.68,72.01
4,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-04,55.27,57.78,79.16,76.12,71.18,83.94,64.16
...,...,...,...,...,...,...,...,...,...
269,HuggingFaceH4_starcoder2-15b-ift_v18.0,2024-03-10,21.63,,,0.83,,,
270,HuggingFaceH4_mistral-7b-ift_v49.0,2024-03-07,20.15,,,0.00,,,
271,HuggingFaceH4_starchat-beta_main,2024-03-12,8.13,,,,,,
272,HuggingFaceH4_starcoder2-15b-ift_v7.0,2024-03-10,12.57,,,3.18,,,
