In [1]:
import json
from pathlib import Path

import gradio as gr
import pandas as pd

In [51]:
def get_leaderboard_df():
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        date = filepath.stem.split("_")[-1][:-3].split("T")[0]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4].capitalize()
        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
            if task == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            else:
                first_metric_key = next(iter(data["results"][first_result_key]))  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
            df.loc[model_revision, task] = value
 
    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(3)
    # Strip off date from model name
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
    return df

In [52]:
df = get_leaderboard_df()

In [53]:
df

Unnamed: 0,Model,Date,Average,Ifeval,Truthfulqa,Winogrande,Gsm8k,Mmlu,Hellaswag,Arc
0,NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main,2024-03-02,0.617,0.553,0.477,0.785,0.622,0.51,0.677,0.698
1,NousResearch_Nous-Hermes-2-Yi-34B_main,2024-03-04,0.604,,0.439,0.806,,0.48,0.640,0.654
2,mistralai_Mixtral-8x7B-Instruct-v0.1_main,2024-03-02,0.603,0.497,0.554,0.736,0.599,0.43,0.709,0.698
3,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-04,0.603,,0.395,0.792,,,,0.622
4,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-05,0.585,0.505,,,0.761,0.42,0.654,
...,...,...,...,...,...,...,...,...,...,...
269,HuggingFaceH4_starcoder2-15b-ift_v18.0,2024-03-10,0.089,0.170,,,0.008,,,
270,HuggingFaceH4_mistral-7b-ift_v49.0,2024-03-07,0.086,0.172,,,0.000,,,
271,HuggingFaceH4_starchat-beta_main,2024-03-12,0.079,0.079,,,,,,
272,HuggingFaceH4_starcoder2-15b-ift_v7.0,2024-03-10,0.070,0.107,,,0.032,,,


In [32]:
df[df['Model'].str.contains("HuggingFaceH4_mistral-7b-ift_v48.56")]

Unnamed: 0,Model,Average,Ifeval,Truthfulqa,Winogrande,Gsm8k,Mmlu,Hellaswag,Arc
50,HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08,0.49,0.418,0.359,0.672,0.453,0.33,0.656,0.545
532,HuggingFaceH4_mistral-7b-ift_v48.56,,,,,,,,
