In [1]:
import json
from pathlib import Path

import gradio as gr
import pandas as pd

In [31]:
def get_leaderboard_df():
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Parse filepaths to get unique models
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    # Initialize DataFrame
    df = pd.DataFrame(index=list(models))

    # Extract data from each file and populate the DataFrame
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        task = path_parts[4].capitalize()
        # Extract timestamp from filepath
        timestamp = filepath.stem.split("_")[-1][:-3]
        df.loc[model_revision, "Timestamp"] = timestamp

        with open(filepath, "r") as file:
            data = json.load(file)
            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
            if task == "truthfulqa":
                value = data["results"][first_result_key]["truthfulqa_mc2"]
            else:
                first_metric_key = next(iter(data["results"][first_result_key]))  # gets the first key in the first result
                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
            df.loc[model_revision, task] = value
 
    df.insert(loc=0, column="Average", value=df.mean(axis=1, numeric_only=True))
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(3)
    return df

In [32]:
df = get_leaderboard_df()

In [None]:
df

Unnamed: 0,Model,Timestamp,Average,Truthfulqa,Winogrande,Gsm8k,Hellaswag,Arc
0,Qwen_Qwen1.5-0.5B-Chat_main,2024-02-28T07-35-58.803,0.296,0.271,0.519,0.039,0.363,0.287
