def get_leaderboard_df(results_dir="eval_results"):
    """Collect evaluation results from JSON files into a leaderboard DataFrame.

    Every ``*.json`` file found recursively under ``results_dir`` is read. The
    first three path components below ``results_dir`` are joined with ``_`` to
    form the model identifier, and the fourth component (capitalized) names
    the task column that receives the file's metric value.

    Args:
        results_dir: Root directory to scan. Defaults to ``"eval_results"``.

    Returns:
        A ``pandas.DataFrame`` with one row per model, sorted by ``Average``
        in descending order, with numeric values rounded to 3 decimals. The
        columns are ``Model``, ``Average`` (row-wise mean of the numeric
        columns), ``Timestamp`` and one column per task.

    Raises:
        IndexError: If a JSON file has fewer than four path components below
            ``results_dir``.
        KeyError: If a file has no ``"results"`` entry, or a TruthfulQA file
            has no ``"truthfulqa_mc2"`` metric.
        StopIteration: If ``"results"`` (or its first entry) is empty.
    """
    root = Path(results_dir)
    # rglob order is filesystem-dependent; sorting makes "last file wins"
    # (timestamp and duplicated task results) and the column order reproducible.
    filepaths = sorted(root.rglob("*.json"))

    rows = {}  # model identifier -> {column name: value}
    columns = []  # column names in order of first appearance

    for filepath in filepaths:
        rel_parts = filepath.relative_to(root).parts
        model_revision = "_".join(rel_parts[:3])
        task = rel_parts[3].capitalize()
        # Timestamp is the last "_"-separated chunk of the file stem with its
        # final three characters dropped (presumably trimming microseconds to
        # milliseconds -- TODO confirm against the file naming scheme).
        timestamp = filepath.stem.split("_")[-1][:-3]

        with open(filepath, "r", encoding="utf-8") as file:
            data = json.load(file)

        results = data["results"]
        first_result = results[next(iter(results))]
        # TruthfulQA has two metrics, so pick the `mc2` one that's reported on
        # the leaderboard. `task` has been capitalized above, so the comparison
        # must be case-insensitive or this branch is never taken.
        if task.lower() == "truthfulqa":
            value = first_result["truthfulqa_mc2"]
        else:
            # Every other task reports the first metric of the first result.
            value = first_result[next(iter(first_result))]

        row = rows.setdefault(model_revision, {})
        row["Timestamp"] = timestamp
        row[task] = value
        for column in ("Timestamp", task):
            if column not in columns:
                columns.append(column)

    # Build the frame once instead of growing it cell by cell; tasks a model
    # has no file for are left as NaN.
    df = pd.DataFrame(list(rows.values()), index=list(rows), columns=columns)

    # numeric_only=True keeps the string Timestamp column out of the mean.
    df.insert(loc=0, column="Average", value=df.mean(axis=1, numeric_only=True))
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(3)
    return df
# Build the leaderboard table; with no arguments the function scans the
# "eval_results" directory relative to the current working directory.
df = get_leaderboard_df()
\n", " | Model | \n", "Timestamp | \n", "Average | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "Qwen_Qwen1.5-0.5B-Chat_main | \n", "2024-02-28T07-35-58.803 | \n", "0.296 | \n", "0.271 | \n", "0.519 | \n", "0.039 | \n", "0.363 | \n", "0.287 | \n", "