{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "\n", "import gradio as gr\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def get_leaderboard_df():\n", " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n", "\n", " # Parse filepaths to get unique models\n", " models = set()\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " models.add(model_revision)\n", "\n", " # Initialize DataFrame\n", " df = pd.DataFrame(index=list(models))\n", "\n", " # Extract data from each file and populate the DataFrame\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " task = path_parts[4].capitalize()\n", " # Extract timestamp from filepath\n", " timestamp = filepath.stem.split(\"_\")[-1][:-3]\n", " df.loc[model_revision, \"Timestamp\"] = timestamp\n", "\n", " with open(filepath, \"r\") as file:\n", " data = json.load(file)\n", " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n", " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n", " if task == \"truthfulqa\":\n", " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n", " else:\n", " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n", " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n", " df.loc[model_revision, task] = value\n", " \n", " df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n", " df = df.sort_values(by=[\"Average\"], ascending=False)\n", " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n", " return df" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df = get_leaderboard_df()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelTimestampAverageTruthfulqaWinograndeGsm8kHellaswagArc
0Qwen_Qwen1.5-0.5B-Chat_main2024-02-28T07-35-58.8030.2960.2710.5190.0390.3630.287
\n", "
" ], "text/plain": [ " Model Timestamp Average Truthfulqa \\\n", "0 Qwen_Qwen1.5-0.5B-Chat_main 2024-02-28T07-35-58.803 0.296 0.271 \n", "\n", " Winogrande Gsm8k Hellaswag Arc \n", "0 0.519 0.039 0.363 0.287 " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "hf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 2 }