{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "\n", "import gradio as gr\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "def get_leaderboard_df():\n", " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n", "\n", " # Parse filepaths to get unique models\n", " models = set()\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " models.add(model_revision)\n", "\n", " # Initialize DataFrame\n", " df = pd.DataFrame(index=list(models))\n", "\n", " # Extract data from each file and populate the DataFrame\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " date = filepath.stem.split(\"_\")[-1][:-3].split(\"T\")[0]\n", " model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n", " task = path_parts[4].capitalize()\n", " df.loc[model_revision, \"Date\"] = date\n", "\n", " with open(filepath, \"r\") as file:\n", " data = json.load(file)\n", " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n", " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n", " if task == \"truthfulqa\":\n", " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n", " else:\n", " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n", " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n", " df.loc[model_revision, task] = value\n", " \n", " # Drop rows where every entry is NaN\n", " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n", " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n", " df = df.sort_values(by=[\"Average\"], ascending=False)\n", " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n", " # Strip off date from model name\n", " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n", " return df" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "df = get_leaderboard_df()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelDateAverageIfevalTruthfulqaWinograndeGsm8kMmluHellaswagArc
0NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main2024-03-020.6170.5530.4770.7850.6220.510.6770.698
1NousResearch_Nous-Hermes-2-Yi-34B_main2024-03-040.604NaN0.4390.806NaN0.480.6400.654
2mistralai_Mixtral-8x7B-Instruct-v0.1_main2024-03-020.6030.4970.5540.7360.5990.430.7090.698
3deepseek-ai_deepseek-llm-67b-chat_main2024-03-040.603NaN0.3950.792NaNNaNNaN0.622
4deepseek-ai_deepseek-llm-67b-chat_main2024-03-050.5850.505NaNNaN0.7610.420.654NaN
.................................
269HuggingFaceH4_starcoder2-15b-ift_v18.02024-03-100.0890.170NaNNaN0.008NaNNaNNaN
270HuggingFaceH4_mistral-7b-ift_v49.02024-03-070.0860.172NaNNaN0.000NaNNaNNaN
271HuggingFaceH4_starchat-beta_main2024-03-120.0790.079NaNNaNNaNNaNNaNNaN
272HuggingFaceH4_starcoder2-15b-ift_v7.02024-03-100.0700.107NaNNaN0.032NaNNaNNaN
273HuggingFaceH4_zephyr-7b-beta-ift_v1.12024-03-130.0430.087NaNNaN0.000NaNNaNNaN
\n", "

274 rows × 10 columns

\n", "
" ], "text/plain": [ " Model Date Average \\\n", "0 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 0.617 \n", "1 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 0.604 \n", "2 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 0.603 \n", "3 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 0.603 \n", "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 0.585 \n", ".. ... ... ... \n", "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 0.089 \n", "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 0.086 \n", "271 HuggingFaceH4_starchat-beta_main 2024-03-12 0.079 \n", "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 0.070 \n", "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 0.043 \n", "\n", " Ifeval Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n", "0 0.553 0.477 0.785 0.622 0.51 0.677 0.698 \n", "1 NaN 0.439 0.806 NaN 0.48 0.640 0.654 \n", "2 0.497 0.554 0.736 0.599 0.43 0.709 0.698 \n", "3 NaN 0.395 0.792 NaN NaN NaN 0.622 \n", "4 0.505 NaN NaN 0.761 0.42 0.654 NaN \n", ".. ... ... ... ... ... ... ... \n", "269 0.170 NaN NaN 0.008 NaN NaN NaN \n", "270 0.172 NaN NaN 0.000 NaN NaN NaN \n", "271 0.079 NaN NaN NaN NaN NaN NaN \n", "272 0.107 NaN NaN 0.032 NaN NaN NaN \n", "273 0.087 NaN NaN 0.000 NaN NaN NaN \n", "\n", "[274 rows x 10 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelAverageIfevalTruthfulqaWinograndeGsm8kMmluHellaswagArc
50HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-080.490.4180.3590.6720.4530.330.6560.545
532HuggingFaceH4_mistral-7b-ift_v48.56NaNNaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " Model Average Ifeval \\\n", "50 HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08 0.49 0.418 \n", "532 HuggingFaceH4_mistral-7b-ift_v48.56 NaN NaN \n", "\n", " Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n", "50 0.359 0.672 0.453 0.33 0.656 0.545 \n", "532 NaN NaN NaN NaN NaN NaN " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['Model'].str.contains(\"HuggingFaceH4_mistral-7b-ift_v48.56\")]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "hf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 2 }