{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "\n", "import gradio as gr\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_leaderboard_df():\n", " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n", "\n", " # Parse filepaths to get unique models\n", " models = set()\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " models.add(model_revision)\n", "\n", " # Initialize DataFrame\n", " df = pd.DataFrame(index=list(models))\n", "\n", " # Extract data from each file and populate the DataFrame\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " date = filepath.stem.split(\"_\")[-1][:-3].split(\"T\")[0]\n", " model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n", " task = path_parts[4].capitalize()\n", " df.loc[model_revision, \"Date\"] = date\n", "\n", " with open(filepath, \"r\") as file:\n", " data = json.load(file)\n", " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n", " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n", " if task.lower() == \"truthfulqa\":\n", " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n", " # IFEval has several metrics but we report just the prompt-loose-acc one\n", " elif task.lower() == \"ifeval\":\n", " value = data[\"results\"][first_result_key][\"prompt_level_loose_acc\"]\n", " # MMLU has several metrics but we report just the average one\n", " elif task.lower() == \"mmlu\":\n", " value = data[\"results\"][\"lighteval|mmlu:_average|5\"][\"acc\"]\n", " # HellaSwag and ARC reports acc_norm\n", " elif task.lower() in [\"hellaswag\", \"arc\"]:\n", " value = data[\"results\"][first_result_key][\"acc_norm\"]\n", " else:\n", " first_metric_key = next(\n", " iter(data[\"results\"][first_result_key])\n", " ) # gets the first key in the first result\n", " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n", " df.loc[model_revision, task] = value\n", "\n", " # Put IFEval in first column\n", " ifeval_col = df.pop(\"Ifeval\")\n", " df.insert(1, \"Ifeval\", ifeval_col)\n", " # Drop rows where every entry is NaN\n", " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n", " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n", " # Convert all values to percentage\n", " df[df.select_dtypes(include=[\"number\"]).columns] *= 100.0\n", " df = df.sort_values(by=[\"Average\"], ascending=False)\n", " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(2)\n", " # Strip off date from model name\n", " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n", " return df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = get_leaderboard_df()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelDateAverageIfevalTruthfulqaWinograndeGsm8kMmluHellaswagArc
0NousResearch_Nous-Hermes-2-Yi-34B_main2024-03-0474.01NaN61.4480.58NaN76.2483.7968.00
1deepseek-ai_deepseek-llm-67b-chat_main2024-03-0571.6255.27NaNNaN76.1271.1883.94NaN
2NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main2024-03-0270.4359.3364.7678.5362.1771.9685.4270.82
3mistralai_Mixtral-8x7B-Instruct-v0.1_main2024-03-0269.8055.0870.7973.5659.8970.6086.6872.01
4deepseek-ai_deepseek-llm-67b-chat_main2024-03-0467.03NaN57.7879.16NaNNaNNaN64.16
.................................
269HuggingFaceH4_starcoder2-15b-ift_v18.02024-03-1011.2321.63NaNNaN0.83NaNNaNNaN
270HuggingFaceH4_mistral-7b-ift_v49.02024-03-0710.0720.15NaNNaN0.00NaNNaNNaN
271HuggingFaceH4_starchat-beta_main2024-03-128.138.13NaNNaNNaNNaNNaNNaN
272HuggingFaceH4_starcoder2-15b-ift_v7.02024-03-107.8812.57NaNNaN3.18NaNNaNNaN
273HuggingFaceH4_zephyr-7b-beta-ift_v1.12024-03-134.719.43NaNNaN0.00NaNNaNNaN
\n", "

274 rows × 10 columns

\n", "
" ], "text/plain": [ " Model Date Average \\\n", "0 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 74.01 \n", "1 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 71.62 \n", "2 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 70.43 \n", "3 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 69.80 \n", "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 67.03 \n", ".. ... ... ... \n", "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 11.23 \n", "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 10.07 \n", "271 HuggingFaceH4_starchat-beta_main 2024-03-12 8.13 \n", "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 7.88 \n", "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 4.71 \n", "\n", " Ifeval Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n", "0 NaN 61.44 80.58 NaN 76.24 83.79 68.00 \n", "1 55.27 NaN NaN 76.12 71.18 83.94 NaN \n", "2 59.33 64.76 78.53 62.17 71.96 85.42 70.82 \n", "3 55.08 70.79 73.56 59.89 70.60 86.68 72.01 \n", "4 NaN 57.78 79.16 NaN NaN NaN 64.16 \n", ".. ... ... ... ... ... ... ... \n", "269 21.63 NaN NaN 0.83 NaN NaN NaN \n", "270 20.15 NaN NaN 0.00 NaN NaN NaN \n", "271 8.13 NaN NaN NaN NaN NaN NaN \n", "272 12.57 NaN NaN 3.18 NaN NaN NaN \n", "273 9.43 NaN NaN 0.00 NaN NaN NaN \n", "\n", "[274 rows x 10 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelIfevalTruthfulqaWinograndeGsm8kMmluHellaswagArc
0HuggingFaceH4_mistral-7b-ift_v41.044.3649.3572.9337.3060.8279.7058.36
1HuggingFaceH4_mistral-7b-ift_v41.147.3247.8972.6936.3260.3479.5757.51
2HuggingFaceH4_mistral-7b-ift_v41.1032.7251.0572.4525.9359.7581.9259.22
3HuggingFaceH4_mistral-7b-ift_v41.1137.8951.0564.5617.5957.6077.6555.89
4HuggingFaceH4_mistral-7b-ift_v41.1237.8945.9463.3021.1558.5074.9452.73
...........................
258mistralai_Mistral-7B-Instruct-v0.2_main53.9770.6868.8238.1359.4383.4565.70
259mistralai_Mixtral-8x7B-Instruct-v0.1_main55.0870.7973.5659.8970.6086.6872.01
260openchat_openchat-3.5-0106_main54.7157.5572.5366.1963.7280.1061.01
261stabilityai_stablelm-zephyr-3b_main34.7546.1958.4140.1845.1871.5745.82
262teknium_OpenHermes-2.5-Mistral-7B_main52.6858.6272.1454.0663.0182.3462.97
\n", "

263 rows × 8 columns

\n", "
" ], "text/plain": [ " Model Ifeval Truthfulqa \\\n", "0 HuggingFaceH4_mistral-7b-ift_v41.0 44.36 49.35 \n", "1 HuggingFaceH4_mistral-7b-ift_v41.1 47.32 47.89 \n", "2 HuggingFaceH4_mistral-7b-ift_v41.10 32.72 51.05 \n", "3 HuggingFaceH4_mistral-7b-ift_v41.11 37.89 51.05 \n", "4 HuggingFaceH4_mistral-7b-ift_v41.12 37.89 45.94 \n", ".. ... ... ... \n", "258 mistralai_Mistral-7B-Instruct-v0.2_main 53.97 70.68 \n", "259 mistralai_Mixtral-8x7B-Instruct-v0.1_main 55.08 70.79 \n", "260 openchat_openchat-3.5-0106_main 54.71 57.55 \n", "261 stabilityai_stablelm-zephyr-3b_main 34.75 46.19 \n", "262 teknium_OpenHermes-2.5-Mistral-7B_main 52.68 58.62 \n", "\n", " Winogrande Gsm8k Mmlu Hellaswag Arc \n", "0 72.93 37.30 60.82 79.70 58.36 \n", "1 72.69 36.32 60.34 79.57 57.51 \n", "2 72.45 25.93 59.75 81.92 59.22 \n", "3 64.56 17.59 57.60 77.65 55.89 \n", "4 63.30 21.15 58.50 74.94 52.73 \n", ".. ... ... ... ... ... \n", "258 68.82 38.13 59.43 83.45 65.70 \n", "259 73.56 59.89 70.60 86.68 72.01 \n", "260 72.53 66.19 63.72 80.10 61.01 \n", "261 58.41 40.18 45.18 71.57 45.82 \n", "262 72.14 54.06 63.01 82.34 62.97 \n", "\n", "[263 rows x 8 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df = df.drop([\"Date\", \"Average\"], axis=1).groupby(\"Model\").max().reset_index()\n", "new_df" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ModelDateIfevalTruthfulqaWinograndeGsm8kMmluHellaswagArc
0NousResearch_Nous-Hermes-2-Yi-34B_main2024-03-0439.0061.4480.5867.9376.2483.7968.00
1deepseek-ai_deepseek-llm-67b-chat_main2024-03-0555.2757.7879.1676.1271.1883.9464.16
2NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main2024-03-0259.3364.7678.5362.1771.9685.4270.82
3mistralai_Mixtral-8x7B-Instruct-v0.1_main2024-03-0255.0870.7973.5659.8970.6086.6872.01
4deepseek-ai_deepseek-llm-67b-chat_main2024-03-0455.2757.7879.1676.1271.1883.9464.16
..............................
269HuggingFaceH4_starcoder2-15b-ift_v18.02024-03-1021.63NaNNaN0.83NaNNaNNaN
270HuggingFaceH4_mistral-7b-ift_v49.02024-03-0720.15NaNNaN0.00NaNNaNNaN
271HuggingFaceH4_starchat-beta_main2024-03-128.13NaNNaNNaNNaNNaNNaN
272HuggingFaceH4_starcoder2-15b-ift_v7.02024-03-1012.57NaNNaN3.18NaNNaNNaN
273HuggingFaceH4_zephyr-7b-beta-ift_v1.12024-03-139.43NaNNaN0.00NaNNaNNaN
\n", "

274 rows × 9 columns

\n", "
" ], "text/plain": [ " Model Date Ifeval \\\n", "0 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 39.00 \n", "1 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 55.27 \n", "2 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 59.33 \n", "3 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 55.08 \n", "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 55.27 \n", ".. ... ... ... \n", "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 21.63 \n", "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 20.15 \n", "271 HuggingFaceH4_starchat-beta_main 2024-03-12 8.13 \n", "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 12.57 \n", "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 9.43 \n", "\n", " Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n", "0 61.44 80.58 67.93 76.24 83.79 68.00 \n", "1 57.78 79.16 76.12 71.18 83.94 64.16 \n", "2 64.76 78.53 62.17 71.96 85.42 70.82 \n", "3 70.79 73.56 59.89 70.60 86.68 72.01 \n", "4 57.78 79.16 76.12 71.18 83.94 64.16 \n", ".. ... ... ... ... ... ... \n", "269 NaN NaN 0.83 NaN NaN NaN \n", "270 NaN NaN 0.00 NaN NaN NaN \n", "271 NaN NaN NaN NaN NaN NaN \n", "272 NaN NaN 3.18 NaN NaN NaN \n", "273 NaN NaN 0.00 NaN NaN NaN \n", "\n", "[274 rows x 9 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"Model\", \"Date\"]].merge(new_df, on=\"Model\", how=\"left\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "hf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 2 }