def _extract_metric(task, results):
    """Pick the single score reported on the leaderboard for ``task``.

    Args:
        task: Task name as taken from the results path (case-insensitive).
        results: The ``"results"`` mapping loaded from one evaluation JSON file.

    Returns:
        The selected metric value, exactly as stored in the file.

    Raises:
        StopIteration: If ``results`` (or its first entry) is empty.
        KeyError: If the metric expected for a known task is missing.
    """
    first_result_key = next(iter(results))  # first key in 'results'
    task_name = task.lower()

    # TruthfulQA has two metrics; the `mc2` one is reported on the leaderboard
    if task_name == "truthfulqa":
        return results[first_result_key]["truthfulqa_mc2"]
    # IFEval has several metrics; only the prompt-level loose accuracy is reported
    if task_name == "ifeval":
        return results[first_result_key]["prompt_level_loose_acc"]
    # MMLU has several metrics; only the average entry is reported
    if task_name == "mmlu":
        return results["lighteval|mmlu:_average|5"]["acc"]
    # HellaSwag and ARC report acc_norm
    if task_name in ("hellaswag", "arc"):
        return results[first_result_key]["acc_norm"]

    # Any other task: fall back to the first metric of the first result
    first_metric_key = next(iter(results[first_result_key]))
    return results[first_result_key][first_metric_key]


def get_leaderboard_df(results_dir="eval_results"):
    """Collect evaluation results into a leaderboard DataFrame.

    Every ``*.json`` file found recursively under ``results_dir`` is read. The
    first three path components below ``results_dir`` are joined with ``_`` to
    form the model name, the fourth component is the task, and the date is
    parsed from the last ``_``-separated chunk of the file name. Results are
    grouped into one row per (model, date) pair with one column per task.

    Args:
        results_dir: Root directory holding the results tree (``str`` or
            ``Path``). Defaults to ``"eval_results"``.

    Returns:
        A DataFrame with columns ``Model``, ``Date``, ``Average`` followed by
        one column per task (``Ifeval`` first when present). Scores are
        multiplied by 100 and rounded to 2 decimals; ``Average`` is the
        row-wise mean over the tasks that have a value on that row. Rows are
        sorted by ``Average`` in descending order. When no results are found,
        an empty frame with columns ``Model``, ``Date``, ``Average`` is returned.
    """
    root = Path(results_dir)
    # Sorted so that column order, and which file wins when several files map
    # to the same (model, date, task) cell, do not depend on filesystem order.
    filepaths = sorted(root.rglob("*.json"))

    # One record per "<model>_<date>" key; every record starts with its Date so
    # that "Date" is always the first column of the resulting frame.
    records = {}
    for filepath in filepaths:
        path_parts = filepath.relative_to(root).parts
        # Need at least <a>/<b>/<c>/<task>/<file>.json below the root; anything
        # shallower cannot be attributed to a model and task, so skip it.
        if len(path_parts) < 5:
            continue

        # Same parsing as before: last "_" chunk of the stem, minus its last
        # three characters, then everything before the "T" separator.
        date = filepath.stem.split("_")[-1][:-3].split("T")[0]
        model_revision = "_".join(path_parts[0:3]) + "_" + date
        task = path_parts[3].capitalize()

        with open(filepath, "r", encoding="utf-8") as file:
            data = json.load(file)

        record = records.setdefault(model_revision, {"Date": date})
        record[task] = _extract_metric(task, data["results"])

    if not records:
        return pd.DataFrame(columns=["Model", "Date", "Average"])

    # Build the frame once instead of enlarging it one cell at a time.
    df = pd.DataFrame(list(records.values()), index=list(records.keys()))

    # Put IFEval in first column (right after Date) when it was evaluated at all
    if "Ifeval" in df.columns:
        df.insert(1, "Ifeval", df.pop("Ifeval"))
    # Drop rows where every task entry is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    # Strip off date from model name (the date itself contains no "_")
    df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
    return df
\n", " | Model | \n", "Date | \n", "Average | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "NousResearch_Nous-Hermes-2-Yi-34B_main | \n", "2024-03-04 | \n", "74.01 | \n", "NaN | \n", "61.44 | \n", "80.58 | \n", "NaN | \n", "76.24 | \n", "83.79 | \n", "68.00 | \n", "
1 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-05 | \n", "71.62 | \n", "55.27 | \n", "NaN | \n", "NaN | \n", "76.12 | \n", "71.18 | \n", "83.94 | \n", "NaN | \n", "
2 | \n", "NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main | \n", "2024-03-02 | \n", "70.43 | \n", "59.33 | \n", "64.76 | \n", "78.53 | \n", "62.17 | \n", "71.96 | \n", "85.42 | \n", "70.82 | \n", "
3 | \n", "mistralai_Mixtral-8x7B-Instruct-v0.1_main | \n", "2024-03-02 | \n", "69.80 | \n", "55.08 | \n", "70.79 | \n", "73.56 | \n", "59.89 | \n", "70.60 | \n", "86.68 | \n", "72.01 | \n", "
4 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-04 | \n", "67.03 | \n", "NaN | \n", "57.78 | \n", "79.16 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "64.16 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
269 | \n", "HuggingFaceH4_starcoder2-15b-ift_v18.0 | \n", "2024-03-10 | \n", "11.23 | \n", "21.63 | \n", "NaN | \n", "NaN | \n", "0.83 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
270 | \n", "HuggingFaceH4_mistral-7b-ift_v49.0 | \n", "2024-03-07 | \n", "10.07 | \n", "20.15 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
271 | \n", "HuggingFaceH4_starchat-beta_main | \n", "2024-03-12 | \n", "8.13 | \n", "8.13 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
272 | \n", "HuggingFaceH4_starcoder2-15b-ift_v7.0 | \n", "2024-03-10 | \n", "7.88 | \n", "12.57 | \n", "NaN | \n", "NaN | \n", "3.18 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
273 | \n", "HuggingFaceH4_zephyr-7b-beta-ift_v1.1 | \n", "2024-03-13 | \n", "4.71 | \n", "9.43 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
274 rows × 10 columns
\n", "\n", " | Model | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "HuggingFaceH4_mistral-7b-ift_v41.0 | \n", "44.36 | \n", "49.35 | \n", "72.93 | \n", "37.30 | \n", "60.82 | \n", "79.70 | \n", "58.36 | \n", "
1 | \n", "HuggingFaceH4_mistral-7b-ift_v41.1 | \n", "47.32 | \n", "47.89 | \n", "72.69 | \n", "36.32 | \n", "60.34 | \n", "79.57 | \n", "57.51 | \n", "
2 | \n", "HuggingFaceH4_mistral-7b-ift_v41.10 | \n", "32.72 | \n", "51.05 | \n", "72.45 | \n", "25.93 | \n", "59.75 | \n", "81.92 | \n", "59.22 | \n", "
3 | \n", "HuggingFaceH4_mistral-7b-ift_v41.11 | \n", "37.89 | \n", "51.05 | \n", "64.56 | \n", "17.59 | \n", "57.60 | \n", "77.65 | \n", "55.89 | \n", "
4 | \n", "HuggingFaceH4_mistral-7b-ift_v41.12 | \n", "37.89 | \n", "45.94 | \n", "63.30 | \n", "21.15 | \n", "58.50 | \n", "74.94 | \n", "52.73 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
258 | \n", "mistralai_Mistral-7B-Instruct-v0.2_main | \n", "53.97 | \n", "70.68 | \n", "68.82 | \n", "38.13 | \n", "59.43 | \n", "83.45 | \n", "65.70 | \n", "
259 | \n", "mistralai_Mixtral-8x7B-Instruct-v0.1_main | \n", "55.08 | \n", "70.79 | \n", "73.56 | \n", "59.89 | \n", "70.60 | \n", "86.68 | \n", "72.01 | \n", "
260 | \n", "openchat_openchat-3.5-0106_main | \n", "54.71 | \n", "57.55 | \n", "72.53 | \n", "66.19 | \n", "63.72 | \n", "80.10 | \n", "61.01 | \n", "
261 | \n", "stabilityai_stablelm-zephyr-3b_main | \n", "34.75 | \n", "46.19 | \n", "58.41 | \n", "40.18 | \n", "45.18 | \n", "71.57 | \n", "45.82 | \n", "
262 | \n", "teknium_OpenHermes-2.5-Mistral-7B_main | \n", "52.68 | \n", "58.62 | \n", "72.14 | \n", "54.06 | \n", "63.01 | \n", "82.34 | \n", "62.97 | \n", "
263 rows × 8 columns
\n", "\n", " | Model | \n", "Date | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "NousResearch_Nous-Hermes-2-Yi-34B_main | \n", "2024-03-04 | \n", "39.00 | \n", "61.44 | \n", "80.58 | \n", "67.93 | \n", "76.24 | \n", "83.79 | \n", "68.00 | \n", "
1 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-05 | \n", "55.27 | \n", "57.78 | \n", "79.16 | \n", "76.12 | \n", "71.18 | \n", "83.94 | \n", "64.16 | \n", "
2 | \n", "NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main | \n", "2024-03-02 | \n", "59.33 | \n", "64.76 | \n", "78.53 | \n", "62.17 | \n", "71.96 | \n", "85.42 | \n", "70.82 | \n", "
3 | \n", "mistralai_Mixtral-8x7B-Instruct-v0.1_main | \n", "2024-03-02 | \n", "55.08 | \n", "70.79 | \n", "73.56 | \n", "59.89 | \n", "70.60 | \n", "86.68 | \n", "72.01 | \n", "
4 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-04 | \n", "55.27 | \n", "57.78 | \n", "79.16 | \n", "76.12 | \n", "71.18 | \n", "83.94 | \n", "64.16 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
269 | \n", "HuggingFaceH4_starcoder2-15b-ift_v18.0 | \n", "2024-03-10 | \n", "21.63 | \n", "NaN | \n", "NaN | \n", "0.83 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
270 | \n", "HuggingFaceH4_mistral-7b-ift_v49.0 | \n", "2024-03-07 | \n", "20.15 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
271 | \n", "HuggingFaceH4_starchat-beta_main | \n", "2024-03-12 | \n", "8.13 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
272 | \n", "HuggingFaceH4_starcoder2-15b-ift_v7.0 | \n", "2024-03-10 | \n", "12.57 | \n", "NaN | \n", "NaN | \n", "3.18 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
273 | \n", "HuggingFaceH4_zephyr-7b-beta-ift_v1.1 | \n", "2024-03-13 | \n", "9.43 | \n", "NaN | \n", "NaN | \n", "0.00 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
274 rows × 9 columns
\n", "