{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "from pathlib import Path\n", "\n", "import gradio as gr\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "def get_leaderboard_df():\n", " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n", "\n", " # Parse filepaths to get unique models\n", " models = set()\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " model_revision = \"_\".join(path_parts[1:4])\n", " models.add(model_revision)\n", "\n", " # Initialize DataFrame\n", " df = pd.DataFrame(index=list(models))\n", "\n", " # Extract data from each file and populate the DataFrame\n", " for filepath in filepaths:\n", " path_parts = Path(filepath).parts\n", " date = filepath.stem.split(\"_\")[-1][:-3].split(\"T\")[0]\n", " model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n", " task = path_parts[4].capitalize()\n", " df.loc[model_revision, \"Date\"] = date\n", "\n", " with open(filepath, \"r\") as file:\n", " data = json.load(file)\n", " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n", " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n", " if task == \"truthfulqa\":\n", " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n", " else:\n", " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n", " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n", " df.loc[model_revision, task] = value\n", " \n", " # Drop rows where every entry is NaN\n", " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n", " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n", " df = df.sort_values(by=[\"Average\"], ascending=False)\n", " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n", " # Strip off date from model name\n", " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n", " return df" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "df = get_leaderboard_df()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Model | \n", "Date | \n", "Average | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main | \n", "2024-03-02 | \n", "0.617 | \n", "0.553 | \n", "0.477 | \n", "0.785 | \n", "0.622 | \n", "0.51 | \n", "0.677 | \n", "0.698 | \n", "
1 | \n", "NousResearch_Nous-Hermes-2-Yi-34B_main | \n", "2024-03-04 | \n", "0.604 | \n", "NaN | \n", "0.439 | \n", "0.806 | \n", "NaN | \n", "0.48 | \n", "0.640 | \n", "0.654 | \n", "
2 | \n", "mistralai_Mixtral-8x7B-Instruct-v0.1_main | \n", "2024-03-02 | \n", "0.603 | \n", "0.497 | \n", "0.554 | \n", "0.736 | \n", "0.599 | \n", "0.43 | \n", "0.709 | \n", "0.698 | \n", "
3 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-04 | \n", "0.603 | \n", "NaN | \n", "0.395 | \n", "0.792 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.622 | \n", "
4 | \n", "deepseek-ai_deepseek-llm-67b-chat_main | \n", "2024-03-05 | \n", "0.585 | \n", "0.505 | \n", "NaN | \n", "NaN | \n", "0.761 | \n", "0.42 | \n", "0.654 | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
269 | \n", "HuggingFaceH4_starcoder2-15b-ift_v18.0 | \n", "2024-03-10 | \n", "0.089 | \n", "0.170 | \n", "NaN | \n", "NaN | \n", "0.008 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
270 | \n", "HuggingFaceH4_mistral-7b-ift_v49.0 | \n", "2024-03-07 | \n", "0.086 | \n", "0.172 | \n", "NaN | \n", "NaN | \n", "0.000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
271 | \n", "HuggingFaceH4_starchat-beta_main | \n", "2024-03-12 | \n", "0.079 | \n", "0.079 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
272 | \n", "HuggingFaceH4_starcoder2-15b-ift_v7.0 | \n", "2024-03-10 | \n", "0.070 | \n", "0.107 | \n", "NaN | \n", "NaN | \n", "0.032 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
273 | \n", "HuggingFaceH4_zephyr-7b-beta-ift_v1.1 | \n", "2024-03-13 | \n", "0.043 | \n", "0.087 | \n", "NaN | \n", "NaN | \n", "0.000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
274 rows × 10 columns
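  { "cell_type": "markdown", "metadata": {}, "source": [
    "A minimal sketch of how the resulting leaderboard DataFrame could be displayed with the `gradio` import above. The title and layout here are illustrative assumptions, not the actual app."
  ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
    "# Sketch only: render the leaderboard in a simple Gradio Blocks demo\n",
    "with gr.Blocks() as demo:\n",
    "    gr.Markdown(\"# Evaluation leaderboard\")\n",
    "    gr.Dataframe(value=df, interactive=False)\n",
    "\n",
    "# demo.launch()  # uncomment to serve the demo locally"
  ] },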
\n", "\n", " | Model | \n", "Average | \n", "Ifeval | \n", "Truthfulqa | \n", "Winogrande | \n", "Gsm8k | \n", "Mmlu | \n", "Hellaswag | \n", "Arc | \n", "
---|---|---|---|---|---|---|---|---|---|
50 | \n", "HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08 | \n", "0.49 | \n", "0.418 | \n", "0.359 | \n", "0.672 | \n", "0.453 | \n", "0.33 | \n", "0.656 | \n", "0.545 | \n", "
532 | \n", "HuggingFaceH4_mistral-7b-ift_v48.56 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "