{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from pathlib import Path\n",
"\n",
"import gradio as gr\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def get_leaderboard_df():\n",
" filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n",
"\n",
" # Parse filepaths to get unique models\n",
" models = set()\n",
" for filepath in filepaths:\n",
" path_parts = Path(filepath).parts\n",
" model_revision = \"_\".join(path_parts[1:4])\n",
" models.add(model_revision)\n",
"\n",
" # Initialize DataFrame\n",
" df = pd.DataFrame(index=list(models))\n",
"\n",
" # Extract data from each file and populate the DataFrame\n",
" for filepath in filepaths:\n",
" path_parts = Path(filepath).parts\n",
" model_revision = \"_\".join(path_parts[1:4])\n",
" task = path_parts[4].capitalize()\n",
" # Extract timestamp from filepath\n",
" timestamp = filepath.stem.split(\"_\")[-1][:-3]\n",
" df.loc[model_revision, \"Timestamp\"] = timestamp\n",
"\n",
" with open(filepath, \"r\") as file:\n",
" data = json.load(file)\n",
" first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n",
" # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
" if task == \"truthfulqa\":\n",
" value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
" else:\n",
" first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n",
" value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n",
" df.loc[model_revision, task] = value\n",
" \n",
" df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
" df = df.sort_values(by=[\"Average\"], ascending=False)\n",
" df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n",
" return df"
]
},
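{
"cell_type": "markdown",
"metadata": {},
"source": [
"`get_leaderboard_df` walks `eval_results/` and, judging from the parsing code, expects paths of the form `eval_results/<org>/<model>/<revision>/<task>/<prefix>_<timestamp>.json`. The cell below is an illustrative sketch only (the example path and filename prefix are hypothetical) showing how one such path maps to the `Model` index, the task column, and the `Timestamp` column."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: a hypothetical result path, parsed the same way as in get_leaderboard_df\n",
"example = Path(\"eval_results/Qwen/Qwen1.5-0.5B-Chat/main/arc/results_2024-02-28T07-35-58.803123.json\")\n",
"parts = example.parts\n",
"print(\"_\".join(parts[1:4]))  # Qwen_Qwen1.5-0.5B-Chat_main -> the 'Model' index\n",
"print(parts[4].capitalize())  # Arc -> the task column name\n",
"print(example.stem.split(\"_\")[-1][:-3])  # 2024-02-28T07-35-58.803 -> the 'Timestamp' column"
]
},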
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"df = get_leaderboard_df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Model</th>\n",
" <th>Timestamp</th>\n",
" <th>Average</th>\n",
" <th>Truthfulqa</th>\n",
" <th>Winogrande</th>\n",
" <th>Gsm8k</th>\n",
" <th>Hellaswag</th>\n",
" <th>Arc</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Qwen_Qwen1.5-0.5B-Chat_main</td>\n",
" <td>2024-02-28T07-35-58.803</td>\n",
" <td>0.296</td>\n",
" <td>0.271</td>\n",
" <td>0.519</td>\n",
" <td>0.039</td>\n",
" <td>0.363</td>\n",
" <td>0.287</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Model Timestamp Average Truthfulqa \\\n",
"0 Qwen_Qwen1.5-0.5B-Chat_main 2024-02-28T07-35-58.803 0.296 0.271 \n",
"\n",
" Winogrande Gsm8k Hellaswag Arc \n",
"0 0.519 0.039 0.363 0.287 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
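{
"cell_type": "markdown",
"metadata": {},
"source": [
"`gradio` is imported above but not used in this notebook. As a minimal sketch (an assumed layout, not the actual Space code), the leaderboard table could be served like this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch (assumed): display the leaderboard DataFrame in a Gradio app\n",
"with gr.Blocks() as demo:\n",
"    gr.Markdown(\"# Leaderboard\")\n",
"    gr.Dataframe(value=df, interactive=False)\n",
"\n",
"# demo.launch()  # uncomment to serve the table locally"
]
},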
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "hf",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}