lewtun HF Staff committed on
Commit
599688f
·
1 Parent(s): 3bf1be5

Unstack models by date

Browse files
Files changed (3) hide show
  1. a.ipynb +0 -168
  2. app.py +7 -4
  3. debug.ipynb +413 -0
a.ipynb DELETED
@@ -1,168 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import json\n",
10
- "from pathlib import Path\n",
11
- "\n",
12
- "import gradio as gr\n",
13
- "import pandas as pd"
14
- ]
15
- },
16
- {
17
- "cell_type": "code",
18
- "execution_count": 31,
19
- "metadata": {},
20
- "outputs": [],
21
- "source": [
22
- "def get_leaderboard_df():\n",
23
- " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n",
24
- "\n",
25
- " # Parse filepaths to get unique models\n",
26
- " models = set()\n",
27
- " for filepath in filepaths:\n",
28
- " path_parts = Path(filepath).parts\n",
29
- " model_revision = \"_\".join(path_parts[1:4])\n",
30
- " models.add(model_revision)\n",
31
- "\n",
32
- " # Initialize DataFrame\n",
33
- " df = pd.DataFrame(index=list(models))\n",
34
- "\n",
35
- " # Extract data from each file and populate the DataFrame\n",
36
- " for filepath in filepaths:\n",
37
- " path_parts = Path(filepath).parts\n",
38
- " model_revision = \"_\".join(path_parts[1:4])\n",
39
- " task = path_parts[4].capitalize()\n",
40
- " # Extract timestamp from filepath\n",
41
- " timestamp = filepath.stem.split(\"_\")[-1][:-3]\n",
42
- " df.loc[model_revision, \"Timestamp\"] = timestamp\n",
43
- "\n",
44
- " with open(filepath, \"r\") as file:\n",
45
- " data = json.load(file)\n",
46
- " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n",
47
- " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
48
- " if task == \"truthfulqa\":\n",
49
- " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
50
- " else:\n",
51
- " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n",
52
- " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n",
53
- " df.loc[model_revision, task] = value\n",
54
- " \n",
55
- " df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
56
- " df = df.sort_values(by=[\"Average\"], ascending=False)\n",
57
- " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n",
58
- " return df"
59
- ]
60
- },
61
- {
62
- "cell_type": "code",
63
- "execution_count": 32,
64
- "metadata": {},
65
- "outputs": [],
66
- "source": [
67
- "df = get_leaderboard_df()"
68
- ]
69
- },
70
- {
71
- "cell_type": "code",
72
- "execution_count": null,
73
- "metadata": {},
74
- "outputs": [
75
- {
76
- "data": {
77
- "text/html": [
78
- "<div>\n",
79
- "<style scoped>\n",
80
- " .dataframe tbody tr th:only-of-type {\n",
81
- " vertical-align: middle;\n",
82
- " }\n",
83
- "\n",
84
- " .dataframe tbody tr th {\n",
85
- " vertical-align: top;\n",
86
- " }\n",
87
- "\n",
88
- " .dataframe thead th {\n",
89
- " text-align: right;\n",
90
- " }\n",
91
- "</style>\n",
92
- "<table border=\"1\" class=\"dataframe\">\n",
93
- " <thead>\n",
94
- " <tr style=\"text-align: right;\">\n",
95
- " <th></th>\n",
96
- " <th>Model</th>\n",
97
- " <th>Timestamp</th>\n",
98
- " <th>Average</th>\n",
99
- " <th>Truthfulqa</th>\n",
100
- " <th>Winogrande</th>\n",
101
- " <th>Gsm8k</th>\n",
102
- " <th>Hellaswag</th>\n",
103
- " <th>Arc</th>\n",
104
- " </tr>\n",
105
- " </thead>\n",
106
- " <tbody>\n",
107
- " <tr>\n",
108
- " <th>0</th>\n",
109
- " <td>Qwen_Qwen1.5-0.5B-Chat_main</td>\n",
110
- " <td>2024-02-28T07-35-58.803</td>\n",
111
- " <td>0.296</td>\n",
112
- " <td>0.271</td>\n",
113
- " <td>0.519</td>\n",
114
- " <td>0.039</td>\n",
115
- " <td>0.363</td>\n",
116
- " <td>0.287</td>\n",
117
- " </tr>\n",
118
- " </tbody>\n",
119
- "</table>\n",
120
- "</div>"
121
- ],
122
- "text/plain": [
123
- " Model Timestamp Average Truthfulqa \\\n",
124
- "0 Qwen_Qwen1.5-0.5B-Chat_main 2024-02-28T07-35-58.803 0.296 0.271 \n",
125
- "\n",
126
- " Winogrande Gsm8k Hellaswag Arc \n",
127
- "0 0.519 0.039 0.363 0.287 "
128
- ]
129
- },
130
- "execution_count": 28,
131
- "metadata": {},
132
- "output_type": "execute_result"
133
- }
134
- ],
135
- "source": [
136
- "df"
137
- ]
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": null,
142
- "metadata": {},
143
- "outputs": [],
144
- "source": []
145
- }
146
- ],
147
- "metadata": {
148
- "kernelspec": {
149
- "display_name": "hf",
150
- "language": "python",
151
- "name": "python3"
152
- },
153
- "language_info": {
154
- "codemirror_mode": {
155
- "name": "ipython",
156
- "version": 3
157
- },
158
- "file_extension": ".py",
159
- "mimetype": "text/x-python",
160
- "name": "python",
161
- "nbconvert_exporter": "python",
162
- "pygments_lexer": "ipython3",
163
- "version": "3.10.6"
164
- }
165
- },
166
- "nbformat": 4,
167
- "nbformat_minor": 2
168
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -27,11 +27,10 @@ def get_leaderboard_df():
27
  # Extract data from each file and populate the DataFrame
28
  for filepath in filepaths:
29
  path_parts = Path(filepath).parts
30
- model_revision = "_".join(path_parts[1:4])
 
31
  task = path_parts[4].capitalize()
32
- # Extract timestamp from filepath
33
- timestamp = filepath.stem.split("_")[-1][:-3]
34
- df.loc[model_revision, "Timestamp"] = timestamp
35
 
36
  with open(filepath, "r") as file:
37
  data = json.load(file)
@@ -58,11 +57,15 @@ def get_leaderboard_df():
58
  # Put IFEval in first column
59
  ifeval_col = df.pop("Ifeval")
60
  df.insert(1, "Ifeval", ifeval_col)
 
 
61
  df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
62
  # Convert all values to percentage
63
  df[df.select_dtypes(include=["number"]).columns] *= 100.0
64
  df = df.sort_values(by=["Average"], ascending=False)
65
  df = df.reset_index().rename(columns={"index": "Model"}).round(2)
 
 
66
  return df
67
 
68
 
 
27
  # Extract data from each file and populate the DataFrame
28
  for filepath in filepaths:
29
  path_parts = Path(filepath).parts
30
+ date = filepath.stem.split("_")[-1][:-3].split("T")[0]
31
+ model_revision = "_".join(path_parts[1:4]) + "_" + date
32
  task = path_parts[4].capitalize()
33
+ df.loc[model_revision, "Date"] = date
 
 
34
 
35
  with open(filepath, "r") as file:
36
  data = json.load(file)
 
57
  # Put IFEval in first column
58
  ifeval_col = df.pop("Ifeval")
59
  df.insert(1, "Ifeval", ifeval_col)
60
+ # Drop rows where every entry is NaN
61
+ df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
62
  df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
63
  # Convert all values to percentage
64
  df[df.select_dtypes(include=["number"]).columns] *= 100.0
65
  df = df.sort_values(by=["Average"], ascending=False)
66
  df = df.reset_index().rename(columns={"index": "Model"}).round(2)
67
+ # Strip off date from model name
68
+ df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
69
  return df
70
 
71
 
debug.ipynb ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import json\n",
10
+ "from pathlib import Path\n",
11
+ "\n",
12
+ "import gradio as gr\n",
13
+ "import pandas as pd"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 51,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "def get_leaderboard_df():\n",
23
+ " filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n",
24
+ "\n",
25
+ " # Parse filepaths to get unique models\n",
26
+ " models = set()\n",
27
+ " for filepath in filepaths:\n",
28
+ " path_parts = Path(filepath).parts\n",
29
+ " model_revision = \"_\".join(path_parts[1:4])\n",
30
+ " models.add(model_revision)\n",
31
+ "\n",
32
+ " # Initialize DataFrame\n",
33
+ " df = pd.DataFrame(index=list(models))\n",
34
+ "\n",
35
+ " # Extract data from each file and populate the DataFrame\n",
36
+ " for filepath in filepaths:\n",
37
+ " path_parts = Path(filepath).parts\n",
38
+ " date = filepath.stem.split(\"_\")[-1][:-3].split(\"T\")[0]\n",
39
+ " model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n",
40
+ " task = path_parts[4].capitalize()\n",
41
+ " df.loc[model_revision, \"Date\"] = date\n",
42
+ "\n",
43
+ " with open(filepath, \"r\") as file:\n",
44
+ " data = json.load(file)\n",
45
+ " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n",
46
+ " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
47
+ " if task == \"truthfulqa\":\n",
48
+ " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
49
+ " else:\n",
50
+ " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n",
51
+ " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n",
52
+ " df.loc[model_revision, task] = value\n",
53
+ " \n",
54
+ " # Drop rows where every entry is NaN\n",
55
+ " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n",
56
+ " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
57
+ " df = df.sort_values(by=[\"Average\"], ascending=False)\n",
58
+ " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n",
59
+ " # Strip off date from model name\n",
60
+ " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n",
61
+ " return df"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 52,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "df = get_leaderboard_df()"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 53,
76
+ "metadata": {},
77
+ "outputs": [
78
+ {
79
+ "data": {
80
+ "text/html": [
81
+ "<div>\n",
82
+ "<style scoped>\n",
83
+ " .dataframe tbody tr th:only-of-type {\n",
84
+ " vertical-align: middle;\n",
85
+ " }\n",
86
+ "\n",
87
+ " .dataframe tbody tr th {\n",
88
+ " vertical-align: top;\n",
89
+ " }\n",
90
+ "\n",
91
+ " .dataframe thead th {\n",
92
+ " text-align: right;\n",
93
+ " }\n",
94
+ "</style>\n",
95
+ "<table border=\"1\" class=\"dataframe\">\n",
96
+ " <thead>\n",
97
+ " <tr style=\"text-align: right;\">\n",
98
+ " <th></th>\n",
99
+ " <th>Model</th>\n",
100
+ " <th>Date</th>\n",
101
+ " <th>Average</th>\n",
102
+ " <th>Ifeval</th>\n",
103
+ " <th>Truthfulqa</th>\n",
104
+ " <th>Winogrande</th>\n",
105
+ " <th>Gsm8k</th>\n",
106
+ " <th>Mmlu</th>\n",
107
+ " <th>Hellaswag</th>\n",
108
+ " <th>Arc</th>\n",
109
+ " </tr>\n",
110
+ " </thead>\n",
111
+ " <tbody>\n",
112
+ " <tr>\n",
113
+ " <th>0</th>\n",
114
+ " <td>NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main</td>\n",
115
+ " <td>2024-03-02</td>\n",
116
+ " <td>0.617</td>\n",
117
+ " <td>0.553</td>\n",
118
+ " <td>0.477</td>\n",
119
+ " <td>0.785</td>\n",
120
+ " <td>0.622</td>\n",
121
+ " <td>0.51</td>\n",
122
+ " <td>0.677</td>\n",
123
+ " <td>0.698</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>1</th>\n",
127
+ " <td>NousResearch_Nous-Hermes-2-Yi-34B_main</td>\n",
128
+ " <td>2024-03-04</td>\n",
129
+ " <td>0.604</td>\n",
130
+ " <td>NaN</td>\n",
131
+ " <td>0.439</td>\n",
132
+ " <td>0.806</td>\n",
133
+ " <td>NaN</td>\n",
134
+ " <td>0.48</td>\n",
135
+ " <td>0.640</td>\n",
136
+ " <td>0.654</td>\n",
137
+ " </tr>\n",
138
+ " <tr>\n",
139
+ " <th>2</th>\n",
140
+ " <td>mistralai_Mixtral-8x7B-Instruct-v0.1_main</td>\n",
141
+ " <td>2024-03-02</td>\n",
142
+ " <td>0.603</td>\n",
143
+ " <td>0.497</td>\n",
144
+ " <td>0.554</td>\n",
145
+ " <td>0.736</td>\n",
146
+ " <td>0.599</td>\n",
147
+ " <td>0.43</td>\n",
148
+ " <td>0.709</td>\n",
149
+ " <td>0.698</td>\n",
150
+ " </tr>\n",
151
+ " <tr>\n",
152
+ " <th>3</th>\n",
153
+ " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
154
+ " <td>2024-03-04</td>\n",
155
+ " <td>0.603</td>\n",
156
+ " <td>NaN</td>\n",
157
+ " <td>0.395</td>\n",
158
+ " <td>0.792</td>\n",
159
+ " <td>NaN</td>\n",
160
+ " <td>NaN</td>\n",
161
+ " <td>NaN</td>\n",
162
+ " <td>0.622</td>\n",
163
+ " </tr>\n",
164
+ " <tr>\n",
165
+ " <th>4</th>\n",
166
+ " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
167
+ " <td>2024-03-05</td>\n",
168
+ " <td>0.585</td>\n",
169
+ " <td>0.505</td>\n",
170
+ " <td>NaN</td>\n",
171
+ " <td>NaN</td>\n",
172
+ " <td>0.761</td>\n",
173
+ " <td>0.42</td>\n",
174
+ " <td>0.654</td>\n",
175
+ " <td>NaN</td>\n",
176
+ " </tr>\n",
177
+ " <tr>\n",
178
+ " <th>...</th>\n",
179
+ " <td>...</td>\n",
180
+ " <td>...</td>\n",
181
+ " <td>...</td>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>269</th>\n",
192
+ " <td>HuggingFaceH4_starcoder2-15b-ift_v18.0</td>\n",
193
+ " <td>2024-03-10</td>\n",
194
+ " <td>0.089</td>\n",
195
+ " <td>0.170</td>\n",
196
+ " <td>NaN</td>\n",
197
+ " <td>NaN</td>\n",
198
+ " <td>0.008</td>\n",
199
+ " <td>NaN</td>\n",
200
+ " <td>NaN</td>\n",
201
+ " <td>NaN</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>270</th>\n",
205
+ " <td>HuggingFaceH4_mistral-7b-ift_v49.0</td>\n",
206
+ " <td>2024-03-07</td>\n",
207
+ " <td>0.086</td>\n",
208
+ " <td>0.172</td>\n",
209
+ " <td>NaN</td>\n",
210
+ " <td>NaN</td>\n",
211
+ " <td>0.000</td>\n",
212
+ " <td>NaN</td>\n",
213
+ " <td>NaN</td>\n",
214
+ " <td>NaN</td>\n",
215
+ " </tr>\n",
216
+ " <tr>\n",
217
+ " <th>271</th>\n",
218
+ " <td>HuggingFaceH4_starchat-beta_main</td>\n",
219
+ " <td>2024-03-12</td>\n",
220
+ " <td>0.079</td>\n",
221
+ " <td>0.079</td>\n",
222
+ " <td>NaN</td>\n",
223
+ " <td>NaN</td>\n",
224
+ " <td>NaN</td>\n",
225
+ " <td>NaN</td>\n",
226
+ " <td>NaN</td>\n",
227
+ " <td>NaN</td>\n",
228
+ " </tr>\n",
229
+ " <tr>\n",
230
+ " <th>272</th>\n",
231
+ " <td>HuggingFaceH4_starcoder2-15b-ift_v7.0</td>\n",
232
+ " <td>2024-03-10</td>\n",
233
+ " <td>0.070</td>\n",
234
+ " <td>0.107</td>\n",
235
+ " <td>NaN</td>\n",
236
+ " <td>NaN</td>\n",
237
+ " <td>0.032</td>\n",
238
+ " <td>NaN</td>\n",
239
+ " <td>NaN</td>\n",
240
+ " <td>NaN</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>273</th>\n",
244
+ " <td>HuggingFaceH4_zephyr-7b-beta-ift_v1.1</td>\n",
245
+ " <td>2024-03-13</td>\n",
246
+ " <td>0.043</td>\n",
247
+ " <td>0.087</td>\n",
248
+ " <td>NaN</td>\n",
249
+ " <td>NaN</td>\n",
250
+ " <td>0.000</td>\n",
251
+ " <td>NaN</td>\n",
252
+ " <td>NaN</td>\n",
253
+ " <td>NaN</td>\n",
254
+ " </tr>\n",
255
+ " </tbody>\n",
256
+ "</table>\n",
257
+ "<p>274 rows × 10 columns</p>\n",
258
+ "</div>"
259
+ ],
260
+ "text/plain": [
261
+ " Model Date Average \\\n",
262
+ "0 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 0.617 \n",
263
+ "1 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 0.604 \n",
264
+ "2 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 0.603 \n",
265
+ "3 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 0.603 \n",
266
+ "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 0.585 \n",
267
+ ".. ... ... ... \n",
268
+ "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 0.089 \n",
269
+ "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 0.086 \n",
270
+ "271 HuggingFaceH4_starchat-beta_main 2024-03-12 0.079 \n",
271
+ "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 0.070 \n",
272
+ "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 0.043 \n",
273
+ "\n",
274
+ " Ifeval Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n",
275
+ "0 0.553 0.477 0.785 0.622 0.51 0.677 0.698 \n",
276
+ "1 NaN 0.439 0.806 NaN 0.48 0.640 0.654 \n",
277
+ "2 0.497 0.554 0.736 0.599 0.43 0.709 0.698 \n",
278
+ "3 NaN 0.395 0.792 NaN NaN NaN 0.622 \n",
279
+ "4 0.505 NaN NaN 0.761 0.42 0.654 NaN \n",
280
+ ".. ... ... ... ... ... ... ... \n",
281
+ "269 0.170 NaN NaN 0.008 NaN NaN NaN \n",
282
+ "270 0.172 NaN NaN 0.000 NaN NaN NaN \n",
283
+ "271 0.079 NaN NaN NaN NaN NaN NaN \n",
284
+ "272 0.107 NaN NaN 0.032 NaN NaN NaN \n",
285
+ "273 0.087 NaN NaN 0.000 NaN NaN NaN \n",
286
+ "\n",
287
+ "[274 rows x 10 columns]"
288
+ ]
289
+ },
290
+ "execution_count": 53,
291
+ "metadata": {},
292
+ "output_type": "execute_result"
293
+ }
294
+ ],
295
+ "source": [
296
+ "df"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 32,
302
+ "metadata": {},
303
+ "outputs": [
304
+ {
305
+ "data": {
306
+ "text/html": [
307
+ "<div>\n",
308
+ "<style scoped>\n",
309
+ " .dataframe tbody tr th:only-of-type {\n",
310
+ " vertical-align: middle;\n",
311
+ " }\n",
312
+ "\n",
313
+ " .dataframe tbody tr th {\n",
314
+ " vertical-align: top;\n",
315
+ " }\n",
316
+ "\n",
317
+ " .dataframe thead th {\n",
318
+ " text-align: right;\n",
319
+ " }\n",
320
+ "</style>\n",
321
+ "<table border=\"1\" class=\"dataframe\">\n",
322
+ " <thead>\n",
323
+ " <tr style=\"text-align: right;\">\n",
324
+ " <th></th>\n",
325
+ " <th>Model</th>\n",
326
+ " <th>Average</th>\n",
327
+ " <th>Ifeval</th>\n",
328
+ " <th>Truthfulqa</th>\n",
329
+ " <th>Winogrande</th>\n",
330
+ " <th>Gsm8k</th>\n",
331
+ " <th>Mmlu</th>\n",
332
+ " <th>Hellaswag</th>\n",
333
+ " <th>Arc</th>\n",
334
+ " </tr>\n",
335
+ " </thead>\n",
336
+ " <tbody>\n",
337
+ " <tr>\n",
338
+ " <th>50</th>\n",
339
+ " <td>HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08</td>\n",
340
+ " <td>0.49</td>\n",
341
+ " <td>0.418</td>\n",
342
+ " <td>0.359</td>\n",
343
+ " <td>0.672</td>\n",
344
+ " <td>0.453</td>\n",
345
+ " <td>0.33</td>\n",
346
+ " <td>0.656</td>\n",
347
+ " <td>0.545</td>\n",
348
+ " </tr>\n",
349
+ " <tr>\n",
350
+ " <th>532</th>\n",
351
+ " <td>HuggingFaceH4_mistral-7b-ift_v48.56</td>\n",
352
+ " <td>NaN</td>\n",
353
+ " <td>NaN</td>\n",
354
+ " <td>NaN</td>\n",
355
+ " <td>NaN</td>\n",
356
+ " <td>NaN</td>\n",
357
+ " <td>NaN</td>\n",
358
+ " <td>NaN</td>\n",
359
+ " <td>NaN</td>\n",
360
+ " </tr>\n",
361
+ " </tbody>\n",
362
+ "</table>\n",
363
+ "</div>"
364
+ ],
365
+ "text/plain": [
366
+ " Model Average Ifeval \\\n",
367
+ "50 HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08 0.49 0.418 \n",
368
+ "532 HuggingFaceH4_mistral-7b-ift_v48.56 NaN NaN \n",
369
+ "\n",
370
+ " Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n",
371
+ "50 0.359 0.672 0.453 0.33 0.656 0.545 \n",
372
+ "532 NaN NaN NaN NaN NaN NaN "
373
+ ]
374
+ },
375
+ "execution_count": 32,
376
+ "metadata": {},
377
+ "output_type": "execute_result"
378
+ }
379
+ ],
380
+ "source": [
381
+ "df[df['Model'].str.contains(\"HuggingFaceH4_mistral-7b-ift_v48.56\")]"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": null,
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": []
390
+ }
391
+ ],
392
+ "metadata": {
393
+ "kernelspec": {
394
+ "display_name": "hf",
395
+ "language": "python",
396
+ "name": "python3"
397
+ },
398
+ "language_info": {
399
+ "codemirror_mode": {
400
+ "name": "ipython",
401
+ "version": 3
402
+ },
403
+ "file_extension": ".py",
404
+ "mimetype": "text/x-python",
405
+ "name": "python",
406
+ "nbconvert_exporter": "python",
407
+ "pygments_lexer": "ipython3",
408
+ "version": "3.10.6"
409
+ }
410
+ },
411
+ "nbformat": 4,
412
+ "nbformat_minor": 2
413
+ }