lewtun HF staff commited on
Commit
e0f6c02
·
verified ·
1 Parent(s): c84416d

Upload eval_results/AI-MO/mistral-7b-sft/aimo_v03.01/math_v2/results_2024-04-27T07-57-00.481918.json with huggingface_hub

Browse files
eval_results/AI-MO/mistral-7b-sft/aimo_v03.01/math_v2/results_2024-04-27T07-57-00.481918.json ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1404883.661664768,
9
+ "end_time": 1414730.017814066,
10
+ "total_evaluation_time_secondes": "9846.356149298139",
11
+ "model_name": "AI-MO/mistral-7b-sft",
12
+ "model_sha": "147a5797ae74bc37a561429745b8781fde7e9eff",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|math_v2:algebra|0": {
19
+ "qem": 0.37068239258635216,
20
+ "qem_stderr": 0.014024699857095878
21
+ },
22
+ "custom|math_v2:counting_and_probability|0": {
23
+ "qem": 0.12236286919831224,
24
+ "qem_stderr": 0.015067866025208529
25
+ },
26
+ "custom|math_v2:geometry|0": {
27
+ "qem": 0.1336116910229645,
28
+ "qem_stderr": 0.015561969995340364
29
+ },
30
+ "custom|math_v2:intermediate_algebra|0": {
31
+ "qem": 0.10188261351052048,
32
+ "qem_stderr": 0.010071944446768333
33
+ },
34
+ "custom|math_v2:number_theory|0": {
35
+ "qem": 0.08148148148148149,
36
+ "qem_stderr": 0.011783628281121668
37
+ },
38
+ "custom|math_v2:prealgebra|0": {
39
+ "qem": 0.3157290470723307,
40
+ "qem_stderr": 0.015758384592042537
41
+ },
42
+ "custom|math_v2:precalculus|0": {
43
+ "qem": 0.11172161172161173,
44
+ "qem_stderr": 0.013494130099732627
45
+ },
46
+ "custom|math_v2:_average|0": {
47
+ "qem": 0.17678167237051048,
48
+ "qem_stderr": 0.013680374756758564
49
+ },
50
+ "all": {
51
+ "qem": 0.17678167237051048,
52
+ "qem_stderr": 0.013680374756758564
53
+ }
54
+ },
55
+ "versions": {
56
+ "custom|math_v2:algebra|0": 0,
57
+ "custom|math_v2:counting_and_probability|0": 0,
58
+ "custom|math_v2:geometry|0": 0,
59
+ "custom|math_v2:intermediate_algebra|0": 0,
60
+ "custom|math_v2:number_theory|0": 0,
61
+ "custom|math_v2:prealgebra|0": 0,
62
+ "custom|math_v2:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "custom|math_v2:algebra": {
66
+ "name": "math_v2:algebra",
67
+ "prompt_function": "math_prompt_fn",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": null,
85
+ "output_regex": null,
86
+ "frozen": false,
87
+ "suite": [
88
+ "custom"
89
+ ],
90
+ "original_num_docs": 1187,
91
+ "effective_num_docs": 1187,
92
+ "trust_dataset": null,
93
+ "must_remove_duplicate_docs": null
94
+ },
95
+ "custom|math_v2:counting_and_probability": {
96
+ "name": "math_v2:counting_and_probability",
97
+ "prompt_function": "math_prompt_fn",
98
+ "hf_repo": "lighteval/MATH",
99
+ "hf_subset": "counting_and_probability",
100
+ "metric": [
101
+ "quasi_exact_match_math"
102
+ ],
103
+ "hf_avail_splits": [
104
+ "train",
105
+ "test",
106
+ "validation"
107
+ ],
108
+ "evaluation_splits": [
109
+ "test"
110
+ ],
111
+ "few_shots_split": null,
112
+ "few_shots_select": null,
113
+ "generation_size": 2048,
114
+ "stop_sequence": null,
115
+ "output_regex": null,
116
+ "frozen": false,
117
+ "suite": [
118
+ "custom"
119
+ ],
120
+ "original_num_docs": 474,
121
+ "effective_num_docs": 474,
122
+ "trust_dataset": null,
123
+ "must_remove_duplicate_docs": null
124
+ },
125
+ "custom|math_v2:geometry": {
126
+ "name": "math_v2:geometry",
127
+ "prompt_function": "math_prompt_fn",
128
+ "hf_repo": "lighteval/MATH",
129
+ "hf_subset": "geometry",
130
+ "metric": [
131
+ "quasi_exact_match_math"
132
+ ],
133
+ "hf_avail_splits": [
134
+ "train",
135
+ "test",
136
+ "validation"
137
+ ],
138
+ "evaluation_splits": [
139
+ "test"
140
+ ],
141
+ "few_shots_split": null,
142
+ "few_shots_select": null,
143
+ "generation_size": 2048,
144
+ "stop_sequence": null,
145
+ "output_regex": null,
146
+ "frozen": false,
147
+ "suite": [
148
+ "custom"
149
+ ],
150
+ "original_num_docs": 479,
151
+ "effective_num_docs": 479,
152
+ "trust_dataset": null,
153
+ "must_remove_duplicate_docs": null
154
+ },
155
+ "custom|math_v2:intermediate_algebra": {
156
+ "name": "math_v2:intermediate_algebra",
157
+ "prompt_function": "math_prompt_fn",
158
+ "hf_repo": "lighteval/MATH",
159
+ "hf_subset": "intermediate_algebra",
160
+ "metric": [
161
+ "quasi_exact_match_math"
162
+ ],
163
+ "hf_avail_splits": [
164
+ "train",
165
+ "test",
166
+ "validation"
167
+ ],
168
+ "evaluation_splits": [
169
+ "test"
170
+ ],
171
+ "few_shots_split": null,
172
+ "few_shots_select": null,
173
+ "generation_size": 2048,
174
+ "stop_sequence": null,
175
+ "output_regex": null,
176
+ "frozen": false,
177
+ "suite": [
178
+ "custom"
179
+ ],
180
+ "original_num_docs": 903,
181
+ "effective_num_docs": 903,
182
+ "trust_dataset": null,
183
+ "must_remove_duplicate_docs": null
184
+ },
185
+ "custom|math_v2:number_theory": {
186
+ "name": "math_v2:number_theory",
187
+ "prompt_function": "math_prompt_fn",
188
+ "hf_repo": "lighteval/MATH",
189
+ "hf_subset": "number_theory",
190
+ "metric": [
191
+ "quasi_exact_match_math"
192
+ ],
193
+ "hf_avail_splits": [
194
+ "train",
195
+ "test",
196
+ "validation"
197
+ ],
198
+ "evaluation_splits": [
199
+ "test"
200
+ ],
201
+ "few_shots_split": null,
202
+ "few_shots_select": null,
203
+ "generation_size": 2048,
204
+ "stop_sequence": null,
205
+ "output_regex": null,
206
+ "frozen": false,
207
+ "suite": [
208
+ "custom"
209
+ ],
210
+ "original_num_docs": 540,
211
+ "effective_num_docs": 540,
212
+ "trust_dataset": null,
213
+ "must_remove_duplicate_docs": null
214
+ },
215
+ "custom|math_v2:prealgebra": {
216
+ "name": "math_v2:prealgebra",
217
+ "prompt_function": "math_prompt_fn",
218
+ "hf_repo": "lighteval/MATH",
219
+ "hf_subset": "prealgebra",
220
+ "metric": [
221
+ "quasi_exact_match_math"
222
+ ],
223
+ "hf_avail_splits": [
224
+ "train",
225
+ "test",
226
+ "validation"
227
+ ],
228
+ "evaluation_splits": [
229
+ "test"
230
+ ],
231
+ "few_shots_split": null,
232
+ "few_shots_select": null,
233
+ "generation_size": 2048,
234
+ "stop_sequence": null,
235
+ "output_regex": null,
236
+ "frozen": false,
237
+ "suite": [
238
+ "custom"
239
+ ],
240
+ "original_num_docs": 871,
241
+ "effective_num_docs": 871,
242
+ "trust_dataset": null,
243
+ "must_remove_duplicate_docs": null
244
+ },
245
+ "custom|math_v2:precalculus": {
246
+ "name": "math_v2:precalculus",
247
+ "prompt_function": "math_prompt_fn",
248
+ "hf_repo": "lighteval/MATH",
249
+ "hf_subset": "precalculus",
250
+ "metric": [
251
+ "quasi_exact_match_math"
252
+ ],
253
+ "hf_avail_splits": [
254
+ "train",
255
+ "test",
256
+ "validation"
257
+ ],
258
+ "evaluation_splits": [
259
+ "test"
260
+ ],
261
+ "few_shots_split": null,
262
+ "few_shots_select": null,
263
+ "generation_size": 2048,
264
+ "stop_sequence": null,
265
+ "output_regex": null,
266
+ "frozen": false,
267
+ "suite": [
268
+ "custom"
269
+ ],
270
+ "original_num_docs": 546,
271
+ "effective_num_docs": 546,
272
+ "trust_dataset": null,
273
+ "must_remove_duplicate_docs": null
274
+ }
275
+ },
276
+ "summary_tasks": {
277
+ "custom|math_v2:algebra|0": {
278
+ "hashes": {
279
+ "hash_examples": "6ec951c5aa417d2a",
280
+ "hash_full_prompts": "4e0b4b752e408da7",
281
+ "hash_input_tokens": "40949aee162d5a25",
282
+ "hash_cont_tokens": "29a9ade9e1d83253"
283
+ },
284
+ "truncated": 1187,
285
+ "non_truncated": 0,
286
+ "padded": 186,
287
+ "non_padded": 1001,
288
+ "effective_few_shots": 0.0,
289
+ "num_truncated_few_shots": 0
290
+ },
291
+ "custom|math_v2:counting_and_probability|0": {
292
+ "hashes": {
293
+ "hash_examples": "cd34cb03dc09e1ad",
294
+ "hash_full_prompts": "9ebbc169a1089204",
295
+ "hash_input_tokens": "671cb933a0ed9e9f",
296
+ "hash_cont_tokens": "e5f65cf153a406ac"
297
+ },
298
+ "truncated": 474,
299
+ "non_truncated": 0,
300
+ "padded": 42,
301
+ "non_padded": 432,
302
+ "effective_few_shots": 0.0,
303
+ "num_truncated_few_shots": 0
304
+ },
305
+ "custom|math_v2:geometry|0": {
306
+ "hashes": {
307
+ "hash_examples": "e1011f83d0cb54d0",
308
+ "hash_full_prompts": "e6e4d9f63cdecf28",
309
+ "hash_input_tokens": "6ebe67f495eb0f04",
310
+ "hash_cont_tokens": "e0607eaa2ee9e7e0"
311
+ },
312
+ "truncated": 479,
313
+ "non_truncated": 0,
314
+ "padded": 129,
315
+ "non_padded": 350,
316
+ "effective_few_shots": 0.0,
317
+ "num_truncated_few_shots": 0
318
+ },
319
+ "custom|math_v2:intermediate_algebra|0": {
320
+ "hashes": {
321
+ "hash_examples": "aa72155be072b11c",
322
+ "hash_full_prompts": "38dad98a61e4f0ed",
323
+ "hash_input_tokens": "6a2208475c1766ea",
324
+ "hash_cont_tokens": "82c174ea7ce7c1fe"
325
+ },
326
+ "truncated": 900,
327
+ "non_truncated": 3,
328
+ "padded": 54,
329
+ "non_padded": 849,
330
+ "effective_few_shots": 0.0,
331
+ "num_truncated_few_shots": 0
332
+ },
333
+ "custom|math_v2:number_theory|0": {
334
+ "hashes": {
335
+ "hash_examples": "b8565befcdbe9247",
336
+ "hash_full_prompts": "8ca7a4c78d6e1280",
337
+ "hash_input_tokens": "40d9f5eaae183f33",
338
+ "hash_cont_tokens": "426355cc2a0d49dc"
339
+ },
340
+ "truncated": 540,
341
+ "non_truncated": 0,
342
+ "padded": 7,
343
+ "non_padded": 533,
344
+ "effective_few_shots": 0.0,
345
+ "num_truncated_few_shots": 0
346
+ },
347
+ "custom|math_v2:prealgebra|0": {
348
+ "hashes": {
349
+ "hash_examples": "e04d1527fe369f16",
350
+ "hash_full_prompts": "51e0c4e3d62158e4",
351
+ "hash_input_tokens": "c5bd83691493e671",
352
+ "hash_cont_tokens": "44fdef4732242a33"
353
+ },
354
+ "truncated": 871,
355
+ "non_truncated": 0,
356
+ "padded": 55,
357
+ "non_padded": 816,
358
+ "effective_few_shots": 0.0,
359
+ "num_truncated_few_shots": 0
360
+ },
361
+ "custom|math_v2:precalculus|0": {
362
+ "hashes": {
363
+ "hash_examples": "97606c134f223253",
364
+ "hash_full_prompts": "2438e95fc348976e",
365
+ "hash_input_tokens": "b860ff9d92ae1d85",
366
+ "hash_cont_tokens": "0ce410cf76391189"
367
+ },
368
+ "truncated": 546,
369
+ "non_truncated": 0,
370
+ "padded": 24,
371
+ "non_padded": 522,
372
+ "effective_few_shots": 0.0,
373
+ "num_truncated_few_shots": 0
374
+ }
375
+ },
376
+ "summary_general": {
377
+ "hashes": {
378
+ "hash_examples": "de5f0d623d1896c2",
379
+ "hash_full_prompts": "3cb45e93275657cb",
380
+ "hash_input_tokens": "e3bb985d4c1b1522",
381
+ "hash_cont_tokens": "391a0a352c5eff15"
382
+ },
383
+ "truncated": 4997,
384
+ "non_truncated": 3,
385
+ "padded": 497,
386
+ "non_padded": 4503,
387
+ "num_truncated_few_shots": 0
388
+ }
389
+ }