lewtun HF staff commited on
Commit
0b7bb81
·
verified ·
1 Parent(s): 8836bbc

Upload eval_results/AI-MO/mixtral-47b-sft/aimo_v00.01/math_v2/results_2024-04-29T14-03-18.023791.json with huggingface_hub

Browse files
eval_results/AI-MO/mixtral-47b-sft/aimo_v00.01/math_v2/results_2024-04-29T14-03-18.023791.json ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 190375.359157276,
9
+ "end_time": 276615.230493667,
10
+ "total_evaluation_time_secondes": "86239.87133639102",
11
+ "model_name": "AI-MO/mixtral-47b-sft",
12
+ "model_sha": "9f3ead81e0a9c2d74a04a83d24d9d8f6cb6d4826",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "87.49 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|math_v2:algebra|0": {
19
+ "qem": 0.4667228306655434,
20
+ "qem_stderr": 0.014486506776458814
21
+ },
22
+ "custom|math_v2:counting_and_probability|0": {
23
+ "qem": 0.27848101265822783,
24
+ "qem_stderr": 0.020610622389419315
25
+ },
26
+ "custom|math_v2:geometry|0": {
27
+ "qem": 0.2567849686847599,
28
+ "qem_stderr": 0.01998150025896483
29
+ },
30
+ "custom|math_v2:intermediate_algebra|0": {
31
+ "qem": 0.14174972314507198,
32
+ "qem_stderr": 0.011613545265293153
33
+ },
34
+ "custom|math_v2:number_theory|0": {
35
+ "qem": 0.20925925925925926,
36
+ "qem_stderr": 0.01752124719563077
37
+ },
38
+ "custom|math_v2:prealgebra|0": {
39
+ "qem": 0.5361653272101033,
40
+ "qem_stderr": 0.016907186430552387
41
+ },
42
+ "custom|math_v2:precalculus|0": {
43
+ "qem": 0.1794871794871795,
44
+ "qem_stderr": 0.016438460824782845
45
+ },
46
+ "custom|math_v2:_average|0": {
47
+ "qem": 0.2955214715871636,
48
+ "qem_stderr": 0.01679415273444316
49
+ },
50
+ "all": {
51
+ "qem": 0.2955214715871636,
52
+ "qem_stderr": 0.01679415273444316
53
+ }
54
+ },
55
+ "versions": {
56
+ "custom|math_v2:algebra|0": 0,
57
+ "custom|math_v2:counting_and_probability|0": 0,
58
+ "custom|math_v2:geometry|0": 0,
59
+ "custom|math_v2:intermediate_algebra|0": 0,
60
+ "custom|math_v2:number_theory|0": 0,
61
+ "custom|math_v2:prealgebra|0": 0,
62
+ "custom|math_v2:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "custom|math_v2:algebra": {
66
+ "name": "math_v2:algebra",
67
+ "prompt_function": "math_prompt_fn",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": null,
85
+ "output_regex": null,
86
+ "frozen": false,
87
+ "suite": [
88
+ "custom"
89
+ ],
90
+ "original_num_docs": 1187,
91
+ "effective_num_docs": 1187,
92
+ "trust_dataset": null,
93
+ "must_remove_duplicate_docs": null
94
+ },
95
+ "custom|math_v2:counting_and_probability": {
96
+ "name": "math_v2:counting_and_probability",
97
+ "prompt_function": "math_prompt_fn",
98
+ "hf_repo": "lighteval/MATH",
99
+ "hf_subset": "counting_and_probability",
100
+ "metric": [
101
+ "quasi_exact_match_math"
102
+ ],
103
+ "hf_avail_splits": [
104
+ "train",
105
+ "test",
106
+ "validation"
107
+ ],
108
+ "evaluation_splits": [
109
+ "test"
110
+ ],
111
+ "few_shots_split": null,
112
+ "few_shots_select": null,
113
+ "generation_size": 2048,
114
+ "stop_sequence": null,
115
+ "output_regex": null,
116
+ "frozen": false,
117
+ "suite": [
118
+ "custom"
119
+ ],
120
+ "original_num_docs": 474,
121
+ "effective_num_docs": 474,
122
+ "trust_dataset": null,
123
+ "must_remove_duplicate_docs": null
124
+ },
125
+ "custom|math_v2:geometry": {
126
+ "name": "math_v2:geometry",
127
+ "prompt_function": "math_prompt_fn",
128
+ "hf_repo": "lighteval/MATH",
129
+ "hf_subset": "geometry",
130
+ "metric": [
131
+ "quasi_exact_match_math"
132
+ ],
133
+ "hf_avail_splits": [
134
+ "train",
135
+ "test",
136
+ "validation"
137
+ ],
138
+ "evaluation_splits": [
139
+ "test"
140
+ ],
141
+ "few_shots_split": null,
142
+ "few_shots_select": null,
143
+ "generation_size": 2048,
144
+ "stop_sequence": null,
145
+ "output_regex": null,
146
+ "frozen": false,
147
+ "suite": [
148
+ "custom"
149
+ ],
150
+ "original_num_docs": 479,
151
+ "effective_num_docs": 479,
152
+ "trust_dataset": null,
153
+ "must_remove_duplicate_docs": null
154
+ },
155
+ "custom|math_v2:intermediate_algebra": {
156
+ "name": "math_v2:intermediate_algebra",
157
+ "prompt_function": "math_prompt_fn",
158
+ "hf_repo": "lighteval/MATH",
159
+ "hf_subset": "intermediate_algebra",
160
+ "metric": [
161
+ "quasi_exact_match_math"
162
+ ],
163
+ "hf_avail_splits": [
164
+ "train",
165
+ "test",
166
+ "validation"
167
+ ],
168
+ "evaluation_splits": [
169
+ "test"
170
+ ],
171
+ "few_shots_split": null,
172
+ "few_shots_select": null,
173
+ "generation_size": 2048,
174
+ "stop_sequence": null,
175
+ "output_regex": null,
176
+ "frozen": false,
177
+ "suite": [
178
+ "custom"
179
+ ],
180
+ "original_num_docs": 903,
181
+ "effective_num_docs": 903,
182
+ "trust_dataset": null,
183
+ "must_remove_duplicate_docs": null
184
+ },
185
+ "custom|math_v2:number_theory": {
186
+ "name": "math_v2:number_theory",
187
+ "prompt_function": "math_prompt_fn",
188
+ "hf_repo": "lighteval/MATH",
189
+ "hf_subset": "number_theory",
190
+ "metric": [
191
+ "quasi_exact_match_math"
192
+ ],
193
+ "hf_avail_splits": [
194
+ "train",
195
+ "test",
196
+ "validation"
197
+ ],
198
+ "evaluation_splits": [
199
+ "test"
200
+ ],
201
+ "few_shots_split": null,
202
+ "few_shots_select": null,
203
+ "generation_size": 2048,
204
+ "stop_sequence": null,
205
+ "output_regex": null,
206
+ "frozen": false,
207
+ "suite": [
208
+ "custom"
209
+ ],
210
+ "original_num_docs": 540,
211
+ "effective_num_docs": 540,
212
+ "trust_dataset": null,
213
+ "must_remove_duplicate_docs": null
214
+ },
215
+ "custom|math_v2:prealgebra": {
216
+ "name": "math_v2:prealgebra",
217
+ "prompt_function": "math_prompt_fn",
218
+ "hf_repo": "lighteval/MATH",
219
+ "hf_subset": "prealgebra",
220
+ "metric": [
221
+ "quasi_exact_match_math"
222
+ ],
223
+ "hf_avail_splits": [
224
+ "train",
225
+ "test",
226
+ "validation"
227
+ ],
228
+ "evaluation_splits": [
229
+ "test"
230
+ ],
231
+ "few_shots_split": null,
232
+ "few_shots_select": null,
233
+ "generation_size": 2048,
234
+ "stop_sequence": null,
235
+ "output_regex": null,
236
+ "frozen": false,
237
+ "suite": [
238
+ "custom"
239
+ ],
240
+ "original_num_docs": 871,
241
+ "effective_num_docs": 871,
242
+ "trust_dataset": null,
243
+ "must_remove_duplicate_docs": null
244
+ },
245
+ "custom|math_v2:precalculus": {
246
+ "name": "math_v2:precalculus",
247
+ "prompt_function": "math_prompt_fn",
248
+ "hf_repo": "lighteval/MATH",
249
+ "hf_subset": "precalculus",
250
+ "metric": [
251
+ "quasi_exact_match_math"
252
+ ],
253
+ "hf_avail_splits": [
254
+ "train",
255
+ "test",
256
+ "validation"
257
+ ],
258
+ "evaluation_splits": [
259
+ "test"
260
+ ],
261
+ "few_shots_split": null,
262
+ "few_shots_select": null,
263
+ "generation_size": 2048,
264
+ "stop_sequence": null,
265
+ "output_regex": null,
266
+ "frozen": false,
267
+ "suite": [
268
+ "custom"
269
+ ],
270
+ "original_num_docs": 546,
271
+ "effective_num_docs": 546,
272
+ "trust_dataset": null,
273
+ "must_remove_duplicate_docs": null
274
+ }
275
+ },
276
+ "summary_tasks": {
277
+ "custom|math_v2:algebra|0": {
278
+ "hashes": {
279
+ "hash_examples": "6ec951c5aa417d2a",
280
+ "hash_full_prompts": "4e0b4b752e408da7",
281
+ "hash_input_tokens": "5c933039b7f4c907",
282
+ "hash_cont_tokens": "2308663b30b9a53d"
283
+ },
284
+ "truncated": 1187,
285
+ "non_truncated": 0,
286
+ "padded": 186,
287
+ "non_padded": 1001,
288
+ "effective_few_shots": 0.0,
289
+ "num_truncated_few_shots": 0
290
+ },
291
+ "custom|math_v2:counting_and_probability|0": {
292
+ "hashes": {
293
+ "hash_examples": "cd34cb03dc09e1ad",
294
+ "hash_full_prompts": "9ebbc169a1089204",
295
+ "hash_input_tokens": "ff493ce6cceb8548",
296
+ "hash_cont_tokens": "3adf38220b16b399"
297
+ },
298
+ "truncated": 474,
299
+ "non_truncated": 0,
300
+ "padded": 42,
301
+ "non_padded": 432,
302
+ "effective_few_shots": 0.0,
303
+ "num_truncated_few_shots": 0
304
+ },
305
+ "custom|math_v2:geometry|0": {
306
+ "hashes": {
307
+ "hash_examples": "e1011f83d0cb54d0",
308
+ "hash_full_prompts": "e6e4d9f63cdecf28",
309
+ "hash_input_tokens": "f61c2560fd69fc44",
310
+ "hash_cont_tokens": "8baa9f6ebd25b83a"
311
+ },
312
+ "truncated": 479,
313
+ "non_truncated": 0,
314
+ "padded": 129,
315
+ "non_padded": 350,
316
+ "effective_few_shots": 0.0,
317
+ "num_truncated_few_shots": 0
318
+ },
319
+ "custom|math_v2:intermediate_algebra|0": {
320
+ "hashes": {
321
+ "hash_examples": "aa72155be072b11c",
322
+ "hash_full_prompts": "38dad98a61e4f0ed",
323
+ "hash_input_tokens": "024718b19ffe7955",
324
+ "hash_cont_tokens": "d0db1bccc9ecb8fb"
325
+ },
326
+ "truncated": 900,
327
+ "non_truncated": 3,
328
+ "padded": 54,
329
+ "non_padded": 849,
330
+ "effective_few_shots": 0.0,
331
+ "num_truncated_few_shots": 0
332
+ },
333
+ "custom|math_v2:number_theory|0": {
334
+ "hashes": {
335
+ "hash_examples": "b8565befcdbe9247",
336
+ "hash_full_prompts": "8ca7a4c78d6e1280",
337
+ "hash_input_tokens": "e7eb3cb8d452018e",
338
+ "hash_cont_tokens": "4b05ec61fa4c587b"
339
+ },
340
+ "truncated": 540,
341
+ "non_truncated": 0,
342
+ "padded": 7,
343
+ "non_padded": 533,
344
+ "effective_few_shots": 0.0,
345
+ "num_truncated_few_shots": 0
346
+ },
347
+ "custom|math_v2:prealgebra|0": {
348
+ "hashes": {
349
+ "hash_examples": "e04d1527fe369f16",
350
+ "hash_full_prompts": "51e0c4e3d62158e4",
351
+ "hash_input_tokens": "abf3673501aec1ab",
352
+ "hash_cont_tokens": "3a5c994ef68adac3"
353
+ },
354
+ "truncated": 871,
355
+ "non_truncated": 0,
356
+ "padded": 55,
357
+ "non_padded": 816,
358
+ "effective_few_shots": 0.0,
359
+ "num_truncated_few_shots": 0
360
+ },
361
+ "custom|math_v2:precalculus|0": {
362
+ "hashes": {
363
+ "hash_examples": "97606c134f223253",
364
+ "hash_full_prompts": "2438e95fc348976e",
365
+ "hash_input_tokens": "33a48df19e37782a",
366
+ "hash_cont_tokens": "256da5159c75776c"
367
+ },
368
+ "truncated": 546,
369
+ "non_truncated": 0,
370
+ "padded": 24,
371
+ "non_padded": 522,
372
+ "effective_few_shots": 0.0,
373
+ "num_truncated_few_shots": 0
374
+ }
375
+ },
376
+ "summary_general": {
377
+ "hashes": {
378
+ "hash_examples": "de5f0d623d1896c2",
379
+ "hash_full_prompts": "3cb45e93275657cb",
380
+ "hash_input_tokens": "6a387a67e9deefce",
381
+ "hash_cont_tokens": "99867db5ad909ba9"
382
+ },
383
+ "truncated": 4997,
384
+ "non_truncated": 3,
385
+ "padded": 497,
386
+ "non_padded": 4503,
387
+ "num_truncated_few_shots": 0
388
+ }
389
+ }