lewtun HF Staff commited on
Commit
1f642de
·
verified ·
1 Parent(s): 050ea81

Upload eval_results/AI-MO/starcoder2-15b-sft/aimo_v00.08/math_v2/results_2024-07-01T00-56-59.203970.json with huggingface_hub

Browse files
eval_results/AI-MO/starcoder2-15b-sft/aimo_v00.08/math_v2/results_2024-07-01T00-56-59.203970.json ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 871962.987885902,
9
+ "end_time": 890054.235331994,
10
+ "total_evaluation_time_secondes": "18091.247446091962",
11
+ "model_name": "AI-MO/starcoder2-15b-sft",
12
+ "model_sha": "5acfeb7362142e04e07190363226e22bbfd9f5fc",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "30.04 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|math_v2:algebra|0": {
19
+ "qem": 0.7253580454928391,
20
+ "qem_stderr": 0.01296036737671156
21
+ },
22
+ "custom|math_v2:counting_and_probability|0": {
23
+ "qem": 0.459915611814346,
24
+ "qem_stderr": 0.02291602658553354
25
+ },
26
+ "custom|math_v2:geometry|0": {
27
+ "qem": 0.42171189979123175,
28
+ "qem_stderr": 0.022587392962640666
29
+ },
30
+ "custom|math_v2:intermediate_algebra|0": {
31
+ "qem": 0.32447397563676633,
32
+ "qem_stderr": 0.01558862136798398
33
+ },
34
+ "custom|math_v2:number_theory|0": {
35
+ "qem": 0.4981481481481482,
36
+ "qem_stderr": 0.021536376899401997
37
+ },
38
+ "custom|math_v2:prealgebra|0": {
39
+ "qem": 0.6969001148105626,
40
+ "qem_stderr": 0.015581832361947974
41
+ },
42
+ "custom|math_v2:precalculus|0": {
43
+ "qem": 0.3553113553113553,
44
+ "qem_stderr": 0.020501295376310333
45
+ },
46
+ "custom|math_v2:_average|0": {
47
+ "qem": 0.4974027358578928,
48
+ "qem_stderr": 0.018810273275790008
49
+ },
50
+ "all": {
51
+ "qem": 0.4974027358578928,
52
+ "qem_stderr": 0.018810273275790008
53
+ }
54
+ },
55
+ "versions": {
56
+ "custom|math_v2:algebra|0": 0,
57
+ "custom|math_v2:counting_and_probability|0": 0,
58
+ "custom|math_v2:geometry|0": 0,
59
+ "custom|math_v2:intermediate_algebra|0": 0,
60
+ "custom|math_v2:number_theory|0": 0,
61
+ "custom|math_v2:prealgebra|0": 0,
62
+ "custom|math_v2:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "custom|math_v2:algebra": {
66
+ "name": "math_v2:algebra",
67
+ "prompt_function": "math_prompt_fn",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": null,
85
+ "output_regex": null,
86
+ "num_samples": null,
87
+ "frozen": false,
88
+ "suite": [
89
+ "custom"
90
+ ],
91
+ "original_num_docs": 1187,
92
+ "effective_num_docs": 1187,
93
+ "trust_dataset": null,
94
+ "must_remove_duplicate_docs": null,
95
+ "version": 0
96
+ },
97
+ "custom|math_v2:counting_and_probability": {
98
+ "name": "math_v2:counting_and_probability",
99
+ "prompt_function": "math_prompt_fn",
100
+ "hf_repo": "lighteval/MATH",
101
+ "hf_subset": "counting_and_probability",
102
+ "metric": [
103
+ "quasi_exact_match_math"
104
+ ],
105
+ "hf_avail_splits": [
106
+ "train",
107
+ "test",
108
+ "validation"
109
+ ],
110
+ "evaluation_splits": [
111
+ "test"
112
+ ],
113
+ "few_shots_split": null,
114
+ "few_shots_select": null,
115
+ "generation_size": 2048,
116
+ "stop_sequence": null,
117
+ "output_regex": null,
118
+ "num_samples": null,
119
+ "frozen": false,
120
+ "suite": [
121
+ "custom"
122
+ ],
123
+ "original_num_docs": 474,
124
+ "effective_num_docs": 474,
125
+ "trust_dataset": null,
126
+ "must_remove_duplicate_docs": null,
127
+ "version": 0
128
+ },
129
+ "custom|math_v2:geometry": {
130
+ "name": "math_v2:geometry",
131
+ "prompt_function": "math_prompt_fn",
132
+ "hf_repo": "lighteval/MATH",
133
+ "hf_subset": "geometry",
134
+ "metric": [
135
+ "quasi_exact_match_math"
136
+ ],
137
+ "hf_avail_splits": [
138
+ "train",
139
+ "test",
140
+ "validation"
141
+ ],
142
+ "evaluation_splits": [
143
+ "test"
144
+ ],
145
+ "few_shots_split": null,
146
+ "few_shots_select": null,
147
+ "generation_size": 2048,
148
+ "stop_sequence": null,
149
+ "output_regex": null,
150
+ "num_samples": null,
151
+ "frozen": false,
152
+ "suite": [
153
+ "custom"
154
+ ],
155
+ "original_num_docs": 479,
156
+ "effective_num_docs": 479,
157
+ "trust_dataset": null,
158
+ "must_remove_duplicate_docs": null,
159
+ "version": 0
160
+ },
161
+ "custom|math_v2:intermediate_algebra": {
162
+ "name": "math_v2:intermediate_algebra",
163
+ "prompt_function": "math_prompt_fn",
164
+ "hf_repo": "lighteval/MATH",
165
+ "hf_subset": "intermediate_algebra",
166
+ "metric": [
167
+ "quasi_exact_match_math"
168
+ ],
169
+ "hf_avail_splits": [
170
+ "train",
171
+ "test",
172
+ "validation"
173
+ ],
174
+ "evaluation_splits": [
175
+ "test"
176
+ ],
177
+ "few_shots_split": null,
178
+ "few_shots_select": null,
179
+ "generation_size": 2048,
180
+ "stop_sequence": null,
181
+ "output_regex": null,
182
+ "num_samples": null,
183
+ "frozen": false,
184
+ "suite": [
185
+ "custom"
186
+ ],
187
+ "original_num_docs": 903,
188
+ "effective_num_docs": 903,
189
+ "trust_dataset": null,
190
+ "must_remove_duplicate_docs": null,
191
+ "version": 0
192
+ },
193
+ "custom|math_v2:number_theory": {
194
+ "name": "math_v2:number_theory",
195
+ "prompt_function": "math_prompt_fn",
196
+ "hf_repo": "lighteval/MATH",
197
+ "hf_subset": "number_theory",
198
+ "metric": [
199
+ "quasi_exact_match_math"
200
+ ],
201
+ "hf_avail_splits": [
202
+ "train",
203
+ "test",
204
+ "validation"
205
+ ],
206
+ "evaluation_splits": [
207
+ "test"
208
+ ],
209
+ "few_shots_split": null,
210
+ "few_shots_select": null,
211
+ "generation_size": 2048,
212
+ "stop_sequence": null,
213
+ "output_regex": null,
214
+ "num_samples": null,
215
+ "frozen": false,
216
+ "suite": [
217
+ "custom"
218
+ ],
219
+ "original_num_docs": 540,
220
+ "effective_num_docs": 540,
221
+ "trust_dataset": null,
222
+ "must_remove_duplicate_docs": null,
223
+ "version": 0
224
+ },
225
+ "custom|math_v2:prealgebra": {
226
+ "name": "math_v2:prealgebra",
227
+ "prompt_function": "math_prompt_fn",
228
+ "hf_repo": "lighteval/MATH",
229
+ "hf_subset": "prealgebra",
230
+ "metric": [
231
+ "quasi_exact_match_math"
232
+ ],
233
+ "hf_avail_splits": [
234
+ "train",
235
+ "test",
236
+ "validation"
237
+ ],
238
+ "evaluation_splits": [
239
+ "test"
240
+ ],
241
+ "few_shots_split": null,
242
+ "few_shots_select": null,
243
+ "generation_size": 2048,
244
+ "stop_sequence": null,
245
+ "output_regex": null,
246
+ "num_samples": null,
247
+ "frozen": false,
248
+ "suite": [
249
+ "custom"
250
+ ],
251
+ "original_num_docs": 871,
252
+ "effective_num_docs": 871,
253
+ "trust_dataset": null,
254
+ "must_remove_duplicate_docs": null,
255
+ "version": 0
256
+ },
257
+ "custom|math_v2:precalculus": {
258
+ "name": "math_v2:precalculus",
259
+ "prompt_function": "math_prompt_fn",
260
+ "hf_repo": "lighteval/MATH",
261
+ "hf_subset": "precalculus",
262
+ "metric": [
263
+ "quasi_exact_match_math"
264
+ ],
265
+ "hf_avail_splits": [
266
+ "train",
267
+ "test",
268
+ "validation"
269
+ ],
270
+ "evaluation_splits": [
271
+ "test"
272
+ ],
273
+ "few_shots_split": null,
274
+ "few_shots_select": null,
275
+ "generation_size": 2048,
276
+ "stop_sequence": null,
277
+ "output_regex": null,
278
+ "num_samples": null,
279
+ "frozen": false,
280
+ "suite": [
281
+ "custom"
282
+ ],
283
+ "original_num_docs": 546,
284
+ "effective_num_docs": 546,
285
+ "trust_dataset": null,
286
+ "must_remove_duplicate_docs": null,
287
+ "version": 0
288
+ }
289
+ },
290
+ "summary_tasks": {
291
+ "custom|math_v2:algebra|0": {
292
+ "hashes": {
293
+ "hash_examples": "6ec951c5aa417d2a",
294
+ "hash_full_prompts": "4e0b4b752e408da7",
295
+ "hash_input_tokens": "3950b78fa2a502ed",
296
+ "hash_cont_tokens": "a4b90b7c300f8710"
297
+ },
298
+ "truncated": 1187,
299
+ "non_truncated": 0,
300
+ "padded": 167,
301
+ "non_padded": 1020,
302
+ "effective_few_shots": 0.0,
303
+ "num_truncated_few_shots": 0
304
+ },
305
+ "custom|math_v2:counting_and_probability|0": {
306
+ "hashes": {
307
+ "hash_examples": "cd34cb03dc09e1ad",
308
+ "hash_full_prompts": "9ebbc169a1089204",
309
+ "hash_input_tokens": "29e545f6a9a1aa13",
310
+ "hash_cont_tokens": "26b8180c1708d028"
311
+ },
312
+ "truncated": 474,
313
+ "non_truncated": 0,
314
+ "padded": 47,
315
+ "non_padded": 427,
316
+ "effective_few_shots": 0.0,
317
+ "num_truncated_few_shots": 0
318
+ },
319
+ "custom|math_v2:geometry|0": {
320
+ "hashes": {
321
+ "hash_examples": "e1011f83d0cb54d0",
322
+ "hash_full_prompts": "e6e4d9f63cdecf28",
323
+ "hash_input_tokens": "7d0b745fcd81512b",
324
+ "hash_cont_tokens": "e85a8852200647d8"
325
+ },
326
+ "truncated": 479,
327
+ "non_truncated": 0,
328
+ "padded": 113,
329
+ "non_padded": 366,
330
+ "effective_few_shots": 0.0,
331
+ "num_truncated_few_shots": 0
332
+ },
333
+ "custom|math_v2:intermediate_algebra|0": {
334
+ "hashes": {
335
+ "hash_examples": "aa72155be072b11c",
336
+ "hash_full_prompts": "38dad98a61e4f0ed",
337
+ "hash_input_tokens": "8b3a81aade71603a",
338
+ "hash_cont_tokens": "24a48066df1c4118"
339
+ },
340
+ "truncated": 903,
341
+ "non_truncated": 0,
342
+ "padded": 51,
343
+ "non_padded": 852,
344
+ "effective_few_shots": 0.0,
345
+ "num_truncated_few_shots": 0
346
+ },
347
+ "custom|math_v2:number_theory|0": {
348
+ "hashes": {
349
+ "hash_examples": "b8565befcdbe9247",
350
+ "hash_full_prompts": "8ca7a4c78d6e1280",
351
+ "hash_input_tokens": "63dcb48647f40c9a",
352
+ "hash_cont_tokens": "6dffe03bbe313c59"
353
+ },
354
+ "truncated": 540,
355
+ "non_truncated": 0,
356
+ "padded": 8,
357
+ "non_padded": 532,
358
+ "effective_few_shots": 0.0,
359
+ "num_truncated_few_shots": 0
360
+ },
361
+ "custom|math_v2:prealgebra|0": {
362
+ "hashes": {
363
+ "hash_examples": "e04d1527fe369f16",
364
+ "hash_full_prompts": "51e0c4e3d62158e4",
365
+ "hash_input_tokens": "6bbd4b605da14705",
366
+ "hash_cont_tokens": "08f93a80269dd0f5"
367
+ },
368
+ "truncated": 870,
369
+ "non_truncated": 1,
370
+ "padded": 50,
371
+ "non_padded": 821,
372
+ "effective_few_shots": 0.0,
373
+ "num_truncated_few_shots": 0
374
+ },
375
+ "custom|math_v2:precalculus|0": {
376
+ "hashes": {
377
+ "hash_examples": "97606c134f223253",
378
+ "hash_full_prompts": "2438e95fc348976e",
379
+ "hash_input_tokens": "05dc3cf37ca85967",
380
+ "hash_cont_tokens": "88322cec0640e382"
381
+ },
382
+ "truncated": 544,
383
+ "non_truncated": 2,
384
+ "padded": 20,
385
+ "non_padded": 526,
386
+ "effective_few_shots": 0.0,
387
+ "num_truncated_few_shots": 0
388
+ }
389
+ },
390
+ "summary_general": {
391
+ "hashes": {
392
+ "hash_examples": "de5f0d623d1896c2",
393
+ "hash_full_prompts": "3cb45e93275657cb",
394
+ "hash_input_tokens": "35e2acb80171b35e",
395
+ "hash_cont_tokens": "1dc343f2f60d0fbe"
396
+ },
397
+ "truncated": 4997,
398
+ "non_truncated": 3,
399
+ "padded": 456,
400
+ "non_padded": 4544,
401
+ "num_truncated_few_shots": 0
402
+ }
403
+ }