lewtun HF staff commited on
Commit
72a8df2
·
verified ·
1 Parent(s): 1937890

Upload eval_results/AI-MO/deepseek-math-7b-sft/aimo_v03.01/math/results_2024-04-26T04-51-02.678286.json with huggingface_hub

Browse files
eval_results/AI-MO/deepseek-math-7b-sft/aimo_v03.01/math/results_2024-04-26T04-51-02.678286.json ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 2,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 4213395.073455049,
9
+ "end_time": 4222017.114041344,
10
+ "total_evaluation_time_secondes": "8622.040586295538",
11
+ "model_name": "AI-MO/deepseek-math-7b-sft",
12
+ "model_sha": "dd01bf2509b1dede9c1f2afef533c04f5657c530",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.93 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|math:algebra|0": {
19
+ "qem": 0.6773378264532435,
20
+ "qem_stderr": 0.013574828385302717
21
+ },
22
+ "lighteval|math:counting_and_probability|0": {
23
+ "qem": 0.36075949367088606,
24
+ "qem_stderr": 0.022080579003443235
25
+ },
26
+ "lighteval|math:geometry|0": {
27
+ "qem": 0.348643006263048,
28
+ "qem_stderr": 0.02179646532161935
29
+ },
30
+ "lighteval|math:intermediate_algebra|0": {
31
+ "qem": 0.2469545957918051,
32
+ "qem_stderr": 0.014358724698059553
33
+ },
34
+ "lighteval|math:number_theory|0": {
35
+ "qem": 0.37962962962962965,
36
+ "qem_stderr": 0.020903123748455175
37
+ },
38
+ "lighteval|math:prealgebra|0": {
39
+ "qem": 0.6463834672789897,
40
+ "qem_stderr": 0.016208833993652156
41
+ },
42
+ "lighteval|math:precalculus|0": {
43
+ "qem": 0.29853479853479853,
44
+ "qem_stderr": 0.0196020862304134
45
+ },
46
+ "lighteval|math:_average|0": {
47
+ "qem": 0.42260611680320004,
48
+ "qem_stderr": 0.0183606630544208
49
+ },
50
+ "all": {
51
+ "qem": 0.42260611680320004,
52
+ "qem_stderr": 0.0183606630544208
53
+ }
54
+ },
55
+ "versions": {
56
+ "lighteval|math:algebra|0": 0,
57
+ "lighteval|math:counting_and_probability|0": 0,
58
+ "lighteval|math:geometry|0": 0,
59
+ "lighteval|math:intermediate_algebra|0": 0,
60
+ "lighteval|math:number_theory|0": 0,
61
+ "lighteval|math:prealgebra|0": 0,
62
+ "lighteval|math:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "lighteval|math:algebra": {
66
+ "name": "math:algebra",
67
+ "prompt_function": "math",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": [
85
+ "\n"
86
+ ],
87
+ "output_regex": null,
88
+ "frozen": false,
89
+ "suite": [
90
+ "lighteval",
91
+ "math"
92
+ ],
93
+ "original_num_docs": 1187,
94
+ "effective_num_docs": 1187,
95
+ "trust_dataset": true,
96
+ "must_remove_duplicate_docs": null
97
+ },
98
+ "lighteval|math:counting_and_probability": {
99
+ "name": "math:counting_and_probability",
100
+ "prompt_function": "math",
101
+ "hf_repo": "lighteval/MATH",
102
+ "hf_subset": "counting_and_probability",
103
+ "metric": [
104
+ "quasi_exact_match_math"
105
+ ],
106
+ "hf_avail_splits": [
107
+ "train",
108
+ "test",
109
+ "validation"
110
+ ],
111
+ "evaluation_splits": [
112
+ "test"
113
+ ],
114
+ "few_shots_split": null,
115
+ "few_shots_select": null,
116
+ "generation_size": 2048,
117
+ "stop_sequence": [
118
+ "\n"
119
+ ],
120
+ "output_regex": null,
121
+ "frozen": false,
122
+ "suite": [
123
+ "lighteval",
124
+ "math"
125
+ ],
126
+ "original_num_docs": 474,
127
+ "effective_num_docs": 474,
128
+ "trust_dataset": true,
129
+ "must_remove_duplicate_docs": null
130
+ },
131
+ "lighteval|math:geometry": {
132
+ "name": "math:geometry",
133
+ "prompt_function": "math",
134
+ "hf_repo": "lighteval/MATH",
135
+ "hf_subset": "geometry",
136
+ "metric": [
137
+ "quasi_exact_match_math"
138
+ ],
139
+ "hf_avail_splits": [
140
+ "train",
141
+ "test",
142
+ "validation"
143
+ ],
144
+ "evaluation_splits": [
145
+ "test"
146
+ ],
147
+ "few_shots_split": null,
148
+ "few_shots_select": null,
149
+ "generation_size": 2048,
150
+ "stop_sequence": [
151
+ "\n"
152
+ ],
153
+ "output_regex": null,
154
+ "frozen": false,
155
+ "suite": [
156
+ "lighteval",
157
+ "math"
158
+ ],
159
+ "original_num_docs": 479,
160
+ "effective_num_docs": 479,
161
+ "trust_dataset": true,
162
+ "must_remove_duplicate_docs": null
163
+ },
164
+ "lighteval|math:intermediate_algebra": {
165
+ "name": "math:intermediate_algebra",
166
+ "prompt_function": "math",
167
+ "hf_repo": "lighteval/MATH",
168
+ "hf_subset": "intermediate_algebra",
169
+ "metric": [
170
+ "quasi_exact_match_math"
171
+ ],
172
+ "hf_avail_splits": [
173
+ "train",
174
+ "test",
175
+ "validation"
176
+ ],
177
+ "evaluation_splits": [
178
+ "test"
179
+ ],
180
+ "few_shots_split": null,
181
+ "few_shots_select": null,
182
+ "generation_size": 2048,
183
+ "stop_sequence": [
184
+ "\n"
185
+ ],
186
+ "output_regex": null,
187
+ "frozen": false,
188
+ "suite": [
189
+ "lighteval",
190
+ "math"
191
+ ],
192
+ "original_num_docs": 903,
193
+ "effective_num_docs": 903,
194
+ "trust_dataset": true,
195
+ "must_remove_duplicate_docs": null
196
+ },
197
+ "lighteval|math:number_theory": {
198
+ "name": "math:number_theory",
199
+ "prompt_function": "math",
200
+ "hf_repo": "lighteval/MATH",
201
+ "hf_subset": "number_theory",
202
+ "metric": [
203
+ "quasi_exact_match_math"
204
+ ],
205
+ "hf_avail_splits": [
206
+ "train",
207
+ "test",
208
+ "validation"
209
+ ],
210
+ "evaluation_splits": [
211
+ "test"
212
+ ],
213
+ "few_shots_split": null,
214
+ "few_shots_select": null,
215
+ "generation_size": 2048,
216
+ "stop_sequence": [
217
+ "\n"
218
+ ],
219
+ "output_regex": null,
220
+ "frozen": false,
221
+ "suite": [
222
+ "lighteval",
223
+ "math"
224
+ ],
225
+ "original_num_docs": 540,
226
+ "effective_num_docs": 540,
227
+ "trust_dataset": true,
228
+ "must_remove_duplicate_docs": null
229
+ },
230
+ "lighteval|math:prealgebra": {
231
+ "name": "math:prealgebra",
232
+ "prompt_function": "math",
233
+ "hf_repo": "lighteval/MATH",
234
+ "hf_subset": "prealgebra",
235
+ "metric": [
236
+ "quasi_exact_match_math"
237
+ ],
238
+ "hf_avail_splits": [
239
+ "train",
240
+ "test",
241
+ "validation"
242
+ ],
243
+ "evaluation_splits": [
244
+ "test"
245
+ ],
246
+ "few_shots_split": null,
247
+ "few_shots_select": null,
248
+ "generation_size": 2048,
249
+ "stop_sequence": [
250
+ "\n"
251
+ ],
252
+ "output_regex": null,
253
+ "frozen": false,
254
+ "suite": [
255
+ "lighteval",
256
+ "math"
257
+ ],
258
+ "original_num_docs": 871,
259
+ "effective_num_docs": 871,
260
+ "trust_dataset": true,
261
+ "must_remove_duplicate_docs": null
262
+ },
263
+ "lighteval|math:precalculus": {
264
+ "name": "math:precalculus",
265
+ "prompt_function": "math",
266
+ "hf_repo": "lighteval/MATH",
267
+ "hf_subset": "precalculus",
268
+ "metric": [
269
+ "quasi_exact_match_math"
270
+ ],
271
+ "hf_avail_splits": [
272
+ "train",
273
+ "test",
274
+ "validation"
275
+ ],
276
+ "evaluation_splits": [
277
+ "test"
278
+ ],
279
+ "few_shots_split": null,
280
+ "few_shots_select": null,
281
+ "generation_size": 2048,
282
+ "stop_sequence": [
283
+ "\n"
284
+ ],
285
+ "output_regex": null,
286
+ "frozen": false,
287
+ "suite": [
288
+ "lighteval",
289
+ "math"
290
+ ],
291
+ "original_num_docs": 546,
292
+ "effective_num_docs": 546,
293
+ "trust_dataset": true,
294
+ "must_remove_duplicate_docs": null
295
+ }
296
+ },
297
+ "summary_tasks": {
298
+ "lighteval|math:algebra|0": {
299
+ "hashes": {
300
+ "hash_examples": "37a2fd2f076d2e49",
301
+ "hash_full_prompts": "461f2ab197cdfd01",
302
+ "hash_input_tokens": "bd3e731d75bc03e6",
303
+ "hash_cont_tokens": "8b12597582e2af8a"
304
+ },
305
+ "truncated": 1187,
306
+ "non_truncated": 0,
307
+ "padded": 63,
308
+ "non_padded": 1124,
309
+ "effective_few_shots": 0.0,
310
+ "num_truncated_few_shots": 0
311
+ },
312
+ "lighteval|math:counting_and_probability|0": {
313
+ "hashes": {
314
+ "hash_examples": "97b4892e28bc078b",
315
+ "hash_full_prompts": "e2cc9eb28ca159b1",
316
+ "hash_input_tokens": "55728dc6558ad660",
317
+ "hash_cont_tokens": "ee728214399bbce1"
318
+ },
319
+ "truncated": 474,
320
+ "non_truncated": 0,
321
+ "padded": 17,
322
+ "non_padded": 457,
323
+ "effective_few_shots": 0.0,
324
+ "num_truncated_few_shots": 0
325
+ },
326
+ "lighteval|math:geometry|0": {
327
+ "hashes": {
328
+ "hash_examples": "9e9f0228b8b3d093",
329
+ "hash_full_prompts": "e581f42fefeed8c9",
330
+ "hash_input_tokens": "6615f77877bf8cef",
331
+ "hash_cont_tokens": "3632ce692fc1b61c"
332
+ },
333
+ "truncated": 479,
334
+ "non_truncated": 0,
335
+ "padded": 60,
336
+ "non_padded": 419,
337
+ "effective_few_shots": 0.0,
338
+ "num_truncated_few_shots": 0
339
+ },
340
+ "lighteval|math:intermediate_algebra|0": {
341
+ "hashes": {
342
+ "hash_examples": "cfe73a8e28ae94de",
343
+ "hash_full_prompts": "7557d30ba2cc15ef",
344
+ "hash_input_tokens": "608f04e5c215ffad",
345
+ "hash_cont_tokens": "a83dd8d0765c48b1"
346
+ },
347
+ "truncated": 903,
348
+ "non_truncated": 0,
349
+ "padded": 15,
350
+ "non_padded": 888,
351
+ "effective_few_shots": 0.0,
352
+ "num_truncated_few_shots": 0
353
+ },
354
+ "lighteval|math:number_theory|0": {
355
+ "hashes": {
356
+ "hash_examples": "4ee5237cf144afac",
357
+ "hash_full_prompts": "7b983a6fe1e3d0a8",
358
+ "hash_input_tokens": "0a9b44cdfde7644d",
359
+ "hash_cont_tokens": "c1a5a1b43fc8ffb6"
360
+ },
361
+ "truncated": 539,
362
+ "non_truncated": 1,
363
+ "padded": 1,
364
+ "non_padded": 539,
365
+ "effective_few_shots": 0.0,
366
+ "num_truncated_few_shots": 0
367
+ },
368
+ "lighteval|math:prealgebra|0": {
369
+ "hashes": {
370
+ "hash_examples": "3fb3afeb885f73d8",
371
+ "hash_full_prompts": "736a03165dbf8f49",
372
+ "hash_input_tokens": "434f9915c1b3fd21",
373
+ "hash_cont_tokens": "6cd2735b70a01c28"
374
+ },
375
+ "truncated": 871,
376
+ "non_truncated": 0,
377
+ "padded": 18,
378
+ "non_padded": 853,
379
+ "effective_few_shots": 0.0,
380
+ "num_truncated_few_shots": 0
381
+ },
382
+ "lighteval|math:precalculus|0": {
383
+ "hashes": {
384
+ "hash_examples": "753e25ab9ec4b46c",
385
+ "hash_full_prompts": "632ca5aa79f53b59",
386
+ "hash_input_tokens": "7808ce9bbfbc6f20",
387
+ "hash_cont_tokens": "29aee73acae487bc"
388
+ },
389
+ "truncated": 546,
390
+ "non_truncated": 0,
391
+ "padded": 4,
392
+ "non_padded": 542,
393
+ "effective_few_shots": 0.0,
394
+ "num_truncated_few_shots": 0
395
+ }
396
+ },
397
+ "summary_general": {
398
+ "hashes": {
399
+ "hash_examples": "b76099aa9092a203",
400
+ "hash_full_prompts": "2d2e105e1a0eeab9",
401
+ "hash_input_tokens": "ec60d48c4ec91205",
402
+ "hash_cont_tokens": "2362c0024ff536ef"
403
+ },
404
+ "truncated": 4999,
405
+ "non_truncated": 1,
406
+ "padded": 178,
407
+ "non_padded": 4822,
408
+ "num_truncated_few_shots": 0
409
+ }
410
+ }