lewtun HF staff commited on
Commit
5fcb3b1
·
verified ·
1 Parent(s): 3c7a7bf

Upload eval_results/AI-MO/mistral-7b-sft/aimo_v03.00/math/results_2024-04-21T22-00-32.902902.json with huggingface_hub

Browse files
eval_results/AI-MO/mistral-7b-sft/aimo_v03.00/math/results_2024-04-21T22-00-32.902902.json ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 882532.794319516,
9
+ "end_time": 891148.601742387,
10
+ "total_evaluation_time_secondes": "8615.807422871003",
11
+ "model_name": "AI-MO/mistral-7b-sft",
12
+ "model_sha": "159047b1ab76bbb7c9369ee71bfef1d441fc029e",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|math:algebra|0": {
19
+ "qem": 0.46166807076663857,
20
+ "qem_stderr": 0.01447596901495004
21
+ },
22
+ "lighteval|math:counting_and_probability|0": {
23
+ "qem": 0.18143459915611815,
24
+ "qem_stderr": 0.017719692309092615
25
+ },
26
+ "lighteval|math:geometry|0": {
27
+ "qem": 0.23382045929018788,
28
+ "qem_stderr": 0.019359430691791527
29
+ },
30
+ "lighteval|math:intermediate_algebra|0": {
31
+ "qem": 0.12624584717607973,
32
+ "qem_stderr": 0.011058593855296428
33
+ },
34
+ "lighteval|math:number_theory|0": {
35
+ "qem": 0.16111111111111112,
36
+ "qem_stderr": 0.015835091780678594
37
+ },
38
+ "lighteval|math:prealgebra|0": {
39
+ "qem": 0.46842709529276694,
40
+ "qem_stderr": 0.01691775751043896
41
+ },
42
+ "lighteval|math:precalculus|0": {
43
+ "qem": 0.1575091575091575,
44
+ "qem_stderr": 0.015604046923319667
45
+ },
46
+ "lighteval|math:_average|0": {
47
+ "qem": 0.25574519147172287,
48
+ "qem_stderr": 0.01585294029793826
49
+ },
50
+ "all": {
51
+ "qem": 0.25574519147172287,
52
+ "qem_stderr": 0.01585294029793826
53
+ }
54
+ },
55
+ "versions": {
56
+ "lighteval|math:algebra|0": 0,
57
+ "lighteval|math:counting_and_probability|0": 0,
58
+ "lighteval|math:geometry|0": 0,
59
+ "lighteval|math:intermediate_algebra|0": 0,
60
+ "lighteval|math:number_theory|0": 0,
61
+ "lighteval|math:prealgebra|0": 0,
62
+ "lighteval|math:precalculus|0": 0
63
+ },
64
+ "config_tasks": {
65
+ "lighteval|math:algebra": {
66
+ "name": "math:algebra",
67
+ "prompt_function": "math",
68
+ "hf_repo": "lighteval/MATH",
69
+ "hf_subset": "algebra",
70
+ "metric": [
71
+ "quasi_exact_match_math"
72
+ ],
73
+ "hf_avail_splits": [
74
+ "train",
75
+ "test",
76
+ "validation"
77
+ ],
78
+ "evaluation_splits": [
79
+ "test"
80
+ ],
81
+ "few_shots_split": null,
82
+ "few_shots_select": null,
83
+ "generation_size": 2048,
84
+ "stop_sequence": [
85
+ "\n"
86
+ ],
87
+ "output_regex": null,
88
+ "frozen": false,
89
+ "suite": [
90
+ "lighteval",
91
+ "math"
92
+ ],
93
+ "original_num_docs": 1187,
94
+ "effective_num_docs": 1187,
95
+ "trust_dataset": true,
96
+ "must_remove_duplicate_docs": null
97
+ },
98
+ "lighteval|math:counting_and_probability": {
99
+ "name": "math:counting_and_probability",
100
+ "prompt_function": "math",
101
+ "hf_repo": "lighteval/MATH",
102
+ "hf_subset": "counting_and_probability",
103
+ "metric": [
104
+ "quasi_exact_match_math"
105
+ ],
106
+ "hf_avail_splits": [
107
+ "train",
108
+ "test",
109
+ "validation"
110
+ ],
111
+ "evaluation_splits": [
112
+ "test"
113
+ ],
114
+ "few_shots_split": null,
115
+ "few_shots_select": null,
116
+ "generation_size": 2048,
117
+ "stop_sequence": [
118
+ "\n"
119
+ ],
120
+ "output_regex": null,
121
+ "frozen": false,
122
+ "suite": [
123
+ "lighteval",
124
+ "math"
125
+ ],
126
+ "original_num_docs": 474,
127
+ "effective_num_docs": 474,
128
+ "trust_dataset": true,
129
+ "must_remove_duplicate_docs": null
130
+ },
131
+ "lighteval|math:geometry": {
132
+ "name": "math:geometry",
133
+ "prompt_function": "math",
134
+ "hf_repo": "lighteval/MATH",
135
+ "hf_subset": "geometry",
136
+ "metric": [
137
+ "quasi_exact_match_math"
138
+ ],
139
+ "hf_avail_splits": [
140
+ "train",
141
+ "test",
142
+ "validation"
143
+ ],
144
+ "evaluation_splits": [
145
+ "test"
146
+ ],
147
+ "few_shots_split": null,
148
+ "few_shots_select": null,
149
+ "generation_size": 2048,
150
+ "stop_sequence": [
151
+ "\n"
152
+ ],
153
+ "output_regex": null,
154
+ "frozen": false,
155
+ "suite": [
156
+ "lighteval",
157
+ "math"
158
+ ],
159
+ "original_num_docs": 479,
160
+ "effective_num_docs": 479,
161
+ "trust_dataset": true,
162
+ "must_remove_duplicate_docs": null
163
+ },
164
+ "lighteval|math:intermediate_algebra": {
165
+ "name": "math:intermediate_algebra",
166
+ "prompt_function": "math",
167
+ "hf_repo": "lighteval/MATH",
168
+ "hf_subset": "intermediate_algebra",
169
+ "metric": [
170
+ "quasi_exact_match_math"
171
+ ],
172
+ "hf_avail_splits": [
173
+ "train",
174
+ "test",
175
+ "validation"
176
+ ],
177
+ "evaluation_splits": [
178
+ "test"
179
+ ],
180
+ "few_shots_split": null,
181
+ "few_shots_select": null,
182
+ "generation_size": 2048,
183
+ "stop_sequence": [
184
+ "\n"
185
+ ],
186
+ "output_regex": null,
187
+ "frozen": false,
188
+ "suite": [
189
+ "lighteval",
190
+ "math"
191
+ ],
192
+ "original_num_docs": 903,
193
+ "effective_num_docs": 903,
194
+ "trust_dataset": true,
195
+ "must_remove_duplicate_docs": null
196
+ },
197
+ "lighteval|math:number_theory": {
198
+ "name": "math:number_theory",
199
+ "prompt_function": "math",
200
+ "hf_repo": "lighteval/MATH",
201
+ "hf_subset": "number_theory",
202
+ "metric": [
203
+ "quasi_exact_match_math"
204
+ ],
205
+ "hf_avail_splits": [
206
+ "train",
207
+ "test",
208
+ "validation"
209
+ ],
210
+ "evaluation_splits": [
211
+ "test"
212
+ ],
213
+ "few_shots_split": null,
214
+ "few_shots_select": null,
215
+ "generation_size": 2048,
216
+ "stop_sequence": [
217
+ "\n"
218
+ ],
219
+ "output_regex": null,
220
+ "frozen": false,
221
+ "suite": [
222
+ "lighteval",
223
+ "math"
224
+ ],
225
+ "original_num_docs": 540,
226
+ "effective_num_docs": 540,
227
+ "trust_dataset": true,
228
+ "must_remove_duplicate_docs": null
229
+ },
230
+ "lighteval|math:prealgebra": {
231
+ "name": "math:prealgebra",
232
+ "prompt_function": "math",
233
+ "hf_repo": "lighteval/MATH",
234
+ "hf_subset": "prealgebra",
235
+ "metric": [
236
+ "quasi_exact_match_math"
237
+ ],
238
+ "hf_avail_splits": [
239
+ "train",
240
+ "test",
241
+ "validation"
242
+ ],
243
+ "evaluation_splits": [
244
+ "test"
245
+ ],
246
+ "few_shots_split": null,
247
+ "few_shots_select": null,
248
+ "generation_size": 2048,
249
+ "stop_sequence": [
250
+ "\n"
251
+ ],
252
+ "output_regex": null,
253
+ "frozen": false,
254
+ "suite": [
255
+ "lighteval",
256
+ "math"
257
+ ],
258
+ "original_num_docs": 871,
259
+ "effective_num_docs": 871,
260
+ "trust_dataset": true,
261
+ "must_remove_duplicate_docs": null
262
+ },
263
+ "lighteval|math:precalculus": {
264
+ "name": "math:precalculus",
265
+ "prompt_function": "math",
266
+ "hf_repo": "lighteval/MATH",
267
+ "hf_subset": "precalculus",
268
+ "metric": [
269
+ "quasi_exact_match_math"
270
+ ],
271
+ "hf_avail_splits": [
272
+ "train",
273
+ "test",
274
+ "validation"
275
+ ],
276
+ "evaluation_splits": [
277
+ "test"
278
+ ],
279
+ "few_shots_split": null,
280
+ "few_shots_select": null,
281
+ "generation_size": 2048,
282
+ "stop_sequence": [
283
+ "\n"
284
+ ],
285
+ "output_regex": null,
286
+ "frozen": false,
287
+ "suite": [
288
+ "lighteval",
289
+ "math"
290
+ ],
291
+ "original_num_docs": 546,
292
+ "effective_num_docs": 546,
293
+ "trust_dataset": true,
294
+ "must_remove_duplicate_docs": null
295
+ }
296
+ },
297
+ "summary_tasks": {
298
+ "lighteval|math:algebra|0": {
299
+ "hashes": {
300
+ "hash_examples": "37a2fd2f076d2e49",
301
+ "hash_full_prompts": "b594f95fc76837ae",
302
+ "hash_input_tokens": "268f292e08b20496",
303
+ "hash_cont_tokens": "5ff3b282e103a786"
304
+ },
305
+ "truncated": 1187,
306
+ "non_truncated": 0,
307
+ "padded": 186,
308
+ "non_padded": 1001,
309
+ "effective_few_shots": 0.0,
310
+ "num_truncated_few_shots": 0
311
+ },
312
+ "lighteval|math:counting_and_probability|0": {
313
+ "hashes": {
314
+ "hash_examples": "97b4892e28bc078b",
315
+ "hash_full_prompts": "7248a30b48b9a71a",
316
+ "hash_input_tokens": "a734e9fc478accef",
317
+ "hash_cont_tokens": "34e1403ab557ae3f"
318
+ },
319
+ "truncated": 474,
320
+ "non_truncated": 0,
321
+ "padded": 42,
322
+ "non_padded": 432,
323
+ "effective_few_shots": 0.0,
324
+ "num_truncated_few_shots": 0
325
+ },
326
+ "lighteval|math:geometry|0": {
327
+ "hashes": {
328
+ "hash_examples": "9e9f0228b8b3d093",
329
+ "hash_full_prompts": "b0ba9fa4265b2a7f",
330
+ "hash_input_tokens": "fd28b84db8b56f2b",
331
+ "hash_cont_tokens": "79a2cb59ecbb222f"
332
+ },
333
+ "truncated": 479,
334
+ "non_truncated": 0,
335
+ "padded": 129,
336
+ "non_padded": 350,
337
+ "effective_few_shots": 0.0,
338
+ "num_truncated_few_shots": 0
339
+ },
340
+ "lighteval|math:intermediate_algebra|0": {
341
+ "hashes": {
342
+ "hash_examples": "cfe73a8e28ae94de",
343
+ "hash_full_prompts": "0519b385dc4d18c5",
344
+ "hash_input_tokens": "ac07a8b5e8432e70",
345
+ "hash_cont_tokens": "b36ca367a0db8943"
346
+ },
347
+ "truncated": 900,
348
+ "non_truncated": 3,
349
+ "padded": 54,
350
+ "non_padded": 849,
351
+ "effective_few_shots": 0.0,
352
+ "num_truncated_few_shots": 0
353
+ },
354
+ "lighteval|math:number_theory|0": {
355
+ "hashes": {
356
+ "hash_examples": "4ee5237cf144afac",
357
+ "hash_full_prompts": "e34e2d9e3719c6b3",
358
+ "hash_input_tokens": "05c6e6efc4dd2f8e",
359
+ "hash_cont_tokens": "d07397c221f94650"
360
+ },
361
+ "truncated": 540,
362
+ "non_truncated": 0,
363
+ "padded": 7,
364
+ "non_padded": 533,
365
+ "effective_few_shots": 0.0,
366
+ "num_truncated_few_shots": 0
367
+ },
368
+ "lighteval|math:prealgebra|0": {
369
+ "hashes": {
370
+ "hash_examples": "3fb3afeb885f73d8",
371
+ "hash_full_prompts": "dd429103e5accb7a",
372
+ "hash_input_tokens": "a674b0bd864f5275",
373
+ "hash_cont_tokens": "fa2fff6b43f88d13"
374
+ },
375
+ "truncated": 871,
376
+ "non_truncated": 0,
377
+ "padded": 55,
378
+ "non_padded": 816,
379
+ "effective_few_shots": 0.0,
380
+ "num_truncated_few_shots": 0
381
+ },
382
+ "lighteval|math:precalculus|0": {
383
+ "hashes": {
384
+ "hash_examples": "753e25ab9ec4b46c",
385
+ "hash_full_prompts": "5437a265a758ad19",
386
+ "hash_input_tokens": "b75d4bd55f3f5325",
387
+ "hash_cont_tokens": "8e1e6abecde2e5f0"
388
+ },
389
+ "truncated": 546,
390
+ "non_truncated": 0,
391
+ "padded": 24,
392
+ "non_padded": 522,
393
+ "effective_few_shots": 0.0,
394
+ "num_truncated_few_shots": 0
395
+ }
396
+ },
397
+ "summary_general": {
398
+ "hashes": {
399
+ "hash_examples": "b76099aa9092a203",
400
+ "hash_full_prompts": "fc120bd09022ad3f",
401
+ "hash_input_tokens": "c7fcbcc2ee4328b7",
402
+ "hash_cont_tokens": "152aa86ad3998642"
403
+ },
404
+ "truncated": 4997,
405
+ "non_truncated": 3,
406
+ "padded": 497,
407
+ "non_padded": 4503,
408
+ "num_truncated_few_shots": 0
409
+ }
410
+ }