lewtun HF Staff commited on
Commit
04f3d33
·
verified ·
1 Parent(s): a2d5010

Upload eval_results/meta-llama/Llama-3.2-3B-Instruct/main/math/results_2024-10-06T16-58-55.587787.json with huggingface_hub

Browse files
eval_results/meta-llama/Llama-3.2-3B-Instruct/main/math/results_2024-10-06T16-58-55.587787.json ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 468952.59722243,
9
+ "end_time": 507653.112541274,
10
+ "total_evaluation_time_secondes": "38700.515318844",
11
+ "model_name": "meta-llama/Llama-3.2-3B-Instruct",
12
+ "model_sha": "392a143b624368100f77a3eafaa4a2468ba50a72",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "5.98 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|math:algebra|0": {
19
+ "qem": 0.13647851727042964,
20
+ "qem_stderr": 0.009968421116534627,
21
+ "maj@4": 0.11541701769165964,
22
+ "maj@4_stderr": 0.009278160257995039
23
+ },
24
+ "lighteval|math:counting_and_probability|0": {
25
+ "qem": 0.0759493670886076,
26
+ "qem_stderr": 0.012180900441921277,
27
+ "maj@4": 0.07172995780590717,
28
+ "maj@4_stderr": 0.011864703365865354
29
+ },
30
+ "lighteval|math:geometry|0": {
31
+ "qem": 0.11691022964509394,
32
+ "qem_stderr": 0.014696527629722107,
33
+ "maj@4": 0.11482254697286012,
34
+ "maj@4_stderr": 0.014581923359739
35
+ },
36
+ "lighteval|math:intermediate_algebra|0": {
37
+ "qem": 0.09523809523809523,
38
+ "qem_stderr": 0.009773930235808413,
39
+ "maj@4": 0.09523809523809523,
40
+ "maj@4_stderr": 0.009773930235808413
41
+ },
42
+ "lighteval|math:number_theory|0": {
43
+ "qem": 0.1111111111111111,
44
+ "qem_stderr": 0.01353655341940388,
45
+ "maj@4": 0.09074074074074075,
46
+ "maj@4_stderr": 0.012372305741867382
47
+ },
48
+ "lighteval|math:prealgebra|0": {
49
+ "qem": 0.08266360505166476,
50
+ "qem_stderr": 0.00933602178756277,
51
+ "maj@4": 0.07692307692307693,
52
+ "maj@4_stderr": 0.009034157223303146
53
+ },
54
+ "lighteval|math:precalculus|0": {
55
+ "qem": 0.1446886446886447,
56
+ "qem_stderr": 0.015068884082729252,
57
+ "maj@4": 0.14652014652014653,
58
+ "maj@4_stderr": 0.01514771264919227
59
+ },
60
+ "lighteval|math:_average|0": {
61
+ "qem": 0.10900565287052101,
62
+ "qem_stderr": 0.012080176959097475,
63
+ "maj@4": 0.10162736884178378,
64
+ "maj@4_stderr": 0.011721841833395799
65
+ },
66
+ "all": {
67
+ "qem": 0.10900565287052101,
68
+ "qem_stderr": 0.012080176959097475,
69
+ "maj@4": 0.10162736884178378,
70
+ "maj@4_stderr": 0.011721841833395799
71
+ }
72
+ },
73
+ "versions": {
74
+ "lighteval|math:algebra|0": 1,
75
+ "lighteval|math:counting_and_probability|0": 1,
76
+ "lighteval|math:geometry|0": 1,
77
+ "lighteval|math:intermediate_algebra|0": 1,
78
+ "lighteval|math:number_theory|0": 1,
79
+ "lighteval|math:prealgebra|0": 1,
80
+ "lighteval|math:precalculus|0": 1
81
+ },
82
+ "config_tasks": {
83
+ "lighteval|math:algebra": {
84
+ "name": "math:algebra",
85
+ "prompt_function": "math",
86
+ "hf_repo": "lighteval/MATH",
87
+ "hf_subset": "algebra",
88
+ "metric": [
89
+ "quasi_exact_match_math",
90
+ "maj_at_4_math"
91
+ ],
92
+ "hf_avail_splits": [
93
+ "train",
94
+ "test",
95
+ "validation"
96
+ ],
97
+ "evaluation_splits": [
98
+ "test"
99
+ ],
100
+ "few_shots_split": null,
101
+ "few_shots_select": null,
102
+ "generation_size": 2048,
103
+ "stop_sequence": [
104
+ "\n"
105
+ ],
106
+ "output_regex": null,
107
+ "num_samples": null,
108
+ "frozen": false,
109
+ "suite": [
110
+ "lighteval",
111
+ "math"
112
+ ],
113
+ "original_num_docs": 1187,
114
+ "effective_num_docs": 1187,
115
+ "trust_dataset": true,
116
+ "must_remove_duplicate_docs": null,
117
+ "version": 1
118
+ },
119
+ "lighteval|math:counting_and_probability": {
120
+ "name": "math:counting_and_probability",
121
+ "prompt_function": "math",
122
+ "hf_repo": "lighteval/MATH",
123
+ "hf_subset": "counting_and_probability",
124
+ "metric": [
125
+ "quasi_exact_match_math",
126
+ "maj_at_4_math"
127
+ ],
128
+ "hf_avail_splits": [
129
+ "train",
130
+ "test",
131
+ "validation"
132
+ ],
133
+ "evaluation_splits": [
134
+ "test"
135
+ ],
136
+ "few_shots_split": null,
137
+ "few_shots_select": null,
138
+ "generation_size": 2048,
139
+ "stop_sequence": [
140
+ "\n"
141
+ ],
142
+ "output_regex": null,
143
+ "num_samples": null,
144
+ "frozen": false,
145
+ "suite": [
146
+ "lighteval",
147
+ "math"
148
+ ],
149
+ "original_num_docs": 474,
150
+ "effective_num_docs": 474,
151
+ "trust_dataset": true,
152
+ "must_remove_duplicate_docs": null,
153
+ "version": 1
154
+ },
155
+ "lighteval|math:geometry": {
156
+ "name": "math:geometry",
157
+ "prompt_function": "math",
158
+ "hf_repo": "lighteval/MATH",
159
+ "hf_subset": "geometry",
160
+ "metric": [
161
+ "quasi_exact_match_math",
162
+ "maj_at_4_math"
163
+ ],
164
+ "hf_avail_splits": [
165
+ "train",
166
+ "test",
167
+ "validation"
168
+ ],
169
+ "evaluation_splits": [
170
+ "test"
171
+ ],
172
+ "few_shots_split": null,
173
+ "few_shots_select": null,
174
+ "generation_size": 2048,
175
+ "stop_sequence": [
176
+ "\n"
177
+ ],
178
+ "output_regex": null,
179
+ "num_samples": null,
180
+ "frozen": false,
181
+ "suite": [
182
+ "lighteval",
183
+ "math"
184
+ ],
185
+ "original_num_docs": 479,
186
+ "effective_num_docs": 479,
187
+ "trust_dataset": true,
188
+ "must_remove_duplicate_docs": null,
189
+ "version": 1
190
+ },
191
+ "lighteval|math:intermediate_algebra": {
192
+ "name": "math:intermediate_algebra",
193
+ "prompt_function": "math",
194
+ "hf_repo": "lighteval/MATH",
195
+ "hf_subset": "intermediate_algebra",
196
+ "metric": [
197
+ "quasi_exact_match_math",
198
+ "maj_at_4_math"
199
+ ],
200
+ "hf_avail_splits": [
201
+ "train",
202
+ "test",
203
+ "validation"
204
+ ],
205
+ "evaluation_splits": [
206
+ "test"
207
+ ],
208
+ "few_shots_split": null,
209
+ "few_shots_select": null,
210
+ "generation_size": 2048,
211
+ "stop_sequence": [
212
+ "\n"
213
+ ],
214
+ "output_regex": null,
215
+ "num_samples": null,
216
+ "frozen": false,
217
+ "suite": [
218
+ "lighteval",
219
+ "math"
220
+ ],
221
+ "original_num_docs": 903,
222
+ "effective_num_docs": 903,
223
+ "trust_dataset": true,
224
+ "must_remove_duplicate_docs": null,
225
+ "version": 1
226
+ },
227
+ "lighteval|math:number_theory": {
228
+ "name": "math:number_theory",
229
+ "prompt_function": "math",
230
+ "hf_repo": "lighteval/MATH",
231
+ "hf_subset": "number_theory",
232
+ "metric": [
233
+ "quasi_exact_match_math",
234
+ "maj_at_4_math"
235
+ ],
236
+ "hf_avail_splits": [
237
+ "train",
238
+ "test",
239
+ "validation"
240
+ ],
241
+ "evaluation_splits": [
242
+ "test"
243
+ ],
244
+ "few_shots_split": null,
245
+ "few_shots_select": null,
246
+ "generation_size": 2048,
247
+ "stop_sequence": [
248
+ "\n"
249
+ ],
250
+ "output_regex": null,
251
+ "num_samples": null,
252
+ "frozen": false,
253
+ "suite": [
254
+ "lighteval",
255
+ "math"
256
+ ],
257
+ "original_num_docs": 540,
258
+ "effective_num_docs": 540,
259
+ "trust_dataset": true,
260
+ "must_remove_duplicate_docs": null,
261
+ "version": 1
262
+ },
263
+ "lighteval|math:prealgebra": {
264
+ "name": "math:prealgebra",
265
+ "prompt_function": "math",
266
+ "hf_repo": "lighteval/MATH",
267
+ "hf_subset": "prealgebra",
268
+ "metric": [
269
+ "quasi_exact_match_math",
270
+ "maj_at_4_math"
271
+ ],
272
+ "hf_avail_splits": [
273
+ "train",
274
+ "test",
275
+ "validation"
276
+ ],
277
+ "evaluation_splits": [
278
+ "test"
279
+ ],
280
+ "few_shots_split": null,
281
+ "few_shots_select": null,
282
+ "generation_size": 2048,
283
+ "stop_sequence": [
284
+ "\n"
285
+ ],
286
+ "output_regex": null,
287
+ "num_samples": null,
288
+ "frozen": false,
289
+ "suite": [
290
+ "lighteval",
291
+ "math"
292
+ ],
293
+ "original_num_docs": 871,
294
+ "effective_num_docs": 871,
295
+ "trust_dataset": true,
296
+ "must_remove_duplicate_docs": null,
297
+ "version": 1
298
+ },
299
+ "lighteval|math:precalculus": {
300
+ "name": "math:precalculus",
301
+ "prompt_function": "math",
302
+ "hf_repo": "lighteval/MATH",
303
+ "hf_subset": "precalculus",
304
+ "metric": [
305
+ "quasi_exact_match_math",
306
+ "maj_at_4_math"
307
+ ],
308
+ "hf_avail_splits": [
309
+ "train",
310
+ "test",
311
+ "validation"
312
+ ],
313
+ "evaluation_splits": [
314
+ "test"
315
+ ],
316
+ "few_shots_split": null,
317
+ "few_shots_select": null,
318
+ "generation_size": 2048,
319
+ "stop_sequence": [
320
+ "\n"
321
+ ],
322
+ "output_regex": null,
323
+ "num_samples": null,
324
+ "frozen": false,
325
+ "suite": [
326
+ "lighteval",
327
+ "math"
328
+ ],
329
+ "original_num_docs": 546,
330
+ "effective_num_docs": 546,
331
+ "trust_dataset": true,
332
+ "must_remove_duplicate_docs": null,
333
+ "version": 1
334
+ }
335
+ },
336
+ "summary_tasks": {
337
+ "lighteval|math:algebra|0": {
338
+ "hashes": {
339
+ "hash_examples": "37a2fd2f076d2e49",
340
+ "hash_full_prompts": "d9a5b0f2a8114d54",
341
+ "hash_input_tokens": "eaf49802bcba393d",
342
+ "hash_cont_tokens": "48574d8fc6c82a2b"
343
+ },
344
+ "truncated": 1187,
345
+ "non_truncated": 0,
346
+ "padded": 173,
347
+ "non_padded": 1014,
348
+ "effective_few_shots": 0.0,
349
+ "num_truncated_few_shots": 0
350
+ },
351
+ "lighteval|math:counting_and_probability|0": {
352
+ "hashes": {
353
+ "hash_examples": "97b4892e28bc078b",
354
+ "hash_full_prompts": "db0180839766d2fc",
355
+ "hash_input_tokens": "6c1ea8e80f2d3547",
356
+ "hash_cont_tokens": "bafac5edeae67b53"
357
+ },
358
+ "truncated": 474,
359
+ "non_truncated": 0,
360
+ "padded": 44,
361
+ "non_padded": 430,
362
+ "effective_few_shots": 0.0,
363
+ "num_truncated_few_shots": 0
364
+ },
365
+ "lighteval|math:geometry|0": {
366
+ "hashes": {
367
+ "hash_examples": "9e9f0228b8b3d093",
368
+ "hash_full_prompts": "9f1f8446cd4f819d",
369
+ "hash_input_tokens": "b4ef39515dc76bc7",
370
+ "hash_cont_tokens": "6d2c7491b49e1e29"
371
+ },
372
+ "truncated": 479,
373
+ "non_truncated": 0,
374
+ "padded": 109,
375
+ "non_padded": 370,
376
+ "effective_few_shots": 0.0,
377
+ "num_truncated_few_shots": 0
378
+ },
379
+ "lighteval|math:intermediate_algebra|0": {
380
+ "hashes": {
381
+ "hash_examples": "cfe73a8e28ae94de",
382
+ "hash_full_prompts": "d9e693a7c7e2f26c",
383
+ "hash_input_tokens": "4703312c9e092f68",
384
+ "hash_cont_tokens": "3ab75e92be2efdb4"
385
+ },
386
+ "truncated": 900,
387
+ "non_truncated": 3,
388
+ "padded": 50,
389
+ "non_padded": 853,
390
+ "effective_few_shots": 0.0,
391
+ "num_truncated_few_shots": 0
392
+ },
393
+ "lighteval|math:number_theory|0": {
394
+ "hashes": {
395
+ "hash_examples": "4ee5237cf144afac",
396
+ "hash_full_prompts": "f16e298dfa576420",
397
+ "hash_input_tokens": "7006511b5ea80ae6",
398
+ "hash_cont_tokens": "1469f6f5075b7f6a"
399
+ },
400
+ "truncated": 540,
401
+ "non_truncated": 0,
402
+ "padded": 5,
403
+ "non_padded": 535,
404
+ "effective_few_shots": 0.0,
405
+ "num_truncated_few_shots": 0
406
+ },
407
+ "lighteval|math:prealgebra|0": {
408
+ "hashes": {
409
+ "hash_examples": "3fb3afeb885f73d8",
410
+ "hash_full_prompts": "0042b8b6b9844f6e",
411
+ "hash_input_tokens": "15bd31882799759e",
412
+ "hash_cont_tokens": "086d00e684329152"
413
+ },
414
+ "truncated": 871,
415
+ "non_truncated": 0,
416
+ "padded": 39,
417
+ "non_padded": 832,
418
+ "effective_few_shots": 0.0,
419
+ "num_truncated_few_shots": 0
420
+ },
421
+ "lighteval|math:precalculus|0": {
422
+ "hashes": {
423
+ "hash_examples": "753e25ab9ec4b46c",
424
+ "hash_full_prompts": "be85bdd6e29e020f",
425
+ "hash_input_tokens": "632369d53714e8b3",
426
+ "hash_cont_tokens": "18b685d68cb708c6"
427
+ },
428
+ "truncated": 546,
429
+ "non_truncated": 0,
430
+ "padded": 18,
431
+ "non_padded": 528,
432
+ "effective_few_shots": 0.0,
433
+ "num_truncated_few_shots": 0
434
+ }
435
+ },
436
+ "summary_general": {
437
+ "hashes": {
438
+ "hash_examples": "b76099aa9092a203",
439
+ "hash_full_prompts": "2f90e54fa88614f5",
440
+ "hash_input_tokens": "17311b592ccff4a5",
441
+ "hash_cont_tokens": "0d1f0bbf21b83adc"
442
+ },
443
+ "truncated": 4997,
444
+ "non_truncated": 3,
445
+ "padded": 438,
446
+ "non_padded": 4562,
447
+ "num_truncated_few_shots": 0
448
+ }
449
+ }