lewtun HF staff commited on
Commit
80d2acf
·
verified ·
1 Parent(s): 4e6391d

Upload eval_results/alvarobartt/mistral-7b-orpo-airoboros-pref-10k/main/agieval/results_2024-03-28T16-53-48.794914.json with huggingface_hub

Browse files
eval_results/alvarobartt/mistral-7b-orpo-airoboros-pref-10k/main/agieval/results_2024-03-28T16-53-48.794914.json ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1032809.666722011,
9
+ "end_time": 1033077.684305349,
10
+ "total_evaluation_time_secondes": "268.0175833379617",
11
+ "model_name": "alvarobartt/mistral-7b-orpo-airoboros-pref-10k",
12
+ "model_sha": "31df48fe52c5a16097ee33dab1601bfb55d33424",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.2559055118110236,
20
+ "acc_stderr": 0.027434282297895837,
21
+ "acc_norm": 0.2204724409448819,
22
+ "acc_norm_stderr": 0.026063493749639587
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.29523809523809524,
26
+ "acc_stderr": 0.03155253554505397,
27
+ "acc_norm": 0.3333333333333333,
28
+ "acc_norm_stderr": 0.03260773253630126
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.2028985507246377,
32
+ "acc_stderr": 0.028019647132494573,
33
+ "acc_norm": 0.2608695652173913,
34
+ "acc_norm_stderr": 0.0305941674719917
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.24390243902439024,
38
+ "acc_stderr": 0.027435567505267607,
39
+ "acc_norm": 0.22357723577235772,
40
+ "acc_norm_stderr": 0.02661827325520527
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.6928104575163399,
44
+ "acc_stderr": 0.026415601914389,
45
+ "acc_norm": 0.6699346405228758,
46
+ "acc_norm_stderr": 0.026925654653615693
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.2964824120603015,
50
+ "acc_stderr": 0.03245669931306982,
51
+ "acc_norm": 0.3165829145728643,
52
+ "acc_norm_stderr": 0.03305628600297041
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.3659574468085106,
56
+ "acc_stderr": 0.031489558297455304,
57
+ "acc_norm": 0.34893617021276596,
58
+ "acc_norm_stderr": 0.03115852213135779
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.2678062678062678,
62
+ "acc_stderr": 0.023669514493780273,
63
+ "acc_norm": 0.30484330484330485,
64
+ "acc_norm_stderr": 0.024606263101409006
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.35,
68
+ "acc_stderr": 0.0338114785302567,
69
+ "acc_norm": 0.34,
70
+ "acc_norm_stderr": 0.03358032446172574
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.3225806451612903,
74
+ "acc_stderr": 0.018335437421251717,
75
+ "acc_norm": 0.3563748079877112,
76
+ "acc_norm_stderr": 0.018785092461820002
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.3271889400921659,
80
+ "acc_stderr": 0.01840302389757353,
81
+ "acc_norm": 0.32872503840245776,
82
+ "acc_norm_stderr": 0.018425103591390705
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.17391304347826086,
86
+ "acc_stderr": 0.025047317386049713,
87
+ "acc_norm": 0.1826086956521739,
88
+ "acc_norm_stderr": 0.02553042195273417
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.3803921568627451,
92
+ "acc_stderr": 0.021518665715717256,
93
+ "acc_norm": 0.3411764705882353,
94
+ "acc_norm_stderr": 0.021014312949349193
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.45724907063197023,
98
+ "acc_stderr": 0.03043051529856916,
99
+ "acc_norm": 0.40148698884758366,
100
+ "acc_norm_stderr": 0.02994367764191132
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.6359223300970874,
104
+ "acc_stderr": 0.03360641055142777,
105
+ "acc_norm": 0.5728155339805825,
106
+ "acc_norm_stderr": 0.03454921537431907
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.4174757281553398,
110
+ "acc_stderr": 0.03444258173919335,
111
+ "acc_norm": 0.3737864077669903,
112
+ "acc_norm_stderr": 0.03379061271978838
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.35,
116
+ "acc_stderr": 0.03223061875589931,
117
+ "acc_norm": 0.2863636363636364,
118
+ "acc_norm_stderr": 0.0305474781396946
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.3550425350275545,
122
+ "acc_stderr": 0.02801761504678499,
123
+ "acc_norm": 0.3448168932358321,
124
+ "acc_norm_stderr": 0.028105684246777875
125
+ },
126
+ "all": {
127
+ "acc": 0.3550425350275545,
128
+ "acc_stderr": 0.02801761504678499,
129
+ "acc_norm": 0.3448168932358321,
130
+ "acc_norm_stderr": 0.028105684246777875
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "frozen": false,
174
+ "suite": [
175
+ "lighteval"
176
+ ],
177
+ "original_num_docs": 254,
178
+ "effective_num_docs": 254,
179
+ "trust_dataset": true,
180
+ "must_remove_duplicate_docs": null
181
+ },
182
+ "lighteval|agieval:gaokao-biology": {
183
+ "name": "agieval:gaokao-biology",
184
+ "prompt_function": "agieval",
185
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
186
+ "hf_subset": "default",
187
+ "metric": [
188
+ "loglikelihood_acc",
189
+ "loglikelihood_acc_norm_nospace"
190
+ ],
191
+ "hf_avail_splits": [
192
+ "test"
193
+ ],
194
+ "evaluation_splits": [
195
+ "test"
196
+ ],
197
+ "few_shots_split": null,
198
+ "few_shots_select": "random_sampling",
199
+ "generation_size": 1,
200
+ "stop_sequence": null,
201
+ "output_regex": null,
202
+ "frozen": false,
203
+ "suite": [
204
+ "lighteval"
205
+ ],
206
+ "original_num_docs": 210,
207
+ "effective_num_docs": 210,
208
+ "trust_dataset": true,
209
+ "must_remove_duplicate_docs": null
210
+ },
211
+ "lighteval|agieval:gaokao-chemistry": {
212
+ "name": "agieval:gaokao-chemistry",
213
+ "prompt_function": "agieval",
214
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
215
+ "hf_subset": "default",
216
+ "metric": [
217
+ "loglikelihood_acc",
218
+ "loglikelihood_acc_norm_nospace"
219
+ ],
220
+ "hf_avail_splits": [
221
+ "test"
222
+ ],
223
+ "evaluation_splits": [
224
+ "test"
225
+ ],
226
+ "few_shots_split": null,
227
+ "few_shots_select": "random_sampling",
228
+ "generation_size": 1,
229
+ "stop_sequence": null,
230
+ "output_regex": null,
231
+ "frozen": false,
232
+ "suite": [
233
+ "lighteval"
234
+ ],
235
+ "original_num_docs": 207,
236
+ "effective_num_docs": 207,
237
+ "trust_dataset": true,
238
+ "must_remove_duplicate_docs": null
239
+ },
240
+ "lighteval|agieval:gaokao-chinese": {
241
+ "name": "agieval:gaokao-chinese",
242
+ "prompt_function": "agieval",
243
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
244
+ "hf_subset": "default",
245
+ "metric": [
246
+ "loglikelihood_acc",
247
+ "loglikelihood_acc_norm_nospace"
248
+ ],
249
+ "hf_avail_splits": [
250
+ "test"
251
+ ],
252
+ "evaluation_splits": [
253
+ "test"
254
+ ],
255
+ "few_shots_split": null,
256
+ "few_shots_select": "random_sampling",
257
+ "generation_size": 1,
258
+ "stop_sequence": null,
259
+ "output_regex": null,
260
+ "frozen": false,
261
+ "suite": [
262
+ "lighteval"
263
+ ],
264
+ "original_num_docs": 246,
265
+ "effective_num_docs": 246,
266
+ "trust_dataset": true,
267
+ "must_remove_duplicate_docs": null
268
+ },
269
+ "lighteval|agieval:gaokao-english": {
270
+ "name": "agieval:gaokao-english",
271
+ "prompt_function": "agieval",
272
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
273
+ "hf_subset": "default",
274
+ "metric": [
275
+ "loglikelihood_acc",
276
+ "loglikelihood_acc_norm_nospace"
277
+ ],
278
+ "hf_avail_splits": [
279
+ "test"
280
+ ],
281
+ "evaluation_splits": [
282
+ "test"
283
+ ],
284
+ "few_shots_split": null,
285
+ "few_shots_select": "random_sampling",
286
+ "generation_size": 1,
287
+ "stop_sequence": null,
288
+ "output_regex": null,
289
+ "frozen": false,
290
+ "suite": [
291
+ "lighteval"
292
+ ],
293
+ "original_num_docs": 306,
294
+ "effective_num_docs": 306,
295
+ "trust_dataset": true,
296
+ "must_remove_duplicate_docs": null
297
+ },
298
+ "lighteval|agieval:gaokao-geography": {
299
+ "name": "agieval:gaokao-geography",
300
+ "prompt_function": "agieval",
301
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
302
+ "hf_subset": "default",
303
+ "metric": [
304
+ "loglikelihood_acc",
305
+ "loglikelihood_acc_norm_nospace"
306
+ ],
307
+ "hf_avail_splits": [
308
+ "test"
309
+ ],
310
+ "evaluation_splits": [
311
+ "test"
312
+ ],
313
+ "few_shots_split": null,
314
+ "few_shots_select": "random_sampling",
315
+ "generation_size": 1,
316
+ "stop_sequence": null,
317
+ "output_regex": null,
318
+ "frozen": false,
319
+ "suite": [
320
+ "lighteval"
321
+ ],
322
+ "original_num_docs": 199,
323
+ "effective_num_docs": 199,
324
+ "trust_dataset": true,
325
+ "must_remove_duplicate_docs": null
326
+ },
327
+ "lighteval|agieval:gaokao-history": {
328
+ "name": "agieval:gaokao-history",
329
+ "prompt_function": "agieval",
330
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
331
+ "hf_subset": "default",
332
+ "metric": [
333
+ "loglikelihood_acc",
334
+ "loglikelihood_acc_norm_nospace"
335
+ ],
336
+ "hf_avail_splits": [
337
+ "test"
338
+ ],
339
+ "evaluation_splits": [
340
+ "test"
341
+ ],
342
+ "few_shots_split": null,
343
+ "few_shots_select": "random_sampling",
344
+ "generation_size": 1,
345
+ "stop_sequence": null,
346
+ "output_regex": null,
347
+ "frozen": false,
348
+ "suite": [
349
+ "lighteval"
350
+ ],
351
+ "original_num_docs": 235,
352
+ "effective_num_docs": 235,
353
+ "trust_dataset": true,
354
+ "must_remove_duplicate_docs": null
355
+ },
356
+ "lighteval|agieval:gaokao-mathqa": {
357
+ "name": "agieval:gaokao-mathqa",
358
+ "prompt_function": "agieval",
359
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
360
+ "hf_subset": "default",
361
+ "metric": [
362
+ "loglikelihood_acc",
363
+ "loglikelihood_acc_norm_nospace"
364
+ ],
365
+ "hf_avail_splits": [
366
+ "test"
367
+ ],
368
+ "evaluation_splits": [
369
+ "test"
370
+ ],
371
+ "few_shots_split": null,
372
+ "few_shots_select": "random_sampling",
373
+ "generation_size": 1,
374
+ "stop_sequence": null,
375
+ "output_regex": null,
376
+ "frozen": false,
377
+ "suite": [
378
+ "lighteval"
379
+ ],
380
+ "original_num_docs": 351,
381
+ "effective_num_docs": 351,
382
+ "trust_dataset": true,
383
+ "must_remove_duplicate_docs": null
384
+ },
385
+ "lighteval|agieval:gaokao-physics": {
386
+ "name": "agieval:gaokao-physics",
387
+ "prompt_function": "agieval",
388
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
389
+ "hf_subset": "default",
390
+ "metric": [
391
+ "loglikelihood_acc",
392
+ "loglikelihood_acc_norm_nospace"
393
+ ],
394
+ "hf_avail_splits": [
395
+ "test"
396
+ ],
397
+ "evaluation_splits": [
398
+ "test"
399
+ ],
400
+ "few_shots_split": null,
401
+ "few_shots_select": "random_sampling",
402
+ "generation_size": 1,
403
+ "stop_sequence": null,
404
+ "output_regex": null,
405
+ "frozen": false,
406
+ "suite": [
407
+ "lighteval"
408
+ ],
409
+ "original_num_docs": 200,
410
+ "effective_num_docs": 200,
411
+ "trust_dataset": true,
412
+ "must_remove_duplicate_docs": null
413
+ },
414
+ "lighteval|agieval:logiqa-en": {
415
+ "name": "agieval:logiqa-en",
416
+ "prompt_function": "agieval",
417
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
418
+ "hf_subset": "default",
419
+ "metric": [
420
+ "loglikelihood_acc",
421
+ "loglikelihood_acc_norm_nospace"
422
+ ],
423
+ "hf_avail_splits": [
424
+ "test"
425
+ ],
426
+ "evaluation_splits": [
427
+ "test"
428
+ ],
429
+ "few_shots_split": null,
430
+ "few_shots_select": "random_sampling",
431
+ "generation_size": 1,
432
+ "stop_sequence": null,
433
+ "output_regex": null,
434
+ "frozen": false,
435
+ "suite": [
436
+ "lighteval"
437
+ ],
438
+ "original_num_docs": 651,
439
+ "effective_num_docs": 651,
440
+ "trust_dataset": true,
441
+ "must_remove_duplicate_docs": null
442
+ },
443
+ "lighteval|agieval:logiqa-zh": {
444
+ "name": "agieval:logiqa-zh",
445
+ "prompt_function": "agieval",
446
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
447
+ "hf_subset": "default",
448
+ "metric": [
449
+ "loglikelihood_acc",
450
+ "loglikelihood_acc_norm_nospace"
451
+ ],
452
+ "hf_avail_splits": [
453
+ "test"
454
+ ],
455
+ "evaluation_splits": [
456
+ "test"
457
+ ],
458
+ "few_shots_split": null,
459
+ "few_shots_select": "random_sampling",
460
+ "generation_size": 1,
461
+ "stop_sequence": null,
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 651,
468
+ "effective_num_docs": 651,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|agieval:lsat-ar": {
473
+ "name": "agieval:lsat-ar",
474
+ "prompt_function": "agieval",
475
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
476
+ "hf_subset": "default",
477
+ "metric": [
478
+ "loglikelihood_acc",
479
+ "loglikelihood_acc_norm_nospace"
480
+ ],
481
+ "hf_avail_splits": [
482
+ "test"
483
+ ],
484
+ "evaluation_splits": [
485
+ "test"
486
+ ],
487
+ "few_shots_split": null,
488
+ "few_shots_select": "random_sampling",
489
+ "generation_size": 1,
490
+ "stop_sequence": null,
491
+ "output_regex": null,
492
+ "frozen": false,
493
+ "suite": [
494
+ "lighteval"
495
+ ],
496
+ "original_num_docs": 230,
497
+ "effective_num_docs": 230,
498
+ "trust_dataset": true,
499
+ "must_remove_duplicate_docs": null
500
+ },
501
+ "lighteval|agieval:lsat-lr": {
502
+ "name": "agieval:lsat-lr",
503
+ "prompt_function": "agieval",
504
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
505
+ "hf_subset": "default",
506
+ "metric": [
507
+ "loglikelihood_acc",
508
+ "loglikelihood_acc_norm_nospace"
509
+ ],
510
+ "hf_avail_splits": [
511
+ "test"
512
+ ],
513
+ "evaluation_splits": [
514
+ "test"
515
+ ],
516
+ "few_shots_split": null,
517
+ "few_shots_select": "random_sampling",
518
+ "generation_size": 1,
519
+ "stop_sequence": null,
520
+ "output_regex": null,
521
+ "frozen": false,
522
+ "suite": [
523
+ "lighteval"
524
+ ],
525
+ "original_num_docs": 510,
526
+ "effective_num_docs": 510,
527
+ "trust_dataset": true,
528
+ "must_remove_duplicate_docs": null
529
+ },
530
+ "lighteval|agieval:lsat-rc": {
531
+ "name": "agieval:lsat-rc",
532
+ "prompt_function": "agieval",
533
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
534
+ "hf_subset": "default",
535
+ "metric": [
536
+ "loglikelihood_acc",
537
+ "loglikelihood_acc_norm_nospace"
538
+ ],
539
+ "hf_avail_splits": [
540
+ "test"
541
+ ],
542
+ "evaluation_splits": [
543
+ "test"
544
+ ],
545
+ "few_shots_split": null,
546
+ "few_shots_select": "random_sampling",
547
+ "generation_size": 1,
548
+ "stop_sequence": null,
549
+ "output_regex": null,
550
+ "frozen": false,
551
+ "suite": [
552
+ "lighteval"
553
+ ],
554
+ "original_num_docs": 269,
555
+ "effective_num_docs": 269,
556
+ "trust_dataset": true,
557
+ "must_remove_duplicate_docs": null
558
+ },
559
+ "lighteval|agieval:sat-en": {
560
+ "name": "agieval:sat-en",
561
+ "prompt_function": "agieval",
562
+ "hf_repo": "dmayhem93/agieval-sat-en",
563
+ "hf_subset": "default",
564
+ "metric": [
565
+ "loglikelihood_acc",
566
+ "loglikelihood_acc_norm_nospace"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": "random_sampling",
576
+ "generation_size": 1,
577
+ "stop_sequence": null,
578
+ "output_regex": null,
579
+ "frozen": false,
580
+ "suite": [
581
+ "lighteval"
582
+ ],
583
+ "original_num_docs": 206,
584
+ "effective_num_docs": 206,
585
+ "trust_dataset": true,
586
+ "must_remove_duplicate_docs": null
587
+ },
588
+ "lighteval|agieval:sat-en-without-passage": {
589
+ "name": "agieval:sat-en-without-passage",
590
+ "prompt_function": "agieval",
591
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
592
+ "hf_subset": "default",
593
+ "metric": [
594
+ "loglikelihood_acc",
595
+ "loglikelihood_acc_norm_nospace"
596
+ ],
597
+ "hf_avail_splits": [
598
+ "test"
599
+ ],
600
+ "evaluation_splits": [
601
+ "test"
602
+ ],
603
+ "few_shots_split": null,
604
+ "few_shots_select": "random_sampling",
605
+ "generation_size": 1,
606
+ "stop_sequence": null,
607
+ "output_regex": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null
616
+ },
617
+ "lighteval|agieval:sat-math": {
618
+ "name": "agieval:sat-math",
619
+ "prompt_function": "agieval",
620
+ "hf_repo": "dmayhem93/agieval-sat-math",
621
+ "hf_subset": "default",
622
+ "metric": [
623
+ "loglikelihood_acc",
624
+ "loglikelihood_acc_norm_nospace"
625
+ ],
626
+ "hf_avail_splits": [
627
+ "test"
628
+ ],
629
+ "evaluation_splits": [
630
+ "test"
631
+ ],
632
+ "few_shots_split": null,
633
+ "few_shots_select": "random_sampling",
634
+ "generation_size": 1,
635
+ "stop_sequence": null,
636
+ "output_regex": null,
637
+ "frozen": false,
638
+ "suite": [
639
+ "lighteval"
640
+ ],
641
+ "original_num_docs": 220,
642
+ "effective_num_docs": 220,
643
+ "trust_dataset": true,
644
+ "must_remove_duplicate_docs": null
645
+ }
646
+ },
647
+ "summary_tasks": {
648
+ "lighteval|agieval:aqua-rat|0": {
649
+ "hashes": {
650
+ "hash_examples": "f09607f69e5b7525",
651
+ "hash_full_prompts": "f0af1499da980246",
652
+ "hash_input_tokens": "0be64c6b62a0d093",
653
+ "hash_cont_tokens": "a12c4ac8996ba11d"
654
+ },
655
+ "truncated": 0,
656
+ "non_truncated": 254,
657
+ "padded": 1270,
658
+ "non_padded": 0,
659
+ "effective_few_shots": 0.0,
660
+ "num_truncated_few_shots": 0
661
+ },
662
+ "lighteval|agieval:gaokao-biology|0": {
663
+ "hashes": {
664
+ "hash_examples": "f262eaf4a72db963",
665
+ "hash_full_prompts": "4027de4b1cdd1c67",
666
+ "hash_input_tokens": "806946132c723d9d",
667
+ "hash_cont_tokens": "22b786cf7aa6d1a9"
668
+ },
669
+ "truncated": 0,
670
+ "non_truncated": 210,
671
+ "padded": 840,
672
+ "non_padded": 0,
673
+ "effective_few_shots": 0.0,
674
+ "num_truncated_few_shots": 0
675
+ },
676
+ "lighteval|agieval:gaokao-chemistry|0": {
677
+ "hashes": {
678
+ "hash_examples": "47f2e649f58d9da5",
679
+ "hash_full_prompts": "39de31ab927f9675",
680
+ "hash_input_tokens": "cd38dce96b719830",
681
+ "hash_cont_tokens": "318562bcb4103fc4"
682
+ },
683
+ "truncated": 0,
684
+ "non_truncated": 207,
685
+ "padded": 831,
686
+ "non_padded": 0,
687
+ "effective_few_shots": 0.0,
688
+ "num_truncated_few_shots": 0
689
+ },
690
+ "lighteval|agieval:gaokao-chinese|0": {
691
+ "hashes": {
692
+ "hash_examples": "1010b21fde4726ab",
693
+ "hash_full_prompts": "3b4313f1bd85fd2e",
694
+ "hash_input_tokens": "e06b3acf207c78c3",
695
+ "hash_cont_tokens": "7b177add04591cdb"
696
+ },
697
+ "truncated": 0,
698
+ "non_truncated": 246,
699
+ "padded": 982,
700
+ "non_padded": 2,
701
+ "effective_few_shots": 0.0,
702
+ "num_truncated_few_shots": 0
703
+ },
704
+ "lighteval|agieval:gaokao-english|0": {
705
+ "hashes": {
706
+ "hash_examples": "4864e492a350ae93",
707
+ "hash_full_prompts": "547863254a606496",
708
+ "hash_input_tokens": "e1f88c49c5393664",
709
+ "hash_cont_tokens": "c9ca0addab2a9327"
710
+ },
711
+ "truncated": 0,
712
+ "non_truncated": 306,
713
+ "padded": 1224,
714
+ "non_padded": 0,
715
+ "effective_few_shots": 0.0,
716
+ "num_truncated_few_shots": 0
717
+ },
718
+ "lighteval|agieval:gaokao-geography|0": {
719
+ "hashes": {
720
+ "hash_examples": "ec3a021e37650e7d",
721
+ "hash_full_prompts": "0a7cffbf555ab29e",
722
+ "hash_input_tokens": "5abdd157cc0c61db",
723
+ "hash_cont_tokens": "e1bc87e81807da78"
724
+ },
725
+ "truncated": 0,
726
+ "non_truncated": 199,
727
+ "padded": 796,
728
+ "non_padded": 0,
729
+ "effective_few_shots": 0.0,
730
+ "num_truncated_few_shots": 0
731
+ },
732
+ "lighteval|agieval:gaokao-history|0": {
733
+ "hashes": {
734
+ "hash_examples": "b3fad1596f1ae1f9",
735
+ "hash_full_prompts": "b8aca4146c3435af",
736
+ "hash_input_tokens": "435500ff3379011f",
737
+ "hash_cont_tokens": "b3c6c60f59b08db4"
738
+ },
739
+ "truncated": 0,
740
+ "non_truncated": 235,
741
+ "padded": 940,
742
+ "non_padded": 0,
743
+ "effective_few_shots": 0.0,
744
+ "num_truncated_few_shots": 0
745
+ },
746
+ "lighteval|agieval:gaokao-mathqa|0": {
747
+ "hashes": {
748
+ "hash_examples": "1d1088556861b0b0",
749
+ "hash_full_prompts": "1441e196c635c040",
750
+ "hash_input_tokens": "932aee9a7e354fda",
751
+ "hash_cont_tokens": "5d69ebf8391bf298"
752
+ },
753
+ "truncated": 0,
754
+ "non_truncated": 351,
755
+ "padded": 1404,
756
+ "non_padded": 0,
757
+ "effective_few_shots": 0.0,
758
+ "num_truncated_few_shots": 0
759
+ },
760
+ "lighteval|agieval:gaokao-physics|0": {
761
+ "hashes": {
762
+ "hash_examples": "eb05f035c7bfca2f",
763
+ "hash_full_prompts": "be15722274b1466d",
764
+ "hash_input_tokens": "f815a74f14d6edfd",
765
+ "hash_cont_tokens": "93b4c52fa838ace2"
766
+ },
767
+ "truncated": 0,
768
+ "non_truncated": 200,
769
+ "padded": 800,
770
+ "non_padded": 0,
771
+ "effective_few_shots": 0.0,
772
+ "num_truncated_few_shots": 0
773
+ },
774
+ "lighteval|agieval:logiqa-en|0": {
775
+ "hashes": {
776
+ "hash_examples": "0a688a45f69c21e0",
777
+ "hash_full_prompts": "ca179e67bdc726a6",
778
+ "hash_input_tokens": "28f2f008004bf2fd",
779
+ "hash_cont_tokens": "2624c1243afac3f2"
780
+ },
781
+ "truncated": 0,
782
+ "non_truncated": 651,
783
+ "padded": 2604,
784
+ "non_padded": 0,
785
+ "effective_few_shots": 0.0,
786
+ "num_truncated_few_shots": 0
787
+ },
788
+ "lighteval|agieval:logiqa-zh|0": {
789
+ "hashes": {
790
+ "hash_examples": "620d6888b6012ea5",
791
+ "hash_full_prompts": "55e305ed89c6e580",
792
+ "hash_input_tokens": "77534c4e0d7f4bf5",
793
+ "hash_cont_tokens": "725ca2b921b6f8fe"
794
+ },
795
+ "truncated": 0,
796
+ "non_truncated": 651,
797
+ "padded": 2603,
798
+ "non_padded": 1,
799
+ "effective_few_shots": 0.0,
800
+ "num_truncated_few_shots": 0
801
+ },
802
+ "lighteval|agieval:lsat-ar|0": {
803
+ "hashes": {
804
+ "hash_examples": "627c8f5ccd5da209",
805
+ "hash_full_prompts": "59e010e22954d5b7",
806
+ "hash_input_tokens": "c2d0ef2e24158dae",
807
+ "hash_cont_tokens": "23c097e1d431f2b8"
808
+ },
809
+ "truncated": 0,
810
+ "non_truncated": 230,
811
+ "padded": 1137,
812
+ "non_padded": 13,
813
+ "effective_few_shots": 0.0,
814
+ "num_truncated_few_shots": 0
815
+ },
816
+ "lighteval|agieval:lsat-lr|0": {
817
+ "hashes": {
818
+ "hash_examples": "794641c86de172f5",
819
+ "hash_full_prompts": "efc3c1a3a1586d3e",
820
+ "hash_input_tokens": "00c145036001ea42",
821
+ "hash_cont_tokens": "b555f4319746d815"
822
+ },
823
+ "truncated": 0,
824
+ "non_truncated": 510,
825
+ "padded": 2532,
826
+ "non_padded": 18,
827
+ "effective_few_shots": 0.0,
828
+ "num_truncated_few_shots": 0
829
+ },
830
+ "lighteval|agieval:lsat-rc|0": {
831
+ "hashes": {
832
+ "hash_examples": "35981ed917ea01cf",
833
+ "hash_full_prompts": "b80e2b86e1eb0cea",
834
+ "hash_input_tokens": "d6d5b49bd88c9e70",
835
+ "hash_cont_tokens": "8c1c4fc8c9cabd97"
836
+ },
837
+ "truncated": 0,
838
+ "non_truncated": 269,
839
+ "padded": 1345,
840
+ "non_padded": 0,
841
+ "effective_few_shots": 0.0,
842
+ "num_truncated_few_shots": 0
843
+ },
844
+ "lighteval|agieval:sat-en|0": {
845
+ "hashes": {
846
+ "hash_examples": "041c39c646536a1e",
847
+ "hash_full_prompts": "4eb610121b313521",
848
+ "hash_input_tokens": "cef5e69336a98b3c",
849
+ "hash_cont_tokens": "4837f17aae6c95e0"
850
+ },
851
+ "truncated": 0,
852
+ "non_truncated": 206,
853
+ "padded": 821,
854
+ "non_padded": 0,
855
+ "effective_few_shots": 0.0,
856
+ "num_truncated_few_shots": 0
857
+ },
858
+ "lighteval|agieval:sat-en-without-passage|0": {
859
+ "hashes": {
860
+ "hash_examples": "e4d9284367dff68f",
861
+ "hash_full_prompts": "532ea18906ff2f4e",
862
+ "hash_input_tokens": "93408ebd24af8814",
863
+ "hash_cont_tokens": "4837f17aae6c95e0"
864
+ },
865
+ "truncated": 0,
866
+ "non_truncated": 206,
867
+ "padded": 817,
868
+ "non_padded": 4,
869
+ "effective_few_shots": 0.0,
870
+ "num_truncated_few_shots": 0
871
+ },
872
+ "lighteval|agieval:sat-math|0": {
873
+ "hashes": {
874
+ "hash_examples": "01db7291603fc1a0",
875
+ "hash_full_prompts": "1422fad2e0cca51f",
876
+ "hash_input_tokens": "09fa56db6a40b324",
877
+ "hash_cont_tokens": "d959ef83452da9fe"
878
+ },
879
+ "truncated": 0,
880
+ "non_truncated": 220,
881
+ "padded": 877,
882
+ "non_padded": 3,
883
+ "effective_few_shots": 0.0,
884
+ "num_truncated_few_shots": 0
885
+ }
886
+ },
887
+ "summary_general": {
888
+ "hashes": {
889
+ "hash_examples": "da3af66181f18ddf",
890
+ "hash_full_prompts": "e89209d4ce68d63a",
891
+ "hash_input_tokens": "bc8a4e955411c77c",
892
+ "hash_cont_tokens": "b3bace8c3199f6d8"
893
+ },
894
+ "truncated": 0,
895
+ "non_truncated": 5151,
896
+ "padded": 21823,
897
+ "non_padded": 41,
898
+ "num_truncated_few_shots": 0
899
+ }
900
+ }