lewtun HF staff commited on
Commit
2961e00
·
verified ·
1 Parent(s): d4240c5

Upload eval_results/HuggingFaceH4/zephyr-7b-beta/main/agieval/results_2024-03-28T16-41-08.142040.json with huggingface_hub

Browse files
eval_results/HuggingFaceH4/zephyr-7b-beta/main/agieval/results_2024-03-28T16-41-08.142040.json ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2079520.330530645,
9
+ "end_time": 2079786.912695208,
10
+ "total_evaluation_time_secondes": "266.5821645630058",
11
+ "model_name": "HuggingFaceH4/zephyr-7b-beta",
12
+ "model_sha": "b70e0c9a2d9e14bd1e812d3c398e5f313e93b473",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.2440944881889764,
20
+ "acc_stderr": 0.027005516126961022,
21
+ "acc_norm": 0.21653543307086615,
22
+ "acc_norm_stderr": 0.02589488017640766
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.3142857142857143,
26
+ "acc_stderr": 0.03211151353994383,
27
+ "acc_norm": 0.3619047619047619,
28
+ "acc_norm_stderr": 0.03324043951593504
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.2995169082125604,
32
+ "acc_stderr": 0.031913606824066645,
33
+ "acc_norm": 0.28019323671497587,
34
+ "acc_norm_stderr": 0.031289827964521094
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.25609756097560976,
38
+ "acc_stderr": 0.027885450835842325,
39
+ "acc_norm": 0.26422764227642276,
40
+ "acc_norm_stderr": 0.028169414252345877
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.6601307189542484,
44
+ "acc_stderr": 0.027121956071388852,
45
+ "acc_norm": 0.6699346405228758,
46
+ "acc_norm_stderr": 0.026925654653615693
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.3768844221105528,
50
+ "acc_stderr": 0.034439417931776,
51
+ "acc_norm": 0.38190954773869346,
52
+ "acc_norm_stderr": 0.03452817946540989
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.37872340425531914,
56
+ "acc_stderr": 0.03170995606040655,
57
+ "acc_norm": 0.39148936170212767,
58
+ "acc_norm_stderr": 0.03190701242326812
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.24786324786324787,
62
+ "acc_stderr": 0.023079184079532418,
63
+ "acc_norm": 0.2564102564102564,
64
+ "acc_norm_stderr": 0.023339974098276813
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.325,
68
+ "acc_stderr": 0.0332022127978448,
69
+ "acc_norm": 0.33,
70
+ "acc_norm_stderr": 0.03333249580187338
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.36098310291858676,
74
+ "acc_stderr": 0.01883835295453869,
75
+ "acc_norm": 0.3778801843317972,
76
+ "acc_norm_stderr": 0.019017673991121052
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.3778801843317972,
80
+ "acc_stderr": 0.019017673991121052,
81
+ "acc_norm": 0.3686635944700461,
82
+ "acc_norm_stderr": 0.018922951005122524
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.24347826086956523,
86
+ "acc_stderr": 0.028361099300075073,
87
+ "acc_norm": 0.23478260869565218,
88
+ "acc_norm_stderr": 0.028009647070930118
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.39215686274509803,
92
+ "acc_stderr": 0.021640474419436253,
93
+ "acc_norm": 0.36470588235294116,
94
+ "acc_norm_stderr": 0.021335356790349588
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.4944237918215613,
98
+ "acc_stderr": 0.030540461655697047,
99
+ "acc_norm": 0.4721189591078067,
100
+ "acc_norm_stderr": 0.030494839761588354
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.6407766990291263,
104
+ "acc_stderr": 0.03350878450608781,
105
+ "acc_norm": 0.6407766990291263,
106
+ "acc_norm_stderr": 0.03350878450608781
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.45145631067961167,
110
+ "acc_stderr": 0.034756540723428556,
111
+ "acc_norm": 0.42718446601941745,
112
+ "acc_norm_stderr": 0.03454921537431906
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.36363636363636365,
116
+ "acc_stderr": 0.0325060555424689,
117
+ "acc_norm": 0.3409090909090909,
118
+ "acc_norm_stderr": 0.03203095553573995
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.3780816494634082,
122
+ "acc_stderr": 0.028684603374153867,
123
+ "acc_norm": 0.37527213913275626,
124
+ "acc_norm_stderr": 0.028617488375700707
125
+ },
126
+ "all": {
127
+ "acc": 0.3780816494634082,
128
+ "acc_stderr": 0.028684603374153867,
129
+ "acc_norm": 0.37527213913275626,
130
+ "acc_norm_stderr": 0.028617488375700707
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "frozen": false,
174
+ "suite": [
175
+ "lighteval"
176
+ ],
177
+ "original_num_docs": 254,
178
+ "effective_num_docs": 254,
179
+ "trust_dataset": true,
180
+ "must_remove_duplicate_docs": null
181
+ },
182
+ "lighteval|agieval:gaokao-biology": {
183
+ "name": "agieval:gaokao-biology",
184
+ "prompt_function": "agieval",
185
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
186
+ "hf_subset": "default",
187
+ "metric": [
188
+ "loglikelihood_acc",
189
+ "loglikelihood_acc_norm_nospace"
190
+ ],
191
+ "hf_avail_splits": [
192
+ "test"
193
+ ],
194
+ "evaluation_splits": [
195
+ "test"
196
+ ],
197
+ "few_shots_split": null,
198
+ "few_shots_select": "random_sampling",
199
+ "generation_size": 1,
200
+ "stop_sequence": null,
201
+ "output_regex": null,
202
+ "frozen": false,
203
+ "suite": [
204
+ "lighteval"
205
+ ],
206
+ "original_num_docs": 210,
207
+ "effective_num_docs": 210,
208
+ "trust_dataset": true,
209
+ "must_remove_duplicate_docs": null
210
+ },
211
+ "lighteval|agieval:gaokao-chemistry": {
212
+ "name": "agieval:gaokao-chemistry",
213
+ "prompt_function": "agieval",
214
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
215
+ "hf_subset": "default",
216
+ "metric": [
217
+ "loglikelihood_acc",
218
+ "loglikelihood_acc_norm_nospace"
219
+ ],
220
+ "hf_avail_splits": [
221
+ "test"
222
+ ],
223
+ "evaluation_splits": [
224
+ "test"
225
+ ],
226
+ "few_shots_split": null,
227
+ "few_shots_select": "random_sampling",
228
+ "generation_size": 1,
229
+ "stop_sequence": null,
230
+ "output_regex": null,
231
+ "frozen": false,
232
+ "suite": [
233
+ "lighteval"
234
+ ],
235
+ "original_num_docs": 207,
236
+ "effective_num_docs": 207,
237
+ "trust_dataset": true,
238
+ "must_remove_duplicate_docs": null
239
+ },
240
+ "lighteval|agieval:gaokao-chinese": {
241
+ "name": "agieval:gaokao-chinese",
242
+ "prompt_function": "agieval",
243
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
244
+ "hf_subset": "default",
245
+ "metric": [
246
+ "loglikelihood_acc",
247
+ "loglikelihood_acc_norm_nospace"
248
+ ],
249
+ "hf_avail_splits": [
250
+ "test"
251
+ ],
252
+ "evaluation_splits": [
253
+ "test"
254
+ ],
255
+ "few_shots_split": null,
256
+ "few_shots_select": "random_sampling",
257
+ "generation_size": 1,
258
+ "stop_sequence": null,
259
+ "output_regex": null,
260
+ "frozen": false,
261
+ "suite": [
262
+ "lighteval"
263
+ ],
264
+ "original_num_docs": 246,
265
+ "effective_num_docs": 246,
266
+ "trust_dataset": true,
267
+ "must_remove_duplicate_docs": null
268
+ },
269
+ "lighteval|agieval:gaokao-english": {
270
+ "name": "agieval:gaokao-english",
271
+ "prompt_function": "agieval",
272
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
273
+ "hf_subset": "default",
274
+ "metric": [
275
+ "loglikelihood_acc",
276
+ "loglikelihood_acc_norm_nospace"
277
+ ],
278
+ "hf_avail_splits": [
279
+ "test"
280
+ ],
281
+ "evaluation_splits": [
282
+ "test"
283
+ ],
284
+ "few_shots_split": null,
285
+ "few_shots_select": "random_sampling",
286
+ "generation_size": 1,
287
+ "stop_sequence": null,
288
+ "output_regex": null,
289
+ "frozen": false,
290
+ "suite": [
291
+ "lighteval"
292
+ ],
293
+ "original_num_docs": 306,
294
+ "effective_num_docs": 306,
295
+ "trust_dataset": true,
296
+ "must_remove_duplicate_docs": null
297
+ },
298
+ "lighteval|agieval:gaokao-geography": {
299
+ "name": "agieval:gaokao-geography",
300
+ "prompt_function": "agieval",
301
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
302
+ "hf_subset": "default",
303
+ "metric": [
304
+ "loglikelihood_acc",
305
+ "loglikelihood_acc_norm_nospace"
306
+ ],
307
+ "hf_avail_splits": [
308
+ "test"
309
+ ],
310
+ "evaluation_splits": [
311
+ "test"
312
+ ],
313
+ "few_shots_split": null,
314
+ "few_shots_select": "random_sampling",
315
+ "generation_size": 1,
316
+ "stop_sequence": null,
317
+ "output_regex": null,
318
+ "frozen": false,
319
+ "suite": [
320
+ "lighteval"
321
+ ],
322
+ "original_num_docs": 199,
323
+ "effective_num_docs": 199,
324
+ "trust_dataset": true,
325
+ "must_remove_duplicate_docs": null
326
+ },
327
+ "lighteval|agieval:gaokao-history": {
328
+ "name": "agieval:gaokao-history",
329
+ "prompt_function": "agieval",
330
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
331
+ "hf_subset": "default",
332
+ "metric": [
333
+ "loglikelihood_acc",
334
+ "loglikelihood_acc_norm_nospace"
335
+ ],
336
+ "hf_avail_splits": [
337
+ "test"
338
+ ],
339
+ "evaluation_splits": [
340
+ "test"
341
+ ],
342
+ "few_shots_split": null,
343
+ "few_shots_select": "random_sampling",
344
+ "generation_size": 1,
345
+ "stop_sequence": null,
346
+ "output_regex": null,
347
+ "frozen": false,
348
+ "suite": [
349
+ "lighteval"
350
+ ],
351
+ "original_num_docs": 235,
352
+ "effective_num_docs": 235,
353
+ "trust_dataset": true,
354
+ "must_remove_duplicate_docs": null
355
+ },
356
+ "lighteval|agieval:gaokao-mathqa": {
357
+ "name": "agieval:gaokao-mathqa",
358
+ "prompt_function": "agieval",
359
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
360
+ "hf_subset": "default",
361
+ "metric": [
362
+ "loglikelihood_acc",
363
+ "loglikelihood_acc_norm_nospace"
364
+ ],
365
+ "hf_avail_splits": [
366
+ "test"
367
+ ],
368
+ "evaluation_splits": [
369
+ "test"
370
+ ],
371
+ "few_shots_split": null,
372
+ "few_shots_select": "random_sampling",
373
+ "generation_size": 1,
374
+ "stop_sequence": null,
375
+ "output_regex": null,
376
+ "frozen": false,
377
+ "suite": [
378
+ "lighteval"
379
+ ],
380
+ "original_num_docs": 351,
381
+ "effective_num_docs": 351,
382
+ "trust_dataset": true,
383
+ "must_remove_duplicate_docs": null
384
+ },
385
+ "lighteval|agieval:gaokao-physics": {
386
+ "name": "agieval:gaokao-physics",
387
+ "prompt_function": "agieval",
388
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
389
+ "hf_subset": "default",
390
+ "metric": [
391
+ "loglikelihood_acc",
392
+ "loglikelihood_acc_norm_nospace"
393
+ ],
394
+ "hf_avail_splits": [
395
+ "test"
396
+ ],
397
+ "evaluation_splits": [
398
+ "test"
399
+ ],
400
+ "few_shots_split": null,
401
+ "few_shots_select": "random_sampling",
402
+ "generation_size": 1,
403
+ "stop_sequence": null,
404
+ "output_regex": null,
405
+ "frozen": false,
406
+ "suite": [
407
+ "lighteval"
408
+ ],
409
+ "original_num_docs": 200,
410
+ "effective_num_docs": 200,
411
+ "trust_dataset": true,
412
+ "must_remove_duplicate_docs": null
413
+ },
414
+ "lighteval|agieval:logiqa-en": {
415
+ "name": "agieval:logiqa-en",
416
+ "prompt_function": "agieval",
417
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
418
+ "hf_subset": "default",
419
+ "metric": [
420
+ "loglikelihood_acc",
421
+ "loglikelihood_acc_norm_nospace"
422
+ ],
423
+ "hf_avail_splits": [
424
+ "test"
425
+ ],
426
+ "evaluation_splits": [
427
+ "test"
428
+ ],
429
+ "few_shots_split": null,
430
+ "few_shots_select": "random_sampling",
431
+ "generation_size": 1,
432
+ "stop_sequence": null,
433
+ "output_regex": null,
434
+ "frozen": false,
435
+ "suite": [
436
+ "lighteval"
437
+ ],
438
+ "original_num_docs": 651,
439
+ "effective_num_docs": 651,
440
+ "trust_dataset": true,
441
+ "must_remove_duplicate_docs": null
442
+ },
443
+ "lighteval|agieval:logiqa-zh": {
444
+ "name": "agieval:logiqa-zh",
445
+ "prompt_function": "agieval",
446
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
447
+ "hf_subset": "default",
448
+ "metric": [
449
+ "loglikelihood_acc",
450
+ "loglikelihood_acc_norm_nospace"
451
+ ],
452
+ "hf_avail_splits": [
453
+ "test"
454
+ ],
455
+ "evaluation_splits": [
456
+ "test"
457
+ ],
458
+ "few_shots_split": null,
459
+ "few_shots_select": "random_sampling",
460
+ "generation_size": 1,
461
+ "stop_sequence": null,
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 651,
468
+ "effective_num_docs": 651,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|agieval:lsat-ar": {
473
+ "name": "agieval:lsat-ar",
474
+ "prompt_function": "agieval",
475
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
476
+ "hf_subset": "default",
477
+ "metric": [
478
+ "loglikelihood_acc",
479
+ "loglikelihood_acc_norm_nospace"
480
+ ],
481
+ "hf_avail_splits": [
482
+ "test"
483
+ ],
484
+ "evaluation_splits": [
485
+ "test"
486
+ ],
487
+ "few_shots_split": null,
488
+ "few_shots_select": "random_sampling",
489
+ "generation_size": 1,
490
+ "stop_sequence": null,
491
+ "output_regex": null,
492
+ "frozen": false,
493
+ "suite": [
494
+ "lighteval"
495
+ ],
496
+ "original_num_docs": 230,
497
+ "effective_num_docs": 230,
498
+ "trust_dataset": true,
499
+ "must_remove_duplicate_docs": null
500
+ },
501
+ "lighteval|agieval:lsat-lr": {
502
+ "name": "agieval:lsat-lr",
503
+ "prompt_function": "agieval",
504
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
505
+ "hf_subset": "default",
506
+ "metric": [
507
+ "loglikelihood_acc",
508
+ "loglikelihood_acc_norm_nospace"
509
+ ],
510
+ "hf_avail_splits": [
511
+ "test"
512
+ ],
513
+ "evaluation_splits": [
514
+ "test"
515
+ ],
516
+ "few_shots_split": null,
517
+ "few_shots_select": "random_sampling",
518
+ "generation_size": 1,
519
+ "stop_sequence": null,
520
+ "output_regex": null,
521
+ "frozen": false,
522
+ "suite": [
523
+ "lighteval"
524
+ ],
525
+ "original_num_docs": 510,
526
+ "effective_num_docs": 510,
527
+ "trust_dataset": true,
528
+ "must_remove_duplicate_docs": null
529
+ },
530
+ "lighteval|agieval:lsat-rc": {
531
+ "name": "agieval:lsat-rc",
532
+ "prompt_function": "agieval",
533
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
534
+ "hf_subset": "default",
535
+ "metric": [
536
+ "loglikelihood_acc",
537
+ "loglikelihood_acc_norm_nospace"
538
+ ],
539
+ "hf_avail_splits": [
540
+ "test"
541
+ ],
542
+ "evaluation_splits": [
543
+ "test"
544
+ ],
545
+ "few_shots_split": null,
546
+ "few_shots_select": "random_sampling",
547
+ "generation_size": 1,
548
+ "stop_sequence": null,
549
+ "output_regex": null,
550
+ "frozen": false,
551
+ "suite": [
552
+ "lighteval"
553
+ ],
554
+ "original_num_docs": 269,
555
+ "effective_num_docs": 269,
556
+ "trust_dataset": true,
557
+ "must_remove_duplicate_docs": null
558
+ },
559
+ "lighteval|agieval:sat-en": {
560
+ "name": "agieval:sat-en",
561
+ "prompt_function": "agieval",
562
+ "hf_repo": "dmayhem93/agieval-sat-en",
563
+ "hf_subset": "default",
564
+ "metric": [
565
+ "loglikelihood_acc",
566
+ "loglikelihood_acc_norm_nospace"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": "random_sampling",
576
+ "generation_size": 1,
577
+ "stop_sequence": null,
578
+ "output_regex": null,
579
+ "frozen": false,
580
+ "suite": [
581
+ "lighteval"
582
+ ],
583
+ "original_num_docs": 206,
584
+ "effective_num_docs": 206,
585
+ "trust_dataset": true,
586
+ "must_remove_duplicate_docs": null
587
+ },
588
+ "lighteval|agieval:sat-en-without-passage": {
589
+ "name": "agieval:sat-en-without-passage",
590
+ "prompt_function": "agieval",
591
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
592
+ "hf_subset": "default",
593
+ "metric": [
594
+ "loglikelihood_acc",
595
+ "loglikelihood_acc_norm_nospace"
596
+ ],
597
+ "hf_avail_splits": [
598
+ "test"
599
+ ],
600
+ "evaluation_splits": [
601
+ "test"
602
+ ],
603
+ "few_shots_split": null,
604
+ "few_shots_select": "random_sampling",
605
+ "generation_size": 1,
606
+ "stop_sequence": null,
607
+ "output_regex": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null
616
+ },
617
+ "lighteval|agieval:sat-math": {
618
+ "name": "agieval:sat-math",
619
+ "prompt_function": "agieval",
620
+ "hf_repo": "dmayhem93/agieval-sat-math",
621
+ "hf_subset": "default",
622
+ "metric": [
623
+ "loglikelihood_acc",
624
+ "loglikelihood_acc_norm_nospace"
625
+ ],
626
+ "hf_avail_splits": [
627
+ "test"
628
+ ],
629
+ "evaluation_splits": [
630
+ "test"
631
+ ],
632
+ "few_shots_split": null,
633
+ "few_shots_select": "random_sampling",
634
+ "generation_size": 1,
635
+ "stop_sequence": null,
636
+ "output_regex": null,
637
+ "frozen": false,
638
+ "suite": [
639
+ "lighteval"
640
+ ],
641
+ "original_num_docs": 220,
642
+ "effective_num_docs": 220,
643
+ "trust_dataset": true,
644
+ "must_remove_duplicate_docs": null
645
+ }
646
+ },
647
+ "summary_tasks": {
648
+ "lighteval|agieval:aqua-rat|0": {
649
+ "hashes": {
650
+ "hash_examples": "f09607f69e5b7525",
651
+ "hash_full_prompts": "8b913655a6fea4ab",
652
+ "hash_input_tokens": "293a61e5163aa27e",
653
+ "hash_cont_tokens": "a12c4ac8996ba11d"
654
+ },
655
+ "truncated": 0,
656
+ "non_truncated": 254,
657
+ "padded": 1270,
658
+ "non_padded": 0,
659
+ "effective_few_shots": 0.0,
660
+ "num_truncated_few_shots": 0
661
+ },
662
+ "lighteval|agieval:gaokao-biology|0": {
663
+ "hashes": {
664
+ "hash_examples": "f262eaf4a72db963",
665
+ "hash_full_prompts": "c7078ace868f7ee8",
666
+ "hash_input_tokens": "4abbc3b98bca27c3",
667
+ "hash_cont_tokens": "22b786cf7aa6d1a9"
668
+ },
669
+ "truncated": 0,
670
+ "non_truncated": 210,
671
+ "padded": 840,
672
+ "non_padded": 0,
673
+ "effective_few_shots": 0.0,
674
+ "num_truncated_few_shots": 0
675
+ },
676
+ "lighteval|agieval:gaokao-chemistry|0": {
677
+ "hashes": {
678
+ "hash_examples": "47f2e649f58d9da5",
679
+ "hash_full_prompts": "bd066d6d8c807f39",
680
+ "hash_input_tokens": "3d9728247bf04ac3",
681
+ "hash_cont_tokens": "318562bcb4103fc4"
682
+ },
683
+ "truncated": 0,
684
+ "non_truncated": 207,
685
+ "padded": 831,
686
+ "non_padded": 0,
687
+ "effective_few_shots": 0.0,
688
+ "num_truncated_few_shots": 0
689
+ },
690
+ "lighteval|agieval:gaokao-chinese|0": {
691
+ "hashes": {
692
+ "hash_examples": "1010b21fde4726ab",
693
+ "hash_full_prompts": "3f53e9dd34c43d52",
694
+ "hash_input_tokens": "f2a1d4c848527f86",
695
+ "hash_cont_tokens": "7b177add04591cdb"
696
+ },
697
+ "truncated": 0,
698
+ "non_truncated": 246,
699
+ "padded": 982,
700
+ "non_padded": 2,
701
+ "effective_few_shots": 0.0,
702
+ "num_truncated_few_shots": 0
703
+ },
704
+ "lighteval|agieval:gaokao-english|0": {
705
+ "hashes": {
706
+ "hash_examples": "4864e492a350ae93",
707
+ "hash_full_prompts": "59104cb8623f69e5",
708
+ "hash_input_tokens": "ff82bcaabb6cde43",
709
+ "hash_cont_tokens": "c9ca0addab2a9327"
710
+ },
711
+ "truncated": 0,
712
+ "non_truncated": 306,
713
+ "padded": 1224,
714
+ "non_padded": 0,
715
+ "effective_few_shots": 0.0,
716
+ "num_truncated_few_shots": 0
717
+ },
718
+ "lighteval|agieval:gaokao-geography|0": {
719
+ "hashes": {
720
+ "hash_examples": "ec3a021e37650e7d",
721
+ "hash_full_prompts": "d2456e0377df1973",
722
+ "hash_input_tokens": "b6593d42d60f9e65",
723
+ "hash_cont_tokens": "e1bc87e81807da78"
724
+ },
725
+ "truncated": 0,
726
+ "non_truncated": 199,
727
+ "padded": 796,
728
+ "non_padded": 0,
729
+ "effective_few_shots": 0.0,
730
+ "num_truncated_few_shots": 0
731
+ },
732
+ "lighteval|agieval:gaokao-history|0": {
733
+ "hashes": {
734
+ "hash_examples": "b3fad1596f1ae1f9",
735
+ "hash_full_prompts": "faea8f291d9a0cd5",
736
+ "hash_input_tokens": "ffa388f05b4bfadf",
737
+ "hash_cont_tokens": "b3c6c60f59b08db4"
738
+ },
739
+ "truncated": 0,
740
+ "non_truncated": 235,
741
+ "padded": 940,
742
+ "non_padded": 0,
743
+ "effective_few_shots": 0.0,
744
+ "num_truncated_few_shots": 0
745
+ },
746
+ "lighteval|agieval:gaokao-mathqa|0": {
747
+ "hashes": {
748
+ "hash_examples": "1d1088556861b0b0",
749
+ "hash_full_prompts": "de899bfeaaa61154",
750
+ "hash_input_tokens": "1e826e8ae60c9cf4",
751
+ "hash_cont_tokens": "5d69ebf8391bf298"
752
+ },
753
+ "truncated": 0,
754
+ "non_truncated": 351,
755
+ "padded": 1404,
756
+ "non_padded": 0,
757
+ "effective_few_shots": 0.0,
758
+ "num_truncated_few_shots": 0
759
+ },
760
+ "lighteval|agieval:gaokao-physics|0": {
761
+ "hashes": {
762
+ "hash_examples": "eb05f035c7bfca2f",
763
+ "hash_full_prompts": "08008e0300283edc",
764
+ "hash_input_tokens": "0a4cddbeea3c31fa",
765
+ "hash_cont_tokens": "93b4c52fa838ace2"
766
+ },
767
+ "truncated": 0,
768
+ "non_truncated": 200,
769
+ "padded": 800,
770
+ "non_padded": 0,
771
+ "effective_few_shots": 0.0,
772
+ "num_truncated_few_shots": 0
773
+ },
774
+ "lighteval|agieval:logiqa-en|0": {
775
+ "hashes": {
776
+ "hash_examples": "0a688a45f69c21e0",
777
+ "hash_full_prompts": "3405fd262d4b2d28",
778
+ "hash_input_tokens": "8672a68d080ba1fb",
779
+ "hash_cont_tokens": "2624c1243afac3f2"
780
+ },
781
+ "truncated": 0,
782
+ "non_truncated": 651,
783
+ "padded": 2604,
784
+ "non_padded": 0,
785
+ "effective_few_shots": 0.0,
786
+ "num_truncated_few_shots": 0
787
+ },
788
+ "lighteval|agieval:logiqa-zh|0": {
789
+ "hashes": {
790
+ "hash_examples": "620d6888b6012ea5",
791
+ "hash_full_prompts": "ac19dc4eaa56f5e0",
792
+ "hash_input_tokens": "24c3bd34df395ee2",
793
+ "hash_cont_tokens": "725ca2b921b6f8fe"
794
+ },
795
+ "truncated": 0,
796
+ "non_truncated": 651,
797
+ "padded": 2603,
798
+ "non_padded": 1,
799
+ "effective_few_shots": 0.0,
800
+ "num_truncated_few_shots": 0
801
+ },
802
+ "lighteval|agieval:lsat-ar|0": {
803
+ "hashes": {
804
+ "hash_examples": "627c8f5ccd5da209",
805
+ "hash_full_prompts": "9aed992c4bfa8dd7",
806
+ "hash_input_tokens": "042173fcbbb85776",
807
+ "hash_cont_tokens": "23c097e1d431f2b8"
808
+ },
809
+ "truncated": 0,
810
+ "non_truncated": 230,
811
+ "padded": 1137,
812
+ "non_padded": 13,
813
+ "effective_few_shots": 0.0,
814
+ "num_truncated_few_shots": 0
815
+ },
816
+ "lighteval|agieval:lsat-lr|0": {
817
+ "hashes": {
818
+ "hash_examples": "794641c86de172f5",
819
+ "hash_full_prompts": "6a36e90325996129",
820
+ "hash_input_tokens": "58c268b8b0f3c5c4",
821
+ "hash_cont_tokens": "b555f4319746d815"
822
+ },
823
+ "truncated": 0,
824
+ "non_truncated": 510,
825
+ "padded": 2532,
826
+ "non_padded": 18,
827
+ "effective_few_shots": 0.0,
828
+ "num_truncated_few_shots": 0
829
+ },
830
+ "lighteval|agieval:lsat-rc|0": {
831
+ "hashes": {
832
+ "hash_examples": "35981ed917ea01cf",
833
+ "hash_full_prompts": "15f0f342f9572c41",
834
+ "hash_input_tokens": "1eaa7331cf1a555e",
835
+ "hash_cont_tokens": "8c1c4fc8c9cabd97"
836
+ },
837
+ "truncated": 0,
838
+ "non_truncated": 269,
839
+ "padded": 1345,
840
+ "non_padded": 0,
841
+ "effective_few_shots": 0.0,
842
+ "num_truncated_few_shots": 0
843
+ },
844
+ "lighteval|agieval:sat-en|0": {
845
+ "hashes": {
846
+ "hash_examples": "041c39c646536a1e",
847
+ "hash_full_prompts": "163217fd603b9352",
848
+ "hash_input_tokens": "e1e9417c28fa0db0",
849
+ "hash_cont_tokens": "4837f17aae6c95e0"
850
+ },
851
+ "truncated": 0,
852
+ "non_truncated": 206,
853
+ "padded": 821,
854
+ "non_padded": 0,
855
+ "effective_few_shots": 0.0,
856
+ "num_truncated_few_shots": 0
857
+ },
858
+ "lighteval|agieval:sat-en-without-passage|0": {
859
+ "hashes": {
860
+ "hash_examples": "e4d9284367dff68f",
861
+ "hash_full_prompts": "bdd4c7065b87de8a",
862
+ "hash_input_tokens": "70bcab139ba874e5",
863
+ "hash_cont_tokens": "4837f17aae6c95e0"
864
+ },
865
+ "truncated": 0,
866
+ "non_truncated": 206,
867
+ "padded": 817,
868
+ "non_padded": 4,
869
+ "effective_few_shots": 0.0,
870
+ "num_truncated_few_shots": 0
871
+ },
872
+ "lighteval|agieval:sat-math|0": {
873
+ "hashes": {
874
+ "hash_examples": "01db7291603fc1a0",
875
+ "hash_full_prompts": "63ca65b2f0baebb5",
876
+ "hash_input_tokens": "e242b0a44e256fbb",
877
+ "hash_cont_tokens": "d959ef83452da9fe"
878
+ },
879
+ "truncated": 0,
880
+ "non_truncated": 220,
881
+ "padded": 877,
882
+ "non_padded": 3,
883
+ "effective_few_shots": 0.0,
884
+ "num_truncated_few_shots": 0
885
+ }
886
+ },
887
+ "summary_general": {
888
+ "hashes": {
889
+ "hash_examples": "da3af66181f18ddf",
890
+ "hash_full_prompts": "f7c298d03686fa0e",
891
+ "hash_input_tokens": "84e02a40c0a714da",
892
+ "hash_cont_tokens": "b3bace8c3199f6d8"
893
+ },
894
+ "truncated": 0,
895
+ "non_truncated": 5151,
896
+ "padded": 21823,
897
+ "non_padded": 41,
898
+ "num_truncated_few_shots": 0
899
+ }
900
+ }