lewtun HF staff commited on
Commit
d4240c5
·
verified ·
1 Parent(s): a96d97e

Upload eval_results/HuggingFaceH4/zephyr-7b-gemma-v0.1/main/agieval/results_2024-03-28T16-40-43.592094.json with huggingface_hub

Browse files
eval_results/HuggingFaceH4/zephyr-7b-gemma-v0.1/main/agieval/results_2024-03-28T16-40-43.592094.json ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 549406.183073153,
9
+ "end_time": 549696.457783003,
10
+ "total_evaluation_time_secondes": "290.2747098499676",
11
+ "model_name": "HuggingFaceH4/zephyr-7b-gemma-v0.1",
12
+ "model_sha": "03b3427d0ed07d2e0f86c0a7e53d82d4beef9540",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "15.9 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.2125984251968504,
20
+ "acc_stderr": 0.025722779833723054,
21
+ "acc_norm": 0.2125984251968504,
22
+ "acc_norm_stderr": 0.02572277983372305
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.35714285714285715,
26
+ "acc_stderr": 0.033144012047664914,
27
+ "acc_norm": 0.38095238095238093,
28
+ "acc_norm_stderr": 0.03359110046749989
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.26570048309178745,
32
+ "acc_stderr": 0.030775079470103068,
33
+ "acc_norm": 0.2946859903381642,
34
+ "acc_norm_stderr": 0.03176416108295297
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.2967479674796748,
38
+ "acc_stderr": 0.029185445861037912,
39
+ "acc_norm": 0.3008130081300813,
40
+ "acc_norm_stderr": 0.02929961637067325
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.6111111111111112,
44
+ "acc_stderr": 0.027914055510468008,
45
+ "acc_norm": 0.5620915032679739,
46
+ "acc_norm_stderr": 0.028408302020332687
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.46733668341708545,
50
+ "acc_stderr": 0.03545755092964412,
51
+ "acc_norm": 0.457286432160804,
52
+ "acc_norm_stderr": 0.035403557368657
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.451063829787234,
56
+ "acc_stderr": 0.032529096196131965,
57
+ "acc_norm": 0.4127659574468085,
58
+ "acc_norm_stderr": 0.03218471141400352
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.27635327635327633,
62
+ "acc_stderr": 0.023903505003127223,
63
+ "acc_norm": 0.27635327635327633,
64
+ "acc_norm_stderr": 0.023903505003127216
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.36,
68
+ "acc_stderr": 0.03402629784040017,
69
+ "acc_norm": 0.37,
70
+ "acc_norm_stderr": 0.0342250899767933
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.35944700460829493,
74
+ "acc_stderr": 0.018820809084481267,
75
+ "acc_norm": 0.3486943164362519,
76
+ "acc_norm_stderr": 0.018692104055797923
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.31336405529953915,
80
+ "acc_stderr": 0.01819412517802074,
81
+ "acc_norm": 0.36251920122887865,
82
+ "acc_norm_stderr": 0.018855687979585072
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.2217391304347826,
86
+ "acc_stderr": 0.027451496604058913,
87
+ "acc_norm": 0.16521739130434782,
88
+ "acc_norm_stderr": 0.02454125880854541
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.3627450980392157,
92
+ "acc_stderr": 0.021310737393780418,
93
+ "acc_norm": 0.3215686274509804,
94
+ "acc_norm_stderr": 0.020702886736741092
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.5055762081784386,
98
+ "acc_stderr": 0.03054046165569704,
99
+ "acc_norm": 0.3754646840148699,
100
+ "acc_norm_stderr": 0.029579828435446678
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.6747572815533981,
104
+ "acc_stderr": 0.03271904737596389,
105
+ "acc_norm": 0.49514563106796117,
106
+ "acc_norm_stderr": 0.03491986890584391
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.46601941747572817,
110
+ "acc_stderr": 0.03484077510347999,
111
+ "acc_norm": 0.3446601941747573,
112
+ "acc_norm_stderr": 0.03319341285859081
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.42272727272727273,
116
+ "acc_stderr": 0.03338094264093533,
117
+ "acc_norm": 0.3409090909090909,
118
+ "acc_norm_stderr": 0.03203095553573995
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.38967235893509095,
122
+ "acc_stderr": 0.028818601042865762,
123
+ "acc_norm": 0.35421918296667515,
124
+ "acc_norm_stderr": 0.028648166285532566
125
+ },
126
+ "all": {
127
+ "acc": 0.38967235893509095,
128
+ "acc_stderr": 0.028818601042865762,
129
+ "acc_norm": 0.35421918296667515,
130
+ "acc_norm_stderr": 0.028648166285532566
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "frozen": false,
174
+ "suite": [
175
+ "lighteval"
176
+ ],
177
+ "original_num_docs": 254,
178
+ "effective_num_docs": 254,
179
+ "trust_dataset": true,
180
+ "must_remove_duplicate_docs": null
181
+ },
182
+ "lighteval|agieval:gaokao-biology": {
183
+ "name": "agieval:gaokao-biology",
184
+ "prompt_function": "agieval",
185
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
186
+ "hf_subset": "default",
187
+ "metric": [
188
+ "loglikelihood_acc",
189
+ "loglikelihood_acc_norm_nospace"
190
+ ],
191
+ "hf_avail_splits": [
192
+ "test"
193
+ ],
194
+ "evaluation_splits": [
195
+ "test"
196
+ ],
197
+ "few_shots_split": null,
198
+ "few_shots_select": "random_sampling",
199
+ "generation_size": 1,
200
+ "stop_sequence": null,
201
+ "output_regex": null,
202
+ "frozen": false,
203
+ "suite": [
204
+ "lighteval"
205
+ ],
206
+ "original_num_docs": 210,
207
+ "effective_num_docs": 210,
208
+ "trust_dataset": true,
209
+ "must_remove_duplicate_docs": null
210
+ },
211
+ "lighteval|agieval:gaokao-chemistry": {
212
+ "name": "agieval:gaokao-chemistry",
213
+ "prompt_function": "agieval",
214
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
215
+ "hf_subset": "default",
216
+ "metric": [
217
+ "loglikelihood_acc",
218
+ "loglikelihood_acc_norm_nospace"
219
+ ],
220
+ "hf_avail_splits": [
221
+ "test"
222
+ ],
223
+ "evaluation_splits": [
224
+ "test"
225
+ ],
226
+ "few_shots_split": null,
227
+ "few_shots_select": "random_sampling",
228
+ "generation_size": 1,
229
+ "stop_sequence": null,
230
+ "output_regex": null,
231
+ "frozen": false,
232
+ "suite": [
233
+ "lighteval"
234
+ ],
235
+ "original_num_docs": 207,
236
+ "effective_num_docs": 207,
237
+ "trust_dataset": true,
238
+ "must_remove_duplicate_docs": null
239
+ },
240
+ "lighteval|agieval:gaokao-chinese": {
241
+ "name": "agieval:gaokao-chinese",
242
+ "prompt_function": "agieval",
243
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
244
+ "hf_subset": "default",
245
+ "metric": [
246
+ "loglikelihood_acc",
247
+ "loglikelihood_acc_norm_nospace"
248
+ ],
249
+ "hf_avail_splits": [
250
+ "test"
251
+ ],
252
+ "evaluation_splits": [
253
+ "test"
254
+ ],
255
+ "few_shots_split": null,
256
+ "few_shots_select": "random_sampling",
257
+ "generation_size": 1,
258
+ "stop_sequence": null,
259
+ "output_regex": null,
260
+ "frozen": false,
261
+ "suite": [
262
+ "lighteval"
263
+ ],
264
+ "original_num_docs": 246,
265
+ "effective_num_docs": 246,
266
+ "trust_dataset": true,
267
+ "must_remove_duplicate_docs": null
268
+ },
269
+ "lighteval|agieval:gaokao-english": {
270
+ "name": "agieval:gaokao-english",
271
+ "prompt_function": "agieval",
272
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
273
+ "hf_subset": "default",
274
+ "metric": [
275
+ "loglikelihood_acc",
276
+ "loglikelihood_acc_norm_nospace"
277
+ ],
278
+ "hf_avail_splits": [
279
+ "test"
280
+ ],
281
+ "evaluation_splits": [
282
+ "test"
283
+ ],
284
+ "few_shots_split": null,
285
+ "few_shots_select": "random_sampling",
286
+ "generation_size": 1,
287
+ "stop_sequence": null,
288
+ "output_regex": null,
289
+ "frozen": false,
290
+ "suite": [
291
+ "lighteval"
292
+ ],
293
+ "original_num_docs": 306,
294
+ "effective_num_docs": 306,
295
+ "trust_dataset": true,
296
+ "must_remove_duplicate_docs": null
297
+ },
298
+ "lighteval|agieval:gaokao-geography": {
299
+ "name": "agieval:gaokao-geography",
300
+ "prompt_function": "agieval",
301
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
302
+ "hf_subset": "default",
303
+ "metric": [
304
+ "loglikelihood_acc",
305
+ "loglikelihood_acc_norm_nospace"
306
+ ],
307
+ "hf_avail_splits": [
308
+ "test"
309
+ ],
310
+ "evaluation_splits": [
311
+ "test"
312
+ ],
313
+ "few_shots_split": null,
314
+ "few_shots_select": "random_sampling",
315
+ "generation_size": 1,
316
+ "stop_sequence": null,
317
+ "output_regex": null,
318
+ "frozen": false,
319
+ "suite": [
320
+ "lighteval"
321
+ ],
322
+ "original_num_docs": 199,
323
+ "effective_num_docs": 199,
324
+ "trust_dataset": true,
325
+ "must_remove_duplicate_docs": null
326
+ },
327
+ "lighteval|agieval:gaokao-history": {
328
+ "name": "agieval:gaokao-history",
329
+ "prompt_function": "agieval",
330
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
331
+ "hf_subset": "default",
332
+ "metric": [
333
+ "loglikelihood_acc",
334
+ "loglikelihood_acc_norm_nospace"
335
+ ],
336
+ "hf_avail_splits": [
337
+ "test"
338
+ ],
339
+ "evaluation_splits": [
340
+ "test"
341
+ ],
342
+ "few_shots_split": null,
343
+ "few_shots_select": "random_sampling",
344
+ "generation_size": 1,
345
+ "stop_sequence": null,
346
+ "output_regex": null,
347
+ "frozen": false,
348
+ "suite": [
349
+ "lighteval"
350
+ ],
351
+ "original_num_docs": 235,
352
+ "effective_num_docs": 235,
353
+ "trust_dataset": true,
354
+ "must_remove_duplicate_docs": null
355
+ },
356
+ "lighteval|agieval:gaokao-mathqa": {
357
+ "name": "agieval:gaokao-mathqa",
358
+ "prompt_function": "agieval",
359
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
360
+ "hf_subset": "default",
361
+ "metric": [
362
+ "loglikelihood_acc",
363
+ "loglikelihood_acc_norm_nospace"
364
+ ],
365
+ "hf_avail_splits": [
366
+ "test"
367
+ ],
368
+ "evaluation_splits": [
369
+ "test"
370
+ ],
371
+ "few_shots_split": null,
372
+ "few_shots_select": "random_sampling",
373
+ "generation_size": 1,
374
+ "stop_sequence": null,
375
+ "output_regex": null,
376
+ "frozen": false,
377
+ "suite": [
378
+ "lighteval"
379
+ ],
380
+ "original_num_docs": 351,
381
+ "effective_num_docs": 351,
382
+ "trust_dataset": true,
383
+ "must_remove_duplicate_docs": null
384
+ },
385
+ "lighteval|agieval:gaokao-physics": {
386
+ "name": "agieval:gaokao-physics",
387
+ "prompt_function": "agieval",
388
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
389
+ "hf_subset": "default",
390
+ "metric": [
391
+ "loglikelihood_acc",
392
+ "loglikelihood_acc_norm_nospace"
393
+ ],
394
+ "hf_avail_splits": [
395
+ "test"
396
+ ],
397
+ "evaluation_splits": [
398
+ "test"
399
+ ],
400
+ "few_shots_split": null,
401
+ "few_shots_select": "random_sampling",
402
+ "generation_size": 1,
403
+ "stop_sequence": null,
404
+ "output_regex": null,
405
+ "frozen": false,
406
+ "suite": [
407
+ "lighteval"
408
+ ],
409
+ "original_num_docs": 200,
410
+ "effective_num_docs": 200,
411
+ "trust_dataset": true,
412
+ "must_remove_duplicate_docs": null
413
+ },
414
+ "lighteval|agieval:logiqa-en": {
415
+ "name": "agieval:logiqa-en",
416
+ "prompt_function": "agieval",
417
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
418
+ "hf_subset": "default",
419
+ "metric": [
420
+ "loglikelihood_acc",
421
+ "loglikelihood_acc_norm_nospace"
422
+ ],
423
+ "hf_avail_splits": [
424
+ "test"
425
+ ],
426
+ "evaluation_splits": [
427
+ "test"
428
+ ],
429
+ "few_shots_split": null,
430
+ "few_shots_select": "random_sampling",
431
+ "generation_size": 1,
432
+ "stop_sequence": null,
433
+ "output_regex": null,
434
+ "frozen": false,
435
+ "suite": [
436
+ "lighteval"
437
+ ],
438
+ "original_num_docs": 651,
439
+ "effective_num_docs": 651,
440
+ "trust_dataset": true,
441
+ "must_remove_duplicate_docs": null
442
+ },
443
+ "lighteval|agieval:logiqa-zh": {
444
+ "name": "agieval:logiqa-zh",
445
+ "prompt_function": "agieval",
446
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
447
+ "hf_subset": "default",
448
+ "metric": [
449
+ "loglikelihood_acc",
450
+ "loglikelihood_acc_norm_nospace"
451
+ ],
452
+ "hf_avail_splits": [
453
+ "test"
454
+ ],
455
+ "evaluation_splits": [
456
+ "test"
457
+ ],
458
+ "few_shots_split": null,
459
+ "few_shots_select": "random_sampling",
460
+ "generation_size": 1,
461
+ "stop_sequence": null,
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 651,
468
+ "effective_num_docs": 651,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|agieval:lsat-ar": {
473
+ "name": "agieval:lsat-ar",
474
+ "prompt_function": "agieval",
475
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
476
+ "hf_subset": "default",
477
+ "metric": [
478
+ "loglikelihood_acc",
479
+ "loglikelihood_acc_norm_nospace"
480
+ ],
481
+ "hf_avail_splits": [
482
+ "test"
483
+ ],
484
+ "evaluation_splits": [
485
+ "test"
486
+ ],
487
+ "few_shots_split": null,
488
+ "few_shots_select": "random_sampling",
489
+ "generation_size": 1,
490
+ "stop_sequence": null,
491
+ "output_regex": null,
492
+ "frozen": false,
493
+ "suite": [
494
+ "lighteval"
495
+ ],
496
+ "original_num_docs": 230,
497
+ "effective_num_docs": 230,
498
+ "trust_dataset": true,
499
+ "must_remove_duplicate_docs": null
500
+ },
501
+ "lighteval|agieval:lsat-lr": {
502
+ "name": "agieval:lsat-lr",
503
+ "prompt_function": "agieval",
504
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
505
+ "hf_subset": "default",
506
+ "metric": [
507
+ "loglikelihood_acc",
508
+ "loglikelihood_acc_norm_nospace"
509
+ ],
510
+ "hf_avail_splits": [
511
+ "test"
512
+ ],
513
+ "evaluation_splits": [
514
+ "test"
515
+ ],
516
+ "few_shots_split": null,
517
+ "few_shots_select": "random_sampling",
518
+ "generation_size": 1,
519
+ "stop_sequence": null,
520
+ "output_regex": null,
521
+ "frozen": false,
522
+ "suite": [
523
+ "lighteval"
524
+ ],
525
+ "original_num_docs": 510,
526
+ "effective_num_docs": 510,
527
+ "trust_dataset": true,
528
+ "must_remove_duplicate_docs": null
529
+ },
530
+ "lighteval|agieval:lsat-rc": {
531
+ "name": "agieval:lsat-rc",
532
+ "prompt_function": "agieval",
533
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
534
+ "hf_subset": "default",
535
+ "metric": [
536
+ "loglikelihood_acc",
537
+ "loglikelihood_acc_norm_nospace"
538
+ ],
539
+ "hf_avail_splits": [
540
+ "test"
541
+ ],
542
+ "evaluation_splits": [
543
+ "test"
544
+ ],
545
+ "few_shots_split": null,
546
+ "few_shots_select": "random_sampling",
547
+ "generation_size": 1,
548
+ "stop_sequence": null,
549
+ "output_regex": null,
550
+ "frozen": false,
551
+ "suite": [
552
+ "lighteval"
553
+ ],
554
+ "original_num_docs": 269,
555
+ "effective_num_docs": 269,
556
+ "trust_dataset": true,
557
+ "must_remove_duplicate_docs": null
558
+ },
559
+ "lighteval|agieval:sat-en": {
560
+ "name": "agieval:sat-en",
561
+ "prompt_function": "agieval",
562
+ "hf_repo": "dmayhem93/agieval-sat-en",
563
+ "hf_subset": "default",
564
+ "metric": [
565
+ "loglikelihood_acc",
566
+ "loglikelihood_acc_norm_nospace"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": "random_sampling",
576
+ "generation_size": 1,
577
+ "stop_sequence": null,
578
+ "output_regex": null,
579
+ "frozen": false,
580
+ "suite": [
581
+ "lighteval"
582
+ ],
583
+ "original_num_docs": 206,
584
+ "effective_num_docs": 206,
585
+ "trust_dataset": true,
586
+ "must_remove_duplicate_docs": null
587
+ },
588
+ "lighteval|agieval:sat-en-without-passage": {
589
+ "name": "agieval:sat-en-without-passage",
590
+ "prompt_function": "agieval",
591
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
592
+ "hf_subset": "default",
593
+ "metric": [
594
+ "loglikelihood_acc",
595
+ "loglikelihood_acc_norm_nospace"
596
+ ],
597
+ "hf_avail_splits": [
598
+ "test"
599
+ ],
600
+ "evaluation_splits": [
601
+ "test"
602
+ ],
603
+ "few_shots_split": null,
604
+ "few_shots_select": "random_sampling",
605
+ "generation_size": 1,
606
+ "stop_sequence": null,
607
+ "output_regex": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null
616
+ },
617
+ "lighteval|agieval:sat-math": {
618
+ "name": "agieval:sat-math",
619
+ "prompt_function": "agieval",
620
+ "hf_repo": "dmayhem93/agieval-sat-math",
621
+ "hf_subset": "default",
622
+ "metric": [
623
+ "loglikelihood_acc",
624
+ "loglikelihood_acc_norm_nospace"
625
+ ],
626
+ "hf_avail_splits": [
627
+ "test"
628
+ ],
629
+ "evaluation_splits": [
630
+ "test"
631
+ ],
632
+ "few_shots_split": null,
633
+ "few_shots_select": "random_sampling",
634
+ "generation_size": 1,
635
+ "stop_sequence": null,
636
+ "output_regex": null,
637
+ "frozen": false,
638
+ "suite": [
639
+ "lighteval"
640
+ ],
641
+ "original_num_docs": 220,
642
+ "effective_num_docs": 220,
643
+ "trust_dataset": true,
644
+ "must_remove_duplicate_docs": null
645
+ }
646
+ },
647
+ "summary_tasks": {
648
+ "lighteval|agieval:aqua-rat|0": {
649
+ "hashes": {
650
+ "hash_examples": "f09607f69e5b7525",
651
+ "hash_full_prompts": "8ad711fb8fb77d94",
652
+ "hash_input_tokens": "3ca91e227e1f5ee8",
653
+ "hash_cont_tokens": "68076809549d1d3f"
654
+ },
655
+ "truncated": 0,
656
+ "non_truncated": 254,
657
+ "padded": 1270,
658
+ "non_padded": 0,
659
+ "effective_few_shots": 0.0,
660
+ "num_truncated_few_shots": 0
661
+ },
662
+ "lighteval|agieval:gaokao-biology|0": {
663
+ "hashes": {
664
+ "hash_examples": "f262eaf4a72db963",
665
+ "hash_full_prompts": "33999b9b989424c2",
666
+ "hash_input_tokens": "8bf36c825bbcbace",
667
+ "hash_cont_tokens": "6e251426994bce31"
668
+ },
669
+ "truncated": 0,
670
+ "non_truncated": 210,
671
+ "padded": 833,
672
+ "non_padded": 7,
673
+ "effective_few_shots": 0.0,
674
+ "num_truncated_few_shots": 0
675
+ },
676
+ "lighteval|agieval:gaokao-chemistry|0": {
677
+ "hashes": {
678
+ "hash_examples": "47f2e649f58d9da5",
679
+ "hash_full_prompts": "7d7d09f9eb879955",
680
+ "hash_input_tokens": "8f96a029d955595e",
681
+ "hash_cont_tokens": "4871f0178c1adc9e"
682
+ },
683
+ "truncated": 0,
684
+ "non_truncated": 207,
685
+ "padded": 827,
686
+ "non_padded": 4,
687
+ "effective_few_shots": 0.0,
688
+ "num_truncated_few_shots": 0
689
+ },
690
+ "lighteval|agieval:gaokao-chinese|0": {
691
+ "hashes": {
692
+ "hash_examples": "1010b21fde4726ab",
693
+ "hash_full_prompts": "60c65321d5cc8691",
694
+ "hash_input_tokens": "e71306cf9d3a3034",
695
+ "hash_cont_tokens": "0be39ceaef751ad1"
696
+ },
697
+ "truncated": 0,
698
+ "non_truncated": 246,
699
+ "padded": 979,
700
+ "non_padded": 5,
701
+ "effective_few_shots": 0.0,
702
+ "num_truncated_few_shots": 0
703
+ },
704
+ "lighteval|agieval:gaokao-english|0": {
705
+ "hashes": {
706
+ "hash_examples": "4864e492a350ae93",
707
+ "hash_full_prompts": "65c242dcefa907ed",
708
+ "hash_input_tokens": "f8bfdf5178b55ada",
709
+ "hash_cont_tokens": "689c6ea1272771f8"
710
+ },
711
+ "truncated": 0,
712
+ "non_truncated": 306,
713
+ "padded": 1224,
714
+ "non_padded": 0,
715
+ "effective_few_shots": 0.0,
716
+ "num_truncated_few_shots": 0
717
+ },
718
+ "lighteval|agieval:gaokao-geography|0": {
719
+ "hashes": {
720
+ "hash_examples": "ec3a021e37650e7d",
721
+ "hash_full_prompts": "d66698e42924982f",
722
+ "hash_input_tokens": "4df6fd6f05dd97b2",
723
+ "hash_cont_tokens": "fc7dd10486347853"
724
+ },
725
+ "truncated": 0,
726
+ "non_truncated": 199,
727
+ "padded": 793,
728
+ "non_padded": 3,
729
+ "effective_few_shots": 0.0,
730
+ "num_truncated_few_shots": 0
731
+ },
732
+ "lighteval|agieval:gaokao-history|0": {
733
+ "hashes": {
734
+ "hash_examples": "b3fad1596f1ae1f9",
735
+ "hash_full_prompts": "b688278c6bb839a5",
736
+ "hash_input_tokens": "bf01c4ae7bccfb50",
737
+ "hash_cont_tokens": "94a46bd09bc7f9a7"
738
+ },
739
+ "truncated": 0,
740
+ "non_truncated": 235,
741
+ "padded": 934,
742
+ "non_padded": 6,
743
+ "effective_few_shots": 0.0,
744
+ "num_truncated_few_shots": 0
745
+ },
746
+ "lighteval|agieval:gaokao-mathqa|0": {
747
+ "hashes": {
748
+ "hash_examples": "1d1088556861b0b0",
749
+ "hash_full_prompts": "a5f205e77d0a99c2",
750
+ "hash_input_tokens": "e955f3458ac121d3",
751
+ "hash_cont_tokens": "d19aa287b771e823"
752
+ },
753
+ "truncated": 0,
754
+ "non_truncated": 351,
755
+ "padded": 1396,
756
+ "non_padded": 8,
757
+ "effective_few_shots": 0.0,
758
+ "num_truncated_few_shots": 0
759
+ },
760
+ "lighteval|agieval:gaokao-physics|0": {
761
+ "hashes": {
762
+ "hash_examples": "eb05f035c7bfca2f",
763
+ "hash_full_prompts": "408cff052da9e941",
764
+ "hash_input_tokens": "240bc1cf51689ef3",
765
+ "hash_cont_tokens": "78f68d934030f9d6"
766
+ },
767
+ "truncated": 0,
768
+ "non_truncated": 200,
769
+ "padded": 792,
770
+ "non_padded": 8,
771
+ "effective_few_shots": 0.0,
772
+ "num_truncated_few_shots": 0
773
+ },
774
+ "lighteval|agieval:logiqa-en|0": {
775
+ "hashes": {
776
+ "hash_examples": "0a688a45f69c21e0",
777
+ "hash_full_prompts": "4abde5e3d4c7b3f8",
778
+ "hash_input_tokens": "b49fe92bef71b0db",
779
+ "hash_cont_tokens": "78b51c7dda5a457a"
780
+ },
781
+ "truncated": 0,
782
+ "non_truncated": 651,
783
+ "padded": 2592,
784
+ "non_padded": 12,
785
+ "effective_few_shots": 0.0,
786
+ "num_truncated_few_shots": 0
787
+ },
788
+ "lighteval|agieval:logiqa-zh|0": {
789
+ "hashes": {
790
+ "hash_examples": "620d6888b6012ea5",
791
+ "hash_full_prompts": "dc4ddbb9c38aef67",
792
+ "hash_input_tokens": "2eef4f372fdad7b7",
793
+ "hash_cont_tokens": "33ab0a1feb3a2fed"
794
+ },
795
+ "truncated": 0,
796
+ "non_truncated": 651,
797
+ "padded": 2588,
798
+ "non_padded": 16,
799
+ "effective_few_shots": 0.0,
800
+ "num_truncated_few_shots": 0
801
+ },
802
+ "lighteval|agieval:lsat-ar|0": {
803
+ "hashes": {
804
+ "hash_examples": "627c8f5ccd5da209",
805
+ "hash_full_prompts": "85f4e2da30c42407",
806
+ "hash_input_tokens": "b8ef0c74c45ff792",
807
+ "hash_cont_tokens": "5a4d3fed21889b2c"
808
+ },
809
+ "truncated": 0,
810
+ "non_truncated": 230,
811
+ "padded": 1145,
812
+ "non_padded": 5,
813
+ "effective_few_shots": 0.0,
814
+ "num_truncated_few_shots": 0
815
+ },
816
+ "lighteval|agieval:lsat-lr|0": {
817
+ "hashes": {
818
+ "hash_examples": "794641c86de172f5",
819
+ "hash_full_prompts": "833dd104fba3a50a",
820
+ "hash_input_tokens": "8bcb7119e593166c",
821
+ "hash_cont_tokens": "449ff43e1d759ce9"
822
+ },
823
+ "truncated": 0,
824
+ "non_truncated": 510,
825
+ "padded": 2537,
826
+ "non_padded": 13,
827
+ "effective_few_shots": 0.0,
828
+ "num_truncated_few_shots": 0
829
+ },
830
+ "lighteval|agieval:lsat-rc|0": {
831
+ "hashes": {
832
+ "hash_examples": "35981ed917ea01cf",
833
+ "hash_full_prompts": "1f6ee9e7b383dee6",
834
+ "hash_input_tokens": "0e5dcc5d9ba3acb4",
835
+ "hash_cont_tokens": "3c2d8acf3e02c384"
836
+ },
837
+ "truncated": 0,
838
+ "non_truncated": 269,
839
+ "padded": 1345,
840
+ "non_padded": 0,
841
+ "effective_few_shots": 0.0,
842
+ "num_truncated_few_shots": 0
843
+ },
844
+ "lighteval|agieval:sat-en|0": {
845
+ "hashes": {
846
+ "hash_examples": "041c39c646536a1e",
847
+ "hash_full_prompts": "dc367ba3deb69c80",
848
+ "hash_input_tokens": "8f6bcc50dd80c45f",
849
+ "hash_cont_tokens": "a6f1ab815d02f06d"
850
+ },
851
+ "truncated": 0,
852
+ "non_truncated": 206,
853
+ "padded": 821,
854
+ "non_padded": 0,
855
+ "effective_few_shots": 0.0,
856
+ "num_truncated_few_shots": 0
857
+ },
858
+ "lighteval|agieval:sat-en-without-passage|0": {
859
+ "hashes": {
860
+ "hash_examples": "e4d9284367dff68f",
861
+ "hash_full_prompts": "dc84a50c7cf1b0da",
862
+ "hash_input_tokens": "c63a20ff70e10200",
863
+ "hash_cont_tokens": "a6f1ab815d02f06d"
864
+ },
865
+ "truncated": 0,
866
+ "non_truncated": 206,
867
+ "padded": 811,
868
+ "non_padded": 10,
869
+ "effective_few_shots": 0.0,
870
+ "num_truncated_few_shots": 0
871
+ },
872
+ "lighteval|agieval:sat-math|0": {
873
+ "hashes": {
874
+ "hash_examples": "01db7291603fc1a0",
875
+ "hash_full_prompts": "fd1ffdeea790b637",
876
+ "hash_input_tokens": "7d57211e46d19552",
877
+ "hash_cont_tokens": "51a49f5633293d60"
878
+ },
879
+ "truncated": 0,
880
+ "non_truncated": 220,
881
+ "padded": 875,
882
+ "non_padded": 5,
883
+ "effective_few_shots": 0.0,
884
+ "num_truncated_few_shots": 0
885
+ }
886
+ },
887
+ "summary_general": {
888
+ "hashes": {
889
+ "hash_examples": "da3af66181f18ddf",
890
+ "hash_full_prompts": "6cd70cd84b2d90cb",
891
+ "hash_input_tokens": "2c432ff3076aeb5a",
892
+ "hash_cont_tokens": "a3082c4ed8f63f0b"
893
+ },
894
+ "truncated": 0,
895
+ "non_truncated": 5151,
896
+ "padded": 21762,
897
+ "non_padded": 102,
898
+ "num_truncated_few_shots": 0
899
+ }
900
+ }