lewtun HF staff commited on
Commit
efcd6d9
·
verified ·
1 Parent(s): 4d8d0e1

Upload eval_results/Nexusflow/Starling-LM-7B-beta/main/agieval/results_2024-03-28T19-47-30.715264.json with huggingface_hub

Browse files
eval_results/Nexusflow/Starling-LM-7B-beta/main/agieval/results_2024-03-28T19-47-30.715264.json ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1769919.296175537,
9
+ "end_time": 1770205.166829818,
10
+ "total_evaluation_time_secondes": "285.87065428099595",
11
+ "model_name": "Nexusflow/Starling-LM-7B-beta",
12
+ "model_sha": "aa21e7f117d41f9463b6d48d2e127bbf2e93256d",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.61 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.2677165354330709,
20
+ "acc_stderr": 0.027836648866445348,
21
+ "acc_norm": 0.28346456692913385,
22
+ "acc_norm_stderr": 0.02833400492130763
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.3476190476190476,
26
+ "acc_stderr": 0.032940430891650836,
27
+ "acc_norm": 0.38095238095238093,
28
+ "acc_norm_stderr": 0.03359110046749991
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.3188405797101449,
32
+ "acc_stderr": 0.03246964709878484,
33
+ "acc_norm": 0.2995169082125604,
34
+ "acc_norm_stderr": 0.031913606824066625
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.2682926829268293,
38
+ "acc_stderr": 0.028306754023121848,
39
+ "acc_norm": 0.2601626016260163,
40
+ "acc_norm_stderr": 0.028028995361669366
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.6437908496732027,
44
+ "acc_stderr": 0.027420477662629252,
45
+ "acc_norm": 0.6339869281045751,
46
+ "acc_norm_stderr": 0.027582811415159614
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.47738693467336685,
50
+ "acc_stderr": 0.03549709401084567,
51
+ "acc_norm": 0.4723618090452261,
52
+ "acc_norm_stderr": 0.035479125346565575
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.46382978723404256,
56
+ "acc_stderr": 0.03260038511835771,
57
+ "acc_norm": 0.4595744680851064,
58
+ "acc_norm_stderr": 0.03257901482099834
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.29914529914529914,
62
+ "acc_stderr": 0.02447490780047233,
63
+ "acc_norm": 0.2905982905982906,
64
+ "acc_norm_stderr": 0.024269376594480006
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.355,
68
+ "acc_stderr": 0.033920910080708536,
69
+ "acc_norm": 0.365,
70
+ "acc_norm_stderr": 0.03412767927155775
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.3486943164362519,
74
+ "acc_stderr": 0.018692104055797926,
75
+ "acc_norm": 0.3655913978494624,
76
+ "acc_norm_stderr": 0.01888973378759108
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.3778801843317972,
80
+ "acc_stderr": 0.01901767399112105,
81
+ "acc_norm": 0.3901689708141321,
82
+ "acc_norm_stderr": 0.019132619951195383
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.1956521739130435,
86
+ "acc_stderr": 0.026214799709819596,
87
+ "acc_norm": 0.19130434782608696,
88
+ "acc_norm_stderr": 0.025991852462828483
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.48823529411764705,
92
+ "acc_stderr": 0.02215597466931114,
93
+ "acc_norm": 0.4568627450980392,
94
+ "acc_norm_stderr": 0.022079476786510514
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.5278810408921933,
98
+ "acc_stderr": 0.03049483976158836,
99
+ "acc_norm": 0.483271375464684,
100
+ "acc_norm_stderr": 0.030525261933744594
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.7330097087378641,
104
+ "acc_stderr": 0.030897665523458157,
105
+ "acc_norm": 0.6747572815533981,
106
+ "acc_norm_stderr": 0.0327190473759639
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.4077669902912621,
110
+ "acc_stderr": 0.03432222290260264,
111
+ "acc_norm": 0.3737864077669903,
112
+ "acc_norm_stderr": 0.03379061271978838
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.33636363636363636,
116
+ "acc_stderr": 0.03192622349349312,
117
+ "acc_norm": 0.3,
118
+ "acc_norm_stderr": 0.030966176864266674
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.40335912126462936,
122
+ "acc_stderr": 0.028775809391776958,
123
+ "acc_norm": 0.39302120470153423,
124
+ "acc_norm_stderr": 0.028823558641481985
125
+ },
126
+ "all": {
127
+ "acc": 0.40335912126462936,
128
+ "acc_stderr": 0.028775809391776958,
129
+ "acc_norm": 0.39302120470153423,
130
+ "acc_norm_stderr": 0.028823558641481985
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "frozen": false,
174
+ "suite": [
175
+ "lighteval"
176
+ ],
177
+ "original_num_docs": 254,
178
+ "effective_num_docs": 254,
179
+ "trust_dataset": true,
180
+ "must_remove_duplicate_docs": null
181
+ },
182
+ "lighteval|agieval:gaokao-biology": {
183
+ "name": "agieval:gaokao-biology",
184
+ "prompt_function": "agieval",
185
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
186
+ "hf_subset": "default",
187
+ "metric": [
188
+ "loglikelihood_acc",
189
+ "loglikelihood_acc_norm_nospace"
190
+ ],
191
+ "hf_avail_splits": [
192
+ "test"
193
+ ],
194
+ "evaluation_splits": [
195
+ "test"
196
+ ],
197
+ "few_shots_split": null,
198
+ "few_shots_select": "random_sampling",
199
+ "generation_size": 1,
200
+ "stop_sequence": null,
201
+ "output_regex": null,
202
+ "frozen": false,
203
+ "suite": [
204
+ "lighteval"
205
+ ],
206
+ "original_num_docs": 210,
207
+ "effective_num_docs": 210,
208
+ "trust_dataset": true,
209
+ "must_remove_duplicate_docs": null
210
+ },
211
+ "lighteval|agieval:gaokao-chemistry": {
212
+ "name": "agieval:gaokao-chemistry",
213
+ "prompt_function": "agieval",
214
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
215
+ "hf_subset": "default",
216
+ "metric": [
217
+ "loglikelihood_acc",
218
+ "loglikelihood_acc_norm_nospace"
219
+ ],
220
+ "hf_avail_splits": [
221
+ "test"
222
+ ],
223
+ "evaluation_splits": [
224
+ "test"
225
+ ],
226
+ "few_shots_split": null,
227
+ "few_shots_select": "random_sampling",
228
+ "generation_size": 1,
229
+ "stop_sequence": null,
230
+ "output_regex": null,
231
+ "frozen": false,
232
+ "suite": [
233
+ "lighteval"
234
+ ],
235
+ "original_num_docs": 207,
236
+ "effective_num_docs": 207,
237
+ "trust_dataset": true,
238
+ "must_remove_duplicate_docs": null
239
+ },
240
+ "lighteval|agieval:gaokao-chinese": {
241
+ "name": "agieval:gaokao-chinese",
242
+ "prompt_function": "agieval",
243
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
244
+ "hf_subset": "default",
245
+ "metric": [
246
+ "loglikelihood_acc",
247
+ "loglikelihood_acc_norm_nospace"
248
+ ],
249
+ "hf_avail_splits": [
250
+ "test"
251
+ ],
252
+ "evaluation_splits": [
253
+ "test"
254
+ ],
255
+ "few_shots_split": null,
256
+ "few_shots_select": "random_sampling",
257
+ "generation_size": 1,
258
+ "stop_sequence": null,
259
+ "output_regex": null,
260
+ "frozen": false,
261
+ "suite": [
262
+ "lighteval"
263
+ ],
264
+ "original_num_docs": 246,
265
+ "effective_num_docs": 246,
266
+ "trust_dataset": true,
267
+ "must_remove_duplicate_docs": null
268
+ },
269
+ "lighteval|agieval:gaokao-english": {
270
+ "name": "agieval:gaokao-english",
271
+ "prompt_function": "agieval",
272
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
273
+ "hf_subset": "default",
274
+ "metric": [
275
+ "loglikelihood_acc",
276
+ "loglikelihood_acc_norm_nospace"
277
+ ],
278
+ "hf_avail_splits": [
279
+ "test"
280
+ ],
281
+ "evaluation_splits": [
282
+ "test"
283
+ ],
284
+ "few_shots_split": null,
285
+ "few_shots_select": "random_sampling",
286
+ "generation_size": 1,
287
+ "stop_sequence": null,
288
+ "output_regex": null,
289
+ "frozen": false,
290
+ "suite": [
291
+ "lighteval"
292
+ ],
293
+ "original_num_docs": 306,
294
+ "effective_num_docs": 306,
295
+ "trust_dataset": true,
296
+ "must_remove_duplicate_docs": null
297
+ },
298
+ "lighteval|agieval:gaokao-geography": {
299
+ "name": "agieval:gaokao-geography",
300
+ "prompt_function": "agieval",
301
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
302
+ "hf_subset": "default",
303
+ "metric": [
304
+ "loglikelihood_acc",
305
+ "loglikelihood_acc_norm_nospace"
306
+ ],
307
+ "hf_avail_splits": [
308
+ "test"
309
+ ],
310
+ "evaluation_splits": [
311
+ "test"
312
+ ],
313
+ "few_shots_split": null,
314
+ "few_shots_select": "random_sampling",
315
+ "generation_size": 1,
316
+ "stop_sequence": null,
317
+ "output_regex": null,
318
+ "frozen": false,
319
+ "suite": [
320
+ "lighteval"
321
+ ],
322
+ "original_num_docs": 199,
323
+ "effective_num_docs": 199,
324
+ "trust_dataset": true,
325
+ "must_remove_duplicate_docs": null
326
+ },
327
+ "lighteval|agieval:gaokao-history": {
328
+ "name": "agieval:gaokao-history",
329
+ "prompt_function": "agieval",
330
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
331
+ "hf_subset": "default",
332
+ "metric": [
333
+ "loglikelihood_acc",
334
+ "loglikelihood_acc_norm_nospace"
335
+ ],
336
+ "hf_avail_splits": [
337
+ "test"
338
+ ],
339
+ "evaluation_splits": [
340
+ "test"
341
+ ],
342
+ "few_shots_split": null,
343
+ "few_shots_select": "random_sampling",
344
+ "generation_size": 1,
345
+ "stop_sequence": null,
346
+ "output_regex": null,
347
+ "frozen": false,
348
+ "suite": [
349
+ "lighteval"
350
+ ],
351
+ "original_num_docs": 235,
352
+ "effective_num_docs": 235,
353
+ "trust_dataset": true,
354
+ "must_remove_duplicate_docs": null
355
+ },
356
+ "lighteval|agieval:gaokao-mathqa": {
357
+ "name": "agieval:gaokao-mathqa",
358
+ "prompt_function": "agieval",
359
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
360
+ "hf_subset": "default",
361
+ "metric": [
362
+ "loglikelihood_acc",
363
+ "loglikelihood_acc_norm_nospace"
364
+ ],
365
+ "hf_avail_splits": [
366
+ "test"
367
+ ],
368
+ "evaluation_splits": [
369
+ "test"
370
+ ],
371
+ "few_shots_split": null,
372
+ "few_shots_select": "random_sampling",
373
+ "generation_size": 1,
374
+ "stop_sequence": null,
375
+ "output_regex": null,
376
+ "frozen": false,
377
+ "suite": [
378
+ "lighteval"
379
+ ],
380
+ "original_num_docs": 351,
381
+ "effective_num_docs": 351,
382
+ "trust_dataset": true,
383
+ "must_remove_duplicate_docs": null
384
+ },
385
+ "lighteval|agieval:gaokao-physics": {
386
+ "name": "agieval:gaokao-physics",
387
+ "prompt_function": "agieval",
388
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
389
+ "hf_subset": "default",
390
+ "metric": [
391
+ "loglikelihood_acc",
392
+ "loglikelihood_acc_norm_nospace"
393
+ ],
394
+ "hf_avail_splits": [
395
+ "test"
396
+ ],
397
+ "evaluation_splits": [
398
+ "test"
399
+ ],
400
+ "few_shots_split": null,
401
+ "few_shots_select": "random_sampling",
402
+ "generation_size": 1,
403
+ "stop_sequence": null,
404
+ "output_regex": null,
405
+ "frozen": false,
406
+ "suite": [
407
+ "lighteval"
408
+ ],
409
+ "original_num_docs": 200,
410
+ "effective_num_docs": 200,
411
+ "trust_dataset": true,
412
+ "must_remove_duplicate_docs": null
413
+ },
414
+ "lighteval|agieval:logiqa-en": {
415
+ "name": "agieval:logiqa-en",
416
+ "prompt_function": "agieval",
417
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
418
+ "hf_subset": "default",
419
+ "metric": [
420
+ "loglikelihood_acc",
421
+ "loglikelihood_acc_norm_nospace"
422
+ ],
423
+ "hf_avail_splits": [
424
+ "test"
425
+ ],
426
+ "evaluation_splits": [
427
+ "test"
428
+ ],
429
+ "few_shots_split": null,
430
+ "few_shots_select": "random_sampling",
431
+ "generation_size": 1,
432
+ "stop_sequence": null,
433
+ "output_regex": null,
434
+ "frozen": false,
435
+ "suite": [
436
+ "lighteval"
437
+ ],
438
+ "original_num_docs": 651,
439
+ "effective_num_docs": 651,
440
+ "trust_dataset": true,
441
+ "must_remove_duplicate_docs": null
442
+ },
443
+ "lighteval|agieval:logiqa-zh": {
444
+ "name": "agieval:logiqa-zh",
445
+ "prompt_function": "agieval",
446
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
447
+ "hf_subset": "default",
448
+ "metric": [
449
+ "loglikelihood_acc",
450
+ "loglikelihood_acc_norm_nospace"
451
+ ],
452
+ "hf_avail_splits": [
453
+ "test"
454
+ ],
455
+ "evaluation_splits": [
456
+ "test"
457
+ ],
458
+ "few_shots_split": null,
459
+ "few_shots_select": "random_sampling",
460
+ "generation_size": 1,
461
+ "stop_sequence": null,
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 651,
468
+ "effective_num_docs": 651,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|agieval:lsat-ar": {
473
+ "name": "agieval:lsat-ar",
474
+ "prompt_function": "agieval",
475
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
476
+ "hf_subset": "default",
477
+ "metric": [
478
+ "loglikelihood_acc",
479
+ "loglikelihood_acc_norm_nospace"
480
+ ],
481
+ "hf_avail_splits": [
482
+ "test"
483
+ ],
484
+ "evaluation_splits": [
485
+ "test"
486
+ ],
487
+ "few_shots_split": null,
488
+ "few_shots_select": "random_sampling",
489
+ "generation_size": 1,
490
+ "stop_sequence": null,
491
+ "output_regex": null,
492
+ "frozen": false,
493
+ "suite": [
494
+ "lighteval"
495
+ ],
496
+ "original_num_docs": 230,
497
+ "effective_num_docs": 230,
498
+ "trust_dataset": true,
499
+ "must_remove_duplicate_docs": null
500
+ },
501
+ "lighteval|agieval:lsat-lr": {
502
+ "name": "agieval:lsat-lr",
503
+ "prompt_function": "agieval",
504
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
505
+ "hf_subset": "default",
506
+ "metric": [
507
+ "loglikelihood_acc",
508
+ "loglikelihood_acc_norm_nospace"
509
+ ],
510
+ "hf_avail_splits": [
511
+ "test"
512
+ ],
513
+ "evaluation_splits": [
514
+ "test"
515
+ ],
516
+ "few_shots_split": null,
517
+ "few_shots_select": "random_sampling",
518
+ "generation_size": 1,
519
+ "stop_sequence": null,
520
+ "output_regex": null,
521
+ "frozen": false,
522
+ "suite": [
523
+ "lighteval"
524
+ ],
525
+ "original_num_docs": 510,
526
+ "effective_num_docs": 510,
527
+ "trust_dataset": true,
528
+ "must_remove_duplicate_docs": null
529
+ },
530
+ "lighteval|agieval:lsat-rc": {
531
+ "name": "agieval:lsat-rc",
532
+ "prompt_function": "agieval",
533
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
534
+ "hf_subset": "default",
535
+ "metric": [
536
+ "loglikelihood_acc",
537
+ "loglikelihood_acc_norm_nospace"
538
+ ],
539
+ "hf_avail_splits": [
540
+ "test"
541
+ ],
542
+ "evaluation_splits": [
543
+ "test"
544
+ ],
545
+ "few_shots_split": null,
546
+ "few_shots_select": "random_sampling",
547
+ "generation_size": 1,
548
+ "stop_sequence": null,
549
+ "output_regex": null,
550
+ "frozen": false,
551
+ "suite": [
552
+ "lighteval"
553
+ ],
554
+ "original_num_docs": 269,
555
+ "effective_num_docs": 269,
556
+ "trust_dataset": true,
557
+ "must_remove_duplicate_docs": null
558
+ },
559
+ "lighteval|agieval:sat-en": {
560
+ "name": "agieval:sat-en",
561
+ "prompt_function": "agieval",
562
+ "hf_repo": "dmayhem93/agieval-sat-en",
563
+ "hf_subset": "default",
564
+ "metric": [
565
+ "loglikelihood_acc",
566
+ "loglikelihood_acc_norm_nospace"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": "random_sampling",
576
+ "generation_size": 1,
577
+ "stop_sequence": null,
578
+ "output_regex": null,
579
+ "frozen": false,
580
+ "suite": [
581
+ "lighteval"
582
+ ],
583
+ "original_num_docs": 206,
584
+ "effective_num_docs": 206,
585
+ "trust_dataset": true,
586
+ "must_remove_duplicate_docs": null
587
+ },
588
+ "lighteval|agieval:sat-en-without-passage": {
589
+ "name": "agieval:sat-en-without-passage",
590
+ "prompt_function": "agieval",
591
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
592
+ "hf_subset": "default",
593
+ "metric": [
594
+ "loglikelihood_acc",
595
+ "loglikelihood_acc_norm_nospace"
596
+ ],
597
+ "hf_avail_splits": [
598
+ "test"
599
+ ],
600
+ "evaluation_splits": [
601
+ "test"
602
+ ],
603
+ "few_shots_split": null,
604
+ "few_shots_select": "random_sampling",
605
+ "generation_size": 1,
606
+ "stop_sequence": null,
607
+ "output_regex": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null
616
+ },
617
+ "lighteval|agieval:sat-math": {
618
+ "name": "agieval:sat-math",
619
+ "prompt_function": "agieval",
620
+ "hf_repo": "dmayhem93/agieval-sat-math",
621
+ "hf_subset": "default",
622
+ "metric": [
623
+ "loglikelihood_acc",
624
+ "loglikelihood_acc_norm_nospace"
625
+ ],
626
+ "hf_avail_splits": [
627
+ "test"
628
+ ],
629
+ "evaluation_splits": [
630
+ "test"
631
+ ],
632
+ "few_shots_split": null,
633
+ "few_shots_select": "random_sampling",
634
+ "generation_size": 1,
635
+ "stop_sequence": null,
636
+ "output_regex": null,
637
+ "frozen": false,
638
+ "suite": [
639
+ "lighteval"
640
+ ],
641
+ "original_num_docs": 220,
642
+ "effective_num_docs": 220,
643
+ "trust_dataset": true,
644
+ "must_remove_duplicate_docs": null
645
+ }
646
+ },
647
+ "summary_tasks": {
648
+ "lighteval|agieval:aqua-rat|0": {
649
+ "hashes": {
650
+ "hash_examples": "f09607f69e5b7525",
651
+ "hash_full_prompts": "c885824d200848ff",
652
+ "hash_input_tokens": "bbc9473dbbe6f16c",
653
+ "hash_cont_tokens": "7c2bcf33ed1e5ae3"
654
+ },
655
+ "truncated": 0,
656
+ "non_truncated": 254,
657
+ "padded": 1270,
658
+ "non_padded": 0,
659
+ "effective_few_shots": 0.0,
660
+ "num_truncated_few_shots": 0
661
+ },
662
+ "lighteval|agieval:gaokao-biology|0": {
663
+ "hashes": {
664
+ "hash_examples": "f262eaf4a72db963",
665
+ "hash_full_prompts": "a57b68a06f7c5dad",
666
+ "hash_input_tokens": "ca1fe44a4eaeed8d",
667
+ "hash_cont_tokens": "b3febf85776696e1"
668
+ },
669
+ "truncated": 0,
670
+ "non_truncated": 210,
671
+ "padded": 840,
672
+ "non_padded": 0,
673
+ "effective_few_shots": 0.0,
674
+ "num_truncated_few_shots": 0
675
+ },
676
+ "lighteval|agieval:gaokao-chemistry|0": {
677
+ "hashes": {
678
+ "hash_examples": "47f2e649f58d9da5",
679
+ "hash_full_prompts": "77b311499beca56b",
680
+ "hash_input_tokens": "43896e11a60e81d3",
681
+ "hash_cont_tokens": "711fb41221b6515e"
682
+ },
683
+ "truncated": 0,
684
+ "non_truncated": 207,
685
+ "padded": 830,
686
+ "non_padded": 1,
687
+ "effective_few_shots": 0.0,
688
+ "num_truncated_few_shots": 0
689
+ },
690
+ "lighteval|agieval:gaokao-chinese|0": {
691
+ "hashes": {
692
+ "hash_examples": "1010b21fde4726ab",
693
+ "hash_full_prompts": "765b83dc3fe58e0e",
694
+ "hash_input_tokens": "c2e6ac114faa28b7",
695
+ "hash_cont_tokens": "393b1820b8f4534f"
696
+ },
697
+ "truncated": 0,
698
+ "non_truncated": 246,
699
+ "padded": 981,
700
+ "non_padded": 3,
701
+ "effective_few_shots": 0.0,
702
+ "num_truncated_few_shots": 0
703
+ },
704
+ "lighteval|agieval:gaokao-english|0": {
705
+ "hashes": {
706
+ "hash_examples": "4864e492a350ae93",
707
+ "hash_full_prompts": "2d90914067cbddb2",
708
+ "hash_input_tokens": "19ee1ddb098e0782",
709
+ "hash_cont_tokens": "dd72ceb1d7224598"
710
+ },
711
+ "truncated": 0,
712
+ "non_truncated": 306,
713
+ "padded": 1222,
714
+ "non_padded": 2,
715
+ "effective_few_shots": 0.0,
716
+ "num_truncated_few_shots": 0
717
+ },
718
+ "lighteval|agieval:gaokao-geography|0": {
719
+ "hashes": {
720
+ "hash_examples": "ec3a021e37650e7d",
721
+ "hash_full_prompts": "26abf6b8b06b03fd",
722
+ "hash_input_tokens": "9c63079da9deb525",
723
+ "hash_cont_tokens": "e06462bcee629ea8"
724
+ },
725
+ "truncated": 0,
726
+ "non_truncated": 199,
727
+ "padded": 794,
728
+ "non_padded": 2,
729
+ "effective_few_shots": 0.0,
730
+ "num_truncated_few_shots": 0
731
+ },
732
+ "lighteval|agieval:gaokao-history|0": {
733
+ "hashes": {
734
+ "hash_examples": "b3fad1596f1ae1f9",
735
+ "hash_full_prompts": "d22e901442960499",
736
+ "hash_input_tokens": "cd1d7e17c8e3016d",
737
+ "hash_cont_tokens": "87cce58a4ec6cfd8"
738
+ },
739
+ "truncated": 0,
740
+ "non_truncated": 235,
741
+ "padded": 935,
742
+ "non_padded": 5,
743
+ "effective_few_shots": 0.0,
744
+ "num_truncated_few_shots": 0
745
+ },
746
+ "lighteval|agieval:gaokao-mathqa|0": {
747
+ "hashes": {
748
+ "hash_examples": "1d1088556861b0b0",
749
+ "hash_full_prompts": "8f130ed7923a2e96",
750
+ "hash_input_tokens": "d4e27a91afe699a4",
751
+ "hash_cont_tokens": "771402d59229cbae"
752
+ },
753
+ "truncated": 0,
754
+ "non_truncated": 351,
755
+ "padded": 1401,
756
+ "non_padded": 3,
757
+ "effective_few_shots": 0.0,
758
+ "num_truncated_few_shots": 0
759
+ },
760
+ "lighteval|agieval:gaokao-physics|0": {
761
+ "hashes": {
762
+ "hash_examples": "eb05f035c7bfca2f",
763
+ "hash_full_prompts": "04c26f6aa4512fa0",
764
+ "hash_input_tokens": "c571bd43f31938ce",
765
+ "hash_cont_tokens": "f1574dddbe4231e1"
766
+ },
767
+ "truncated": 0,
768
+ "non_truncated": 200,
769
+ "padded": 797,
770
+ "non_padded": 3,
771
+ "effective_few_shots": 0.0,
772
+ "num_truncated_few_shots": 0
773
+ },
774
+ "lighteval|agieval:logiqa-en|0": {
775
+ "hashes": {
776
+ "hash_examples": "0a688a45f69c21e0",
777
+ "hash_full_prompts": "1e7fa6f1e62bea8b",
778
+ "hash_input_tokens": "c349474caa787f2c",
779
+ "hash_cont_tokens": "106c1564fdd5ff8e"
780
+ },
781
+ "truncated": 0,
782
+ "non_truncated": 651,
783
+ "padded": 2597,
784
+ "non_padded": 7,
785
+ "effective_few_shots": 0.0,
786
+ "num_truncated_few_shots": 0
787
+ },
788
+ "lighteval|agieval:logiqa-zh|0": {
789
+ "hashes": {
790
+ "hash_examples": "620d6888b6012ea5",
791
+ "hash_full_prompts": "f5e4b12f2be2730a",
792
+ "hash_input_tokens": "7443875e78dae670",
793
+ "hash_cont_tokens": "22d3640f03c44bb0"
794
+ },
795
+ "truncated": 0,
796
+ "non_truncated": 651,
797
+ "padded": 2581,
798
+ "non_padded": 23,
799
+ "effective_few_shots": 0.0,
800
+ "num_truncated_few_shots": 0
801
+ },
802
+ "lighteval|agieval:lsat-ar|0": {
803
+ "hashes": {
804
+ "hash_examples": "627c8f5ccd5da209",
805
+ "hash_full_prompts": "93f01bfefd21f036",
806
+ "hash_input_tokens": "cd6e0cee27c3c3e4",
807
+ "hash_cont_tokens": "9e13af9ad8f5f78e"
808
+ },
809
+ "truncated": 0,
810
+ "non_truncated": 230,
811
+ "padded": 1140,
812
+ "non_padded": 10,
813
+ "effective_few_shots": 0.0,
814
+ "num_truncated_few_shots": 0
815
+ },
816
+ "lighteval|agieval:lsat-lr|0": {
817
+ "hashes": {
818
+ "hash_examples": "794641c86de172f5",
819
+ "hash_full_prompts": "8e81a99c7521258c",
820
+ "hash_input_tokens": "09e76ba415ce4540",
821
+ "hash_cont_tokens": "86333847359ceee5"
822
+ },
823
+ "truncated": 0,
824
+ "non_truncated": 510,
825
+ "padded": 2525,
826
+ "non_padded": 25,
827
+ "effective_few_shots": 0.0,
828
+ "num_truncated_few_shots": 0
829
+ },
830
+ "lighteval|agieval:lsat-rc|0": {
831
+ "hashes": {
832
+ "hash_examples": "35981ed917ea01cf",
833
+ "hash_full_prompts": "b6e8d1adf72e9fb4",
834
+ "hash_input_tokens": "78bda8444d556179",
835
+ "hash_cont_tokens": "cb47c6b984067525"
836
+ },
837
+ "truncated": 0,
838
+ "non_truncated": 269,
839
+ "padded": 1345,
840
+ "non_padded": 0,
841
+ "effective_few_shots": 0.0,
842
+ "num_truncated_few_shots": 0
843
+ },
844
+ "lighteval|agieval:sat-en|0": {
845
+ "hashes": {
846
+ "hash_examples": "041c39c646536a1e",
847
+ "hash_full_prompts": "b9081e0a4bf5d97f",
848
+ "hash_input_tokens": "16dc5e2c7544b9f1",
849
+ "hash_cont_tokens": "cb01422bf828aefe"
850
+ },
851
+ "truncated": 0,
852
+ "non_truncated": 206,
853
+ "padded": 821,
854
+ "non_padded": 0,
855
+ "effective_few_shots": 0.0,
856
+ "num_truncated_few_shots": 0
857
+ },
858
+ "lighteval|agieval:sat-en-without-passage|0": {
859
+ "hashes": {
860
+ "hash_examples": "e4d9284367dff68f",
861
+ "hash_full_prompts": "f1362aa3ba355eaa",
862
+ "hash_input_tokens": "c4236dd09d0088d9",
863
+ "hash_cont_tokens": "cb01422bf828aefe"
864
+ },
865
+ "truncated": 0,
866
+ "non_truncated": 206,
867
+ "padded": 817,
868
+ "non_padded": 4,
869
+ "effective_few_shots": 0.0,
870
+ "num_truncated_few_shots": 0
871
+ },
872
+ "lighteval|agieval:sat-math|0": {
873
+ "hashes": {
874
+ "hash_examples": "01db7291603fc1a0",
875
+ "hash_full_prompts": "8dab33ab73deb698",
876
+ "hash_input_tokens": "e4d4e2e413144ff5",
877
+ "hash_cont_tokens": "0c4980b69a75cb83"
878
+ },
879
+ "truncated": 0,
880
+ "non_truncated": 220,
881
+ "padded": 876,
882
+ "non_padded": 4,
883
+ "effective_few_shots": 0.0,
884
+ "num_truncated_few_shots": 0
885
+ }
886
+ },
887
+ "summary_general": {
888
+ "hashes": {
889
+ "hash_examples": "da3af66181f18ddf",
890
+ "hash_full_prompts": "37ef9da1700eed47",
891
+ "hash_input_tokens": "3a23d7f630bcf75b",
892
+ "hash_cont_tokens": "7f70e41f6e93ac40"
893
+ },
894
+ "truncated": 0,
895
+ "non_truncated": 5151,
896
+ "padded": 21772,
897
+ "non_padded": 92,
898
+ "num_truncated_few_shots": 0
899
+ }
900
+ }