lewtun HF Staff commited on
Commit
e5995a8
·
verified ·
1 Parent(s): 047e95d

Upload eval_results/databricks/dbrx-instruct/main/agieval/results_2024-03-30T15-19-26.415183.json with huggingface_hub

Browse files
eval_results/databricks/dbrx-instruct/main/agieval/results_2024-03-30T15-19-26.415183.json ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1449028.270326239,
9
+ "end_time": 1455742.868291307,
10
+ "total_evaluation_time_secondes": "6714.597965067951",
11
+ "model_name": "databricks/dbrx-instruct",
12
+ "model_sha": "3b5d968eab47b0cb5b075fd984612b63f92841c2",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "245.12 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.25196850393700787,
20
+ "acc_stderr": 0.027294353392553605,
21
+ "acc_norm": 0.25984251968503935,
22
+ "acc_norm_stderr": 0.02757127913961099
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.41904761904761906,
26
+ "acc_stderr": 0.03412941259257987,
27
+ "acc_norm": 0.41904761904761906,
28
+ "acc_norm_stderr": 0.034129412592579865
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.40096618357487923,
32
+ "acc_stderr": 0.03414647938499206,
33
+ "acc_norm": 0.3526570048309179,
34
+ "acc_norm_stderr": 0.033289699402134686
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.3943089430894309,
38
+ "acc_stderr": 0.03122200953730326,
39
+ "acc_norm": 0.4065040650406504,
40
+ "acc_norm_stderr": 0.03138038579815636
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.6176470588235294,
44
+ "acc_stderr": 0.027826109307283697,
45
+ "acc_norm": 0.5751633986928104,
46
+ "acc_norm_stderr": 0.02830457667314111
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.6030150753768844,
50
+ "acc_stderr": 0.03477110537378156,
51
+ "acc_norm": 0.592964824120603,
52
+ "acc_norm_stderr": 0.03491385802519052
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.6042553191489362,
56
+ "acc_stderr": 0.031967586978353627,
57
+ "acc_norm": 0.5446808510638298,
58
+ "acc_norm_stderr": 0.03255525359340356
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.2849002849002849,
62
+ "acc_stderr": 0.024126577672411744,
63
+ "acc_norm": 0.2792022792022792,
64
+ "acc_norm_stderr": 0.02397906029914625
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.46,
68
+ "acc_stderr": 0.03533045720097817,
69
+ "acc_norm": 0.445,
70
+ "acc_norm_stderr": 0.0352289710609046
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.3210445468509985,
74
+ "acc_stderr": 0.01831245670147613,
75
+ "acc_norm": 0.34715821812596004,
76
+ "acc_norm_stderr": 0.018672867593776798
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.40860215053763443,
80
+ "acc_stderr": 0.019281175878347604,
81
+ "acc_norm": 0.42242703533026116,
82
+ "acc_norm_stderr": 0.01937414753071922
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.20869565217391303,
86
+ "acc_stderr": 0.026854108265439658,
87
+ "acc_norm": 0.20869565217391303,
88
+ "acc_norm_stderr": 0.026854108265439658
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.48823529411764705,
92
+ "acc_stderr": 0.022155974669311143,
93
+ "acc_norm": 0.3843137254901961,
94
+ "acc_norm_stderr": 0.021560746452830175
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.6133828996282528,
98
+ "acc_stderr": 0.029746711725453,
99
+ "acc_norm": 0.46096654275092935,
100
+ "acc_norm_stderr": 0.030449149512372414
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.7378640776699029,
104
+ "acc_stderr": 0.03071669765614076,
105
+ "acc_norm": 0.5922330097087378,
106
+ "acc_norm_stderr": 0.034322222902602624
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.46601941747572817,
110
+ "acc_stderr": 0.03484077510347999,
111
+ "acc_norm": 0.3786407766990291,
112
+ "acc_norm_stderr": 0.033877248925062636
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.44545454545454544,
116
+ "acc_stderr": 0.03358522134954388,
117
+ "acc_norm": 0.32727272727272727,
118
+ "acc_norm_stderr": 0.0317067966768602
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.4544357395180702,
122
+ "acc_stderr": 0.029194541928789987,
123
+ "acc_norm": 0.41157472054326494,
124
+ "acc_norm_stderr": 0.029304104967290103
125
+ },
126
+ "all": {
127
+ "acc": 0.4544357395180702,
128
+ "acc_stderr": 0.029194541928789987,
129
+ "acc_norm": 0.41157472054326494,
130
+ "acc_norm_stderr": 0.029304104967290103
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "frozen": false,
174
+ "suite": [
175
+ "lighteval"
176
+ ],
177
+ "original_num_docs": 254,
178
+ "effective_num_docs": 254,
179
+ "trust_dataset": true,
180
+ "must_remove_duplicate_docs": null
181
+ },
182
+ "lighteval|agieval:gaokao-biology": {
183
+ "name": "agieval:gaokao-biology",
184
+ "prompt_function": "agieval",
185
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
186
+ "hf_subset": "default",
187
+ "metric": [
188
+ "loglikelihood_acc",
189
+ "loglikelihood_acc_norm_nospace"
190
+ ],
191
+ "hf_avail_splits": [
192
+ "test"
193
+ ],
194
+ "evaluation_splits": [
195
+ "test"
196
+ ],
197
+ "few_shots_split": null,
198
+ "few_shots_select": "random_sampling",
199
+ "generation_size": 1,
200
+ "stop_sequence": null,
201
+ "output_regex": null,
202
+ "frozen": false,
203
+ "suite": [
204
+ "lighteval"
205
+ ],
206
+ "original_num_docs": 210,
207
+ "effective_num_docs": 210,
208
+ "trust_dataset": true,
209
+ "must_remove_duplicate_docs": null
210
+ },
211
+ "lighteval|agieval:gaokao-chemistry": {
212
+ "name": "agieval:gaokao-chemistry",
213
+ "prompt_function": "agieval",
214
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
215
+ "hf_subset": "default",
216
+ "metric": [
217
+ "loglikelihood_acc",
218
+ "loglikelihood_acc_norm_nospace"
219
+ ],
220
+ "hf_avail_splits": [
221
+ "test"
222
+ ],
223
+ "evaluation_splits": [
224
+ "test"
225
+ ],
226
+ "few_shots_split": null,
227
+ "few_shots_select": "random_sampling",
228
+ "generation_size": 1,
229
+ "stop_sequence": null,
230
+ "output_regex": null,
231
+ "frozen": false,
232
+ "suite": [
233
+ "lighteval"
234
+ ],
235
+ "original_num_docs": 207,
236
+ "effective_num_docs": 207,
237
+ "trust_dataset": true,
238
+ "must_remove_duplicate_docs": null
239
+ },
240
+ "lighteval|agieval:gaokao-chinese": {
241
+ "name": "agieval:gaokao-chinese",
242
+ "prompt_function": "agieval",
243
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
244
+ "hf_subset": "default",
245
+ "metric": [
246
+ "loglikelihood_acc",
247
+ "loglikelihood_acc_norm_nospace"
248
+ ],
249
+ "hf_avail_splits": [
250
+ "test"
251
+ ],
252
+ "evaluation_splits": [
253
+ "test"
254
+ ],
255
+ "few_shots_split": null,
256
+ "few_shots_select": "random_sampling",
257
+ "generation_size": 1,
258
+ "stop_sequence": null,
259
+ "output_regex": null,
260
+ "frozen": false,
261
+ "suite": [
262
+ "lighteval"
263
+ ],
264
+ "original_num_docs": 246,
265
+ "effective_num_docs": 246,
266
+ "trust_dataset": true,
267
+ "must_remove_duplicate_docs": null
268
+ },
269
+ "lighteval|agieval:gaokao-english": {
270
+ "name": "agieval:gaokao-english",
271
+ "prompt_function": "agieval",
272
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
273
+ "hf_subset": "default",
274
+ "metric": [
275
+ "loglikelihood_acc",
276
+ "loglikelihood_acc_norm_nospace"
277
+ ],
278
+ "hf_avail_splits": [
279
+ "test"
280
+ ],
281
+ "evaluation_splits": [
282
+ "test"
283
+ ],
284
+ "few_shots_split": null,
285
+ "few_shots_select": "random_sampling",
286
+ "generation_size": 1,
287
+ "stop_sequence": null,
288
+ "output_regex": null,
289
+ "frozen": false,
290
+ "suite": [
291
+ "lighteval"
292
+ ],
293
+ "original_num_docs": 306,
294
+ "effective_num_docs": 306,
295
+ "trust_dataset": true,
296
+ "must_remove_duplicate_docs": null
297
+ },
298
+ "lighteval|agieval:gaokao-geography": {
299
+ "name": "agieval:gaokao-geography",
300
+ "prompt_function": "agieval",
301
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
302
+ "hf_subset": "default",
303
+ "metric": [
304
+ "loglikelihood_acc",
305
+ "loglikelihood_acc_norm_nospace"
306
+ ],
307
+ "hf_avail_splits": [
308
+ "test"
309
+ ],
310
+ "evaluation_splits": [
311
+ "test"
312
+ ],
313
+ "few_shots_split": null,
314
+ "few_shots_select": "random_sampling",
315
+ "generation_size": 1,
316
+ "stop_sequence": null,
317
+ "output_regex": null,
318
+ "frozen": false,
319
+ "suite": [
320
+ "lighteval"
321
+ ],
322
+ "original_num_docs": 199,
323
+ "effective_num_docs": 199,
324
+ "trust_dataset": true,
325
+ "must_remove_duplicate_docs": null
326
+ },
327
+ "lighteval|agieval:gaokao-history": {
328
+ "name": "agieval:gaokao-history",
329
+ "prompt_function": "agieval",
330
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
331
+ "hf_subset": "default",
332
+ "metric": [
333
+ "loglikelihood_acc",
334
+ "loglikelihood_acc_norm_nospace"
335
+ ],
336
+ "hf_avail_splits": [
337
+ "test"
338
+ ],
339
+ "evaluation_splits": [
340
+ "test"
341
+ ],
342
+ "few_shots_split": null,
343
+ "few_shots_select": "random_sampling",
344
+ "generation_size": 1,
345
+ "stop_sequence": null,
346
+ "output_regex": null,
347
+ "frozen": false,
348
+ "suite": [
349
+ "lighteval"
350
+ ],
351
+ "original_num_docs": 235,
352
+ "effective_num_docs": 235,
353
+ "trust_dataset": true,
354
+ "must_remove_duplicate_docs": null
355
+ },
356
+ "lighteval|agieval:gaokao-mathqa": {
357
+ "name": "agieval:gaokao-mathqa",
358
+ "prompt_function": "agieval",
359
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
360
+ "hf_subset": "default",
361
+ "metric": [
362
+ "loglikelihood_acc",
363
+ "loglikelihood_acc_norm_nospace"
364
+ ],
365
+ "hf_avail_splits": [
366
+ "test"
367
+ ],
368
+ "evaluation_splits": [
369
+ "test"
370
+ ],
371
+ "few_shots_split": null,
372
+ "few_shots_select": "random_sampling",
373
+ "generation_size": 1,
374
+ "stop_sequence": null,
375
+ "output_regex": null,
376
+ "frozen": false,
377
+ "suite": [
378
+ "lighteval"
379
+ ],
380
+ "original_num_docs": 351,
381
+ "effective_num_docs": 351,
382
+ "trust_dataset": true,
383
+ "must_remove_duplicate_docs": null
384
+ },
385
+ "lighteval|agieval:gaokao-physics": {
386
+ "name": "agieval:gaokao-physics",
387
+ "prompt_function": "agieval",
388
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
389
+ "hf_subset": "default",
390
+ "metric": [
391
+ "loglikelihood_acc",
392
+ "loglikelihood_acc_norm_nospace"
393
+ ],
394
+ "hf_avail_splits": [
395
+ "test"
396
+ ],
397
+ "evaluation_splits": [
398
+ "test"
399
+ ],
400
+ "few_shots_split": null,
401
+ "few_shots_select": "random_sampling",
402
+ "generation_size": 1,
403
+ "stop_sequence": null,
404
+ "output_regex": null,
405
+ "frozen": false,
406
+ "suite": [
407
+ "lighteval"
408
+ ],
409
+ "original_num_docs": 200,
410
+ "effective_num_docs": 200,
411
+ "trust_dataset": true,
412
+ "must_remove_duplicate_docs": null
413
+ },
414
+ "lighteval|agieval:logiqa-en": {
415
+ "name": "agieval:logiqa-en",
416
+ "prompt_function": "agieval",
417
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
418
+ "hf_subset": "default",
419
+ "metric": [
420
+ "loglikelihood_acc",
421
+ "loglikelihood_acc_norm_nospace"
422
+ ],
423
+ "hf_avail_splits": [
424
+ "test"
425
+ ],
426
+ "evaluation_splits": [
427
+ "test"
428
+ ],
429
+ "few_shots_split": null,
430
+ "few_shots_select": "random_sampling",
431
+ "generation_size": 1,
432
+ "stop_sequence": null,
433
+ "output_regex": null,
434
+ "frozen": false,
435
+ "suite": [
436
+ "lighteval"
437
+ ],
438
+ "original_num_docs": 651,
439
+ "effective_num_docs": 651,
440
+ "trust_dataset": true,
441
+ "must_remove_duplicate_docs": null
442
+ },
443
+ "lighteval|agieval:logiqa-zh": {
444
+ "name": "agieval:logiqa-zh",
445
+ "prompt_function": "agieval",
446
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
447
+ "hf_subset": "default",
448
+ "metric": [
449
+ "loglikelihood_acc",
450
+ "loglikelihood_acc_norm_nospace"
451
+ ],
452
+ "hf_avail_splits": [
453
+ "test"
454
+ ],
455
+ "evaluation_splits": [
456
+ "test"
457
+ ],
458
+ "few_shots_split": null,
459
+ "few_shots_select": "random_sampling",
460
+ "generation_size": 1,
461
+ "stop_sequence": null,
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 651,
468
+ "effective_num_docs": 651,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|agieval:lsat-ar": {
473
+ "name": "agieval:lsat-ar",
474
+ "prompt_function": "agieval",
475
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
476
+ "hf_subset": "default",
477
+ "metric": [
478
+ "loglikelihood_acc",
479
+ "loglikelihood_acc_norm_nospace"
480
+ ],
481
+ "hf_avail_splits": [
482
+ "test"
483
+ ],
484
+ "evaluation_splits": [
485
+ "test"
486
+ ],
487
+ "few_shots_split": null,
488
+ "few_shots_select": "random_sampling",
489
+ "generation_size": 1,
490
+ "stop_sequence": null,
491
+ "output_regex": null,
492
+ "frozen": false,
493
+ "suite": [
494
+ "lighteval"
495
+ ],
496
+ "original_num_docs": 230,
497
+ "effective_num_docs": 230,
498
+ "trust_dataset": true,
499
+ "must_remove_duplicate_docs": null
500
+ },
501
+ "lighteval|agieval:lsat-lr": {
502
+ "name": "agieval:lsat-lr",
503
+ "prompt_function": "agieval",
504
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
505
+ "hf_subset": "default",
506
+ "metric": [
507
+ "loglikelihood_acc",
508
+ "loglikelihood_acc_norm_nospace"
509
+ ],
510
+ "hf_avail_splits": [
511
+ "test"
512
+ ],
513
+ "evaluation_splits": [
514
+ "test"
515
+ ],
516
+ "few_shots_split": null,
517
+ "few_shots_select": "random_sampling",
518
+ "generation_size": 1,
519
+ "stop_sequence": null,
520
+ "output_regex": null,
521
+ "frozen": false,
522
+ "suite": [
523
+ "lighteval"
524
+ ],
525
+ "original_num_docs": 510,
526
+ "effective_num_docs": 510,
527
+ "trust_dataset": true,
528
+ "must_remove_duplicate_docs": null
529
+ },
530
+ "lighteval|agieval:lsat-rc": {
531
+ "name": "agieval:lsat-rc",
532
+ "prompt_function": "agieval",
533
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
534
+ "hf_subset": "default",
535
+ "metric": [
536
+ "loglikelihood_acc",
537
+ "loglikelihood_acc_norm_nospace"
538
+ ],
539
+ "hf_avail_splits": [
540
+ "test"
541
+ ],
542
+ "evaluation_splits": [
543
+ "test"
544
+ ],
545
+ "few_shots_split": null,
546
+ "few_shots_select": "random_sampling",
547
+ "generation_size": 1,
548
+ "stop_sequence": null,
549
+ "output_regex": null,
550
+ "frozen": false,
551
+ "suite": [
552
+ "lighteval"
553
+ ],
554
+ "original_num_docs": 269,
555
+ "effective_num_docs": 269,
556
+ "trust_dataset": true,
557
+ "must_remove_duplicate_docs": null
558
+ },
559
+ "lighteval|agieval:sat-en": {
560
+ "name": "agieval:sat-en",
561
+ "prompt_function": "agieval",
562
+ "hf_repo": "dmayhem93/agieval-sat-en",
563
+ "hf_subset": "default",
564
+ "metric": [
565
+ "loglikelihood_acc",
566
+ "loglikelihood_acc_norm_nospace"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": "random_sampling",
576
+ "generation_size": 1,
577
+ "stop_sequence": null,
578
+ "output_regex": null,
579
+ "frozen": false,
580
+ "suite": [
581
+ "lighteval"
582
+ ],
583
+ "original_num_docs": 206,
584
+ "effective_num_docs": 206,
585
+ "trust_dataset": true,
586
+ "must_remove_duplicate_docs": null
587
+ },
588
+ "lighteval|agieval:sat-en-without-passage": {
589
+ "name": "agieval:sat-en-without-passage",
590
+ "prompt_function": "agieval",
591
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
592
+ "hf_subset": "default",
593
+ "metric": [
594
+ "loglikelihood_acc",
595
+ "loglikelihood_acc_norm_nospace"
596
+ ],
597
+ "hf_avail_splits": [
598
+ "test"
599
+ ],
600
+ "evaluation_splits": [
601
+ "test"
602
+ ],
603
+ "few_shots_split": null,
604
+ "few_shots_select": "random_sampling",
605
+ "generation_size": 1,
606
+ "stop_sequence": null,
607
+ "output_regex": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null
616
+ },
617
+ "lighteval|agieval:sat-math": {
618
+ "name": "agieval:sat-math",
619
+ "prompt_function": "agieval",
620
+ "hf_repo": "dmayhem93/agieval-sat-math",
621
+ "hf_subset": "default",
622
+ "metric": [
623
+ "loglikelihood_acc",
624
+ "loglikelihood_acc_norm_nospace"
625
+ ],
626
+ "hf_avail_splits": [
627
+ "test"
628
+ ],
629
+ "evaluation_splits": [
630
+ "test"
631
+ ],
632
+ "few_shots_split": null,
633
+ "few_shots_select": "random_sampling",
634
+ "generation_size": 1,
635
+ "stop_sequence": null,
636
+ "output_regex": null,
637
+ "frozen": false,
638
+ "suite": [
639
+ "lighteval"
640
+ ],
641
+ "original_num_docs": 220,
642
+ "effective_num_docs": 220,
643
+ "trust_dataset": true,
644
+ "must_remove_duplicate_docs": null
645
+ }
646
+ },
647
+ "summary_tasks": {
648
+ "lighteval|agieval:aqua-rat|0": {
649
+ "hashes": {
650
+ "hash_examples": "f09607f69e5b7525",
651
+ "hash_full_prompts": "2dd019f2a9629d3c",
652
+ "hash_input_tokens": "c7a095b065d480b0",
653
+ "hash_cont_tokens": "9a4c5828e6dabe2e"
654
+ },
655
+ "truncated": 0,
656
+ "non_truncated": 254,
657
+ "padded": 1270,
658
+ "non_padded": 0,
659
+ "effective_few_shots": 0.0,
660
+ "num_truncated_few_shots": 0
661
+ },
662
+ "lighteval|agieval:gaokao-biology|0": {
663
+ "hashes": {
664
+ "hash_examples": "f262eaf4a72db963",
665
+ "hash_full_prompts": "9936cbf7a5bc399f",
666
+ "hash_input_tokens": "28166e19ba7194ee",
667
+ "hash_cont_tokens": "5dc07709ee2eda10"
668
+ },
669
+ "truncated": 0,
670
+ "non_truncated": 210,
671
+ "padded": 840,
672
+ "non_padded": 0,
673
+ "effective_few_shots": 0.0,
674
+ "num_truncated_few_shots": 0
675
+ },
676
+ "lighteval|agieval:gaokao-chemistry|0": {
677
+ "hashes": {
678
+ "hash_examples": "47f2e649f58d9da5",
679
+ "hash_full_prompts": "679f06a0d708b514",
680
+ "hash_input_tokens": "abfd6d291af87b20",
681
+ "hash_cont_tokens": "c02c0b3f0938184c"
682
+ },
683
+ "truncated": 0,
684
+ "non_truncated": 207,
685
+ "padded": 831,
686
+ "non_padded": 0,
687
+ "effective_few_shots": 0.0,
688
+ "num_truncated_few_shots": 0
689
+ },
690
+ "lighteval|agieval:gaokao-chinese|0": {
691
+ "hashes": {
692
+ "hash_examples": "1010b21fde4726ab",
693
+ "hash_full_prompts": "3c905587787709a8",
694
+ "hash_input_tokens": "f800795cf9f4a078",
695
+ "hash_cont_tokens": "885b3626dcebb080"
696
+ },
697
+ "truncated": 0,
698
+ "non_truncated": 246,
699
+ "padded": 982,
700
+ "non_padded": 2,
701
+ "effective_few_shots": 0.0,
702
+ "num_truncated_few_shots": 0
703
+ },
704
+ "lighteval|agieval:gaokao-english|0": {
705
+ "hashes": {
706
+ "hash_examples": "4864e492a350ae93",
707
+ "hash_full_prompts": "72fe2d636b6f6622",
708
+ "hash_input_tokens": "179f671d7cc3b7cc",
709
+ "hash_cont_tokens": "2262620ff400b608"
710
+ },
711
+ "truncated": 0,
712
+ "non_truncated": 306,
713
+ "padded": 1218,
714
+ "non_padded": 6,
715
+ "effective_few_shots": 0.0,
716
+ "num_truncated_few_shots": 0
717
+ },
718
+ "lighteval|agieval:gaokao-geography|0": {
719
+ "hashes": {
720
+ "hash_examples": "ec3a021e37650e7d",
721
+ "hash_full_prompts": "60a42fb81acf5a1e",
722
+ "hash_input_tokens": "55a30b8466c26129",
723
+ "hash_cont_tokens": "480a5c8f86827df4"
724
+ },
725
+ "truncated": 0,
726
+ "non_truncated": 199,
727
+ "padded": 796,
728
+ "non_padded": 0,
729
+ "effective_few_shots": 0.0,
730
+ "num_truncated_few_shots": 0
731
+ },
732
+ "lighteval|agieval:gaokao-history|0": {
733
+ "hashes": {
734
+ "hash_examples": "b3fad1596f1ae1f9",
735
+ "hash_full_prompts": "2c8875804d162da8",
736
+ "hash_input_tokens": "5cdba09c199f23cf",
737
+ "hash_cont_tokens": "e1608c6dd5f55267"
738
+ },
739
+ "truncated": 0,
740
+ "non_truncated": 235,
741
+ "padded": 940,
742
+ "non_padded": 0,
743
+ "effective_few_shots": 0.0,
744
+ "num_truncated_few_shots": 0
745
+ },
746
+ "lighteval|agieval:gaokao-mathqa|0": {
747
+ "hashes": {
748
+ "hash_examples": "1d1088556861b0b0",
749
+ "hash_full_prompts": "c1a6433c211e92a1",
750
+ "hash_input_tokens": "814594787313230d",
751
+ "hash_cont_tokens": "e8cd9aaeeec6a5d4"
752
+ },
753
+ "truncated": 0,
754
+ "non_truncated": 351,
755
+ "padded": 1404,
756
+ "non_padded": 0,
757
+ "effective_few_shots": 0.0,
758
+ "num_truncated_few_shots": 0
759
+ },
760
+ "lighteval|agieval:gaokao-physics|0": {
761
+ "hashes": {
762
+ "hash_examples": "eb05f035c7bfca2f",
763
+ "hash_full_prompts": "4ba90cfb06e848e8",
764
+ "hash_input_tokens": "a96955e66f7210d8",
765
+ "hash_cont_tokens": "b89267e9f865c6b3"
766
+ },
767
+ "truncated": 0,
768
+ "non_truncated": 200,
769
+ "padded": 800,
770
+ "non_padded": 0,
771
+ "effective_few_shots": 0.0,
772
+ "num_truncated_few_shots": 0
773
+ },
774
+ "lighteval|agieval:logiqa-en|0": {
775
+ "hashes": {
776
+ "hash_examples": "0a688a45f69c21e0",
777
+ "hash_full_prompts": "d66246a3e0fde055",
778
+ "hash_input_tokens": "cb9639b768095552",
779
+ "hash_cont_tokens": "2c45f086a39c57b8"
780
+ },
781
+ "truncated": 0,
782
+ "non_truncated": 651,
783
+ "padded": 2599,
784
+ "non_padded": 5,
785
+ "effective_few_shots": 0.0,
786
+ "num_truncated_few_shots": 0
787
+ },
788
+ "lighteval|agieval:logiqa-zh|0": {
789
+ "hashes": {
790
+ "hash_examples": "620d6888b6012ea5",
791
+ "hash_full_prompts": "12bef01e6e2bb3bb",
792
+ "hash_input_tokens": "fb10f8f201d221fc",
793
+ "hash_cont_tokens": "528f93b95cf11c45"
794
+ },
795
+ "truncated": 0,
796
+ "non_truncated": 651,
797
+ "padded": 2592,
798
+ "non_padded": 12,
799
+ "effective_few_shots": 0.0,
800
+ "num_truncated_few_shots": 0
801
+ },
802
+ "lighteval|agieval:lsat-ar|0": {
803
+ "hashes": {
804
+ "hash_examples": "627c8f5ccd5da209",
805
+ "hash_full_prompts": "2f70fdd969d5131c",
806
+ "hash_input_tokens": "ff96b15c8e322081",
807
+ "hash_cont_tokens": "a04649158245218b"
808
+ },
809
+ "truncated": 0,
810
+ "non_truncated": 230,
811
+ "padded": 1147,
812
+ "non_padded": 3,
813
+ "effective_few_shots": 0.0,
814
+ "num_truncated_few_shots": 0
815
+ },
816
+ "lighteval|agieval:lsat-lr|0": {
817
+ "hashes": {
818
+ "hash_examples": "794641c86de172f5",
819
+ "hash_full_prompts": "19fa02ffa4cfbbe3",
820
+ "hash_input_tokens": "1ac160b68dbd4f42",
821
+ "hash_cont_tokens": "1020f1117c8bf340"
822
+ },
823
+ "truncated": 0,
824
+ "non_truncated": 510,
825
+ "padded": 2519,
826
+ "non_padded": 31,
827
+ "effective_few_shots": 0.0,
828
+ "num_truncated_few_shots": 0
829
+ },
830
+ "lighteval|agieval:lsat-rc|0": {
831
+ "hashes": {
832
+ "hash_examples": "35981ed917ea01cf",
833
+ "hash_full_prompts": "37f03609dd5027c5",
834
+ "hash_input_tokens": "71655332fea694a0",
835
+ "hash_cont_tokens": "cb10bf65326d313e"
836
+ },
837
+ "truncated": 0,
838
+ "non_truncated": 269,
839
+ "padded": 1345,
840
+ "non_padded": 0,
841
+ "effective_few_shots": 0.0,
842
+ "num_truncated_few_shots": 0
843
+ },
844
+ "lighteval|agieval:sat-en|0": {
845
+ "hashes": {
846
+ "hash_examples": "041c39c646536a1e",
847
+ "hash_full_prompts": "10014752d3b89e2d",
848
+ "hash_input_tokens": "1086a441056617a9",
849
+ "hash_cont_tokens": "b2eb0cb9fcf03bef"
850
+ },
851
+ "truncated": 0,
852
+ "non_truncated": 206,
853
+ "padded": 821,
854
+ "non_padded": 0,
855
+ "effective_few_shots": 0.0,
856
+ "num_truncated_few_shots": 0
857
+ },
858
+ "lighteval|agieval:sat-en-without-passage|0": {
859
+ "hashes": {
860
+ "hash_examples": "e4d9284367dff68f",
861
+ "hash_full_prompts": "048ea6811dd85fb5",
862
+ "hash_input_tokens": "8b8d347811731c48",
863
+ "hash_cont_tokens": "b2eb0cb9fcf03bef"
864
+ },
865
+ "truncated": 0,
866
+ "non_truncated": 206,
867
+ "padded": 819,
868
+ "non_padded": 2,
869
+ "effective_few_shots": 0.0,
870
+ "num_truncated_few_shots": 0
871
+ },
872
+ "lighteval|agieval:sat-math|0": {
873
+ "hashes": {
874
+ "hash_examples": "01db7291603fc1a0",
875
+ "hash_full_prompts": "eff3065f8ae8d936",
876
+ "hash_input_tokens": "b4d4bd8a36ef96a7",
877
+ "hash_cont_tokens": "20524d21001d25a6"
878
+ },
879
+ "truncated": 0,
880
+ "non_truncated": 220,
881
+ "padded": 878,
882
+ "non_padded": 2,
883
+ "effective_few_shots": 0.0,
884
+ "num_truncated_few_shots": 0
885
+ }
886
+ },
887
+ "summary_general": {
888
+ "hashes": {
889
+ "hash_examples": "da3af66181f18ddf",
890
+ "hash_full_prompts": "99f646ddb87a6ed9",
891
+ "hash_input_tokens": "7e5cae6af0c269dd",
892
+ "hash_cont_tokens": "748ca081a3836593"
893
+ },
894
+ "truncated": 0,
895
+ "non_truncated": 5151,
896
+ "padded": 21801,
897
+ "non_padded": 63,
898
+ "num_truncated_few_shots": 0
899
+ }
900
+ }