lewtun HF staff commited on
Commit
0aa53da
·
verified ·
1 Parent(s): 51278f0

Upload eval_results/orpo-explorers/argilla-mistral-orpo-OpenHermesPreferences-50k-beta-0.2-ckpt-epoch-2/main/agieval/results_2024-05-09T18-58-33.971890.json with huggingface_hub

Browse files
eval_results/orpo-explorers/argilla-mistral-orpo-OpenHermesPreferences-50k-beta-0.2-ckpt-epoch-2/main/agieval/results_2024-05-09T18-58-33.971890.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 833244.635872818,
9
+ "end_time": 833433.874554734,
10
+ "total_evaluation_time_secondes": "189.23868191603106",
11
+ "model_name": "orpo-explorers/argilla-mistral-orpo-OpenHermesPreferences-50k-beta-0.2-ckpt-epoch-2",
12
+ "model_sha": "153b3124eb16cf41c5f0d98cd51706c385ca1ed5",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.22440944881889763,
20
+ "acc_stderr": 0.026228687798575897,
21
+ "acc_norm": 0.2204724409448819,
22
+ "acc_norm_stderr": 0.026063493749639587
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.28095238095238095,
26
+ "acc_stderr": 0.03109009446934461,
27
+ "acc_norm": 0.30952380952380953,
28
+ "acc_norm_stderr": 0.03197777494209474
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.2028985507246377,
32
+ "acc_stderr": 0.028019647132494573,
33
+ "acc_norm": 0.2318840579710145,
34
+ "acc_norm_stderr": 0.029404596565406543
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.2886178861788618,
38
+ "acc_stderr": 0.028948765576340286,
39
+ "acc_norm": 0.2804878048780488,
40
+ "acc_norm_stderr": 0.028700735693673953
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.5522875816993464,
44
+ "acc_stderr": 0.02847293847803353,
45
+ "acc_norm": 0.5424836601307189,
46
+ "acc_norm_stderr": 0.028526383452142645
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.31155778894472363,
50
+ "acc_stderr": 0.03291322637124229,
51
+ "acc_norm": 0.3015075376884422,
52
+ "acc_norm_stderr": 0.03261349587454653
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.2851063829787234,
56
+ "acc_stderr": 0.02951319662553935,
57
+ "acc_norm": 0.2723404255319149,
58
+ "acc_norm_stderr": 0.0291012906983867
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.2706552706552707,
62
+ "acc_stderr": 0.023748744034266786,
63
+ "acc_norm": 0.28205128205128205,
64
+ "acc_norm_stderr": 0.024053414152940704
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.285,
68
+ "acc_stderr": 0.03199992148231578,
69
+ "acc_norm": 0.325,
70
+ "acc_norm_stderr": 0.03320221279784479
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.3241167434715822,
74
+ "acc_stderr": 0.01835819163513243,
75
+ "acc_norm": 0.3241167434715822,
76
+ "acc_norm_stderr": 0.01835819163513243
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.31490015360983103,
80
+ "acc_stderr": 0.018218251493671685,
81
+ "acc_norm": 0.3317972350230415,
82
+ "acc_norm_stderr": 0.0184685941264168
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.17391304347826086,
86
+ "acc_stderr": 0.025047317386049713,
87
+ "acc_norm": 0.2,
88
+ "acc_norm_stderr": 0.026432744018203558
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.2784313725490196,
92
+ "acc_stderr": 0.019867307525414934,
93
+ "acc_norm": 0.30196078431372547,
94
+ "acc_norm_stderr": 0.020349619453119146
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.34944237918215615,
98
+ "acc_stderr": 0.02912482161970038,
99
+ "acc_norm": 0.3048327137546468,
100
+ "acc_norm_stderr": 0.02811952967561346
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.46116504854368934,
104
+ "acc_stderr": 0.03481602144131183,
105
+ "acc_norm": 0.4223300970873786,
106
+ "acc_norm_stderr": 0.03449760586825818
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.2912621359223301,
110
+ "acc_stderr": 0.031732764025283834,
111
+ "acc_norm": 0.2912621359223301,
112
+ "acc_norm_stderr": 0.031732764025283834
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.2409090909090909,
116
+ "acc_stderr": 0.02889691178349409,
117
+ "acc_norm": 0.21363636363636362,
118
+ "acc_norm_stderr": 0.027696649960503875
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.30209560344816483,
122
+ "acc_stderr": 0.027470400522247765,
123
+ "acc_norm": 0.30327571128995184,
124
+ "acc_norm_stderr": 0.027605829217012205
125
+ },
126
+ "all": {
127
+ "acc": 0.30209560344816483,
128
+ "acc_stderr": 0.027470400522247765,
129
+ "acc_norm": 0.30327571128995184,
130
+ "acc_norm_stderr": 0.027605829217012205
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "num_samples": null,
174
+ "frozen": false,
175
+ "suite": [
176
+ "lighteval"
177
+ ],
178
+ "original_num_docs": 254,
179
+ "effective_num_docs": 254,
180
+ "trust_dataset": true,
181
+ "must_remove_duplicate_docs": null,
182
+ "version": 0
183
+ },
184
+ "lighteval|agieval:gaokao-biology": {
185
+ "name": "agieval:gaokao-biology",
186
+ "prompt_function": "agieval",
187
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
188
+ "hf_subset": "default",
189
+ "metric": [
190
+ "loglikelihood_acc",
191
+ "loglikelihood_acc_norm_nospace"
192
+ ],
193
+ "hf_avail_splits": [
194
+ "test"
195
+ ],
196
+ "evaluation_splits": [
197
+ "test"
198
+ ],
199
+ "few_shots_split": null,
200
+ "few_shots_select": "random_sampling",
201
+ "generation_size": 1,
202
+ "stop_sequence": null,
203
+ "output_regex": null,
204
+ "num_samples": null,
205
+ "frozen": false,
206
+ "suite": [
207
+ "lighteval"
208
+ ],
209
+ "original_num_docs": 210,
210
+ "effective_num_docs": 210,
211
+ "trust_dataset": true,
212
+ "must_remove_duplicate_docs": null,
213
+ "version": 0
214
+ },
215
+ "lighteval|agieval:gaokao-chemistry": {
216
+ "name": "agieval:gaokao-chemistry",
217
+ "prompt_function": "agieval",
218
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
219
+ "hf_subset": "default",
220
+ "metric": [
221
+ "loglikelihood_acc",
222
+ "loglikelihood_acc_norm_nospace"
223
+ ],
224
+ "hf_avail_splits": [
225
+ "test"
226
+ ],
227
+ "evaluation_splits": [
228
+ "test"
229
+ ],
230
+ "few_shots_split": null,
231
+ "few_shots_select": "random_sampling",
232
+ "generation_size": 1,
233
+ "stop_sequence": null,
234
+ "output_regex": null,
235
+ "num_samples": null,
236
+ "frozen": false,
237
+ "suite": [
238
+ "lighteval"
239
+ ],
240
+ "original_num_docs": 207,
241
+ "effective_num_docs": 207,
242
+ "trust_dataset": true,
243
+ "must_remove_duplicate_docs": null,
244
+ "version": 0
245
+ },
246
+ "lighteval|agieval:gaokao-chinese": {
247
+ "name": "agieval:gaokao-chinese",
248
+ "prompt_function": "agieval",
249
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
250
+ "hf_subset": "default",
251
+ "metric": [
252
+ "loglikelihood_acc",
253
+ "loglikelihood_acc_norm_nospace"
254
+ ],
255
+ "hf_avail_splits": [
256
+ "test"
257
+ ],
258
+ "evaluation_splits": [
259
+ "test"
260
+ ],
261
+ "few_shots_split": null,
262
+ "few_shots_select": "random_sampling",
263
+ "generation_size": 1,
264
+ "stop_sequence": null,
265
+ "output_regex": null,
266
+ "num_samples": null,
267
+ "frozen": false,
268
+ "suite": [
269
+ "lighteval"
270
+ ],
271
+ "original_num_docs": 246,
272
+ "effective_num_docs": 246,
273
+ "trust_dataset": true,
274
+ "must_remove_duplicate_docs": null,
275
+ "version": 0
276
+ },
277
+ "lighteval|agieval:gaokao-english": {
278
+ "name": "agieval:gaokao-english",
279
+ "prompt_function": "agieval",
280
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
281
+ "hf_subset": "default",
282
+ "metric": [
283
+ "loglikelihood_acc",
284
+ "loglikelihood_acc_norm_nospace"
285
+ ],
286
+ "hf_avail_splits": [
287
+ "test"
288
+ ],
289
+ "evaluation_splits": [
290
+ "test"
291
+ ],
292
+ "few_shots_split": null,
293
+ "few_shots_select": "random_sampling",
294
+ "generation_size": 1,
295
+ "stop_sequence": null,
296
+ "output_regex": null,
297
+ "num_samples": null,
298
+ "frozen": false,
299
+ "suite": [
300
+ "lighteval"
301
+ ],
302
+ "original_num_docs": 306,
303
+ "effective_num_docs": 306,
304
+ "trust_dataset": true,
305
+ "must_remove_duplicate_docs": null,
306
+ "version": 0
307
+ },
308
+ "lighteval|agieval:gaokao-geography": {
309
+ "name": "agieval:gaokao-geography",
310
+ "prompt_function": "agieval",
311
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
312
+ "hf_subset": "default",
313
+ "metric": [
314
+ "loglikelihood_acc",
315
+ "loglikelihood_acc_norm_nospace"
316
+ ],
317
+ "hf_avail_splits": [
318
+ "test"
319
+ ],
320
+ "evaluation_splits": [
321
+ "test"
322
+ ],
323
+ "few_shots_split": null,
324
+ "few_shots_select": "random_sampling",
325
+ "generation_size": 1,
326
+ "stop_sequence": null,
327
+ "output_regex": null,
328
+ "num_samples": null,
329
+ "frozen": false,
330
+ "suite": [
331
+ "lighteval"
332
+ ],
333
+ "original_num_docs": 199,
334
+ "effective_num_docs": 199,
335
+ "trust_dataset": true,
336
+ "must_remove_duplicate_docs": null,
337
+ "version": 0
338
+ },
339
+ "lighteval|agieval:gaokao-history": {
340
+ "name": "agieval:gaokao-history",
341
+ "prompt_function": "agieval",
342
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
343
+ "hf_subset": "default",
344
+ "metric": [
345
+ "loglikelihood_acc",
346
+ "loglikelihood_acc_norm_nospace"
347
+ ],
348
+ "hf_avail_splits": [
349
+ "test"
350
+ ],
351
+ "evaluation_splits": [
352
+ "test"
353
+ ],
354
+ "few_shots_split": null,
355
+ "few_shots_select": "random_sampling",
356
+ "generation_size": 1,
357
+ "stop_sequence": null,
358
+ "output_regex": null,
359
+ "num_samples": null,
360
+ "frozen": false,
361
+ "suite": [
362
+ "lighteval"
363
+ ],
364
+ "original_num_docs": 235,
365
+ "effective_num_docs": 235,
366
+ "trust_dataset": true,
367
+ "must_remove_duplicate_docs": null,
368
+ "version": 0
369
+ },
370
+ "lighteval|agieval:gaokao-mathqa": {
371
+ "name": "agieval:gaokao-mathqa",
372
+ "prompt_function": "agieval",
373
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
374
+ "hf_subset": "default",
375
+ "metric": [
376
+ "loglikelihood_acc",
377
+ "loglikelihood_acc_norm_nospace"
378
+ ],
379
+ "hf_avail_splits": [
380
+ "test"
381
+ ],
382
+ "evaluation_splits": [
383
+ "test"
384
+ ],
385
+ "few_shots_split": null,
386
+ "few_shots_select": "random_sampling",
387
+ "generation_size": 1,
388
+ "stop_sequence": null,
389
+ "output_regex": null,
390
+ "num_samples": null,
391
+ "frozen": false,
392
+ "suite": [
393
+ "lighteval"
394
+ ],
395
+ "original_num_docs": 351,
396
+ "effective_num_docs": 351,
397
+ "trust_dataset": true,
398
+ "must_remove_duplicate_docs": null,
399
+ "version": 0
400
+ },
401
+ "lighteval|agieval:gaokao-physics": {
402
+ "name": "agieval:gaokao-physics",
403
+ "prompt_function": "agieval",
404
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
405
+ "hf_subset": "default",
406
+ "metric": [
407
+ "loglikelihood_acc",
408
+ "loglikelihood_acc_norm_nospace"
409
+ ],
410
+ "hf_avail_splits": [
411
+ "test"
412
+ ],
413
+ "evaluation_splits": [
414
+ "test"
415
+ ],
416
+ "few_shots_split": null,
417
+ "few_shots_select": "random_sampling",
418
+ "generation_size": 1,
419
+ "stop_sequence": null,
420
+ "output_regex": null,
421
+ "num_samples": null,
422
+ "frozen": false,
423
+ "suite": [
424
+ "lighteval"
425
+ ],
426
+ "original_num_docs": 200,
427
+ "effective_num_docs": 200,
428
+ "trust_dataset": true,
429
+ "must_remove_duplicate_docs": null,
430
+ "version": 0
431
+ },
432
+ "lighteval|agieval:logiqa-en": {
433
+ "name": "agieval:logiqa-en",
434
+ "prompt_function": "agieval",
435
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
436
+ "hf_subset": "default",
437
+ "metric": [
438
+ "loglikelihood_acc",
439
+ "loglikelihood_acc_norm_nospace"
440
+ ],
441
+ "hf_avail_splits": [
442
+ "test"
443
+ ],
444
+ "evaluation_splits": [
445
+ "test"
446
+ ],
447
+ "few_shots_split": null,
448
+ "few_shots_select": "random_sampling",
449
+ "generation_size": 1,
450
+ "stop_sequence": null,
451
+ "output_regex": null,
452
+ "num_samples": null,
453
+ "frozen": false,
454
+ "suite": [
455
+ "lighteval"
456
+ ],
457
+ "original_num_docs": 651,
458
+ "effective_num_docs": 651,
459
+ "trust_dataset": true,
460
+ "must_remove_duplicate_docs": null,
461
+ "version": 0
462
+ },
463
+ "lighteval|agieval:logiqa-zh": {
464
+ "name": "agieval:logiqa-zh",
465
+ "prompt_function": "agieval",
466
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
467
+ "hf_subset": "default",
468
+ "metric": [
469
+ "loglikelihood_acc",
470
+ "loglikelihood_acc_norm_nospace"
471
+ ],
472
+ "hf_avail_splits": [
473
+ "test"
474
+ ],
475
+ "evaluation_splits": [
476
+ "test"
477
+ ],
478
+ "few_shots_split": null,
479
+ "few_shots_select": "random_sampling",
480
+ "generation_size": 1,
481
+ "stop_sequence": null,
482
+ "output_regex": null,
483
+ "num_samples": null,
484
+ "frozen": false,
485
+ "suite": [
486
+ "lighteval"
487
+ ],
488
+ "original_num_docs": 651,
489
+ "effective_num_docs": 651,
490
+ "trust_dataset": true,
491
+ "must_remove_duplicate_docs": null,
492
+ "version": 0
493
+ },
494
+ "lighteval|agieval:lsat-ar": {
495
+ "name": "agieval:lsat-ar",
496
+ "prompt_function": "agieval",
497
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
498
+ "hf_subset": "default",
499
+ "metric": [
500
+ "loglikelihood_acc",
501
+ "loglikelihood_acc_norm_nospace"
502
+ ],
503
+ "hf_avail_splits": [
504
+ "test"
505
+ ],
506
+ "evaluation_splits": [
507
+ "test"
508
+ ],
509
+ "few_shots_split": null,
510
+ "few_shots_select": "random_sampling",
511
+ "generation_size": 1,
512
+ "stop_sequence": null,
513
+ "output_regex": null,
514
+ "num_samples": null,
515
+ "frozen": false,
516
+ "suite": [
517
+ "lighteval"
518
+ ],
519
+ "original_num_docs": 230,
520
+ "effective_num_docs": 230,
521
+ "trust_dataset": true,
522
+ "must_remove_duplicate_docs": null,
523
+ "version": 0
524
+ },
525
+ "lighteval|agieval:lsat-lr": {
526
+ "name": "agieval:lsat-lr",
527
+ "prompt_function": "agieval",
528
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
529
+ "hf_subset": "default",
530
+ "metric": [
531
+ "loglikelihood_acc",
532
+ "loglikelihood_acc_norm_nospace"
533
+ ],
534
+ "hf_avail_splits": [
535
+ "test"
536
+ ],
537
+ "evaluation_splits": [
538
+ "test"
539
+ ],
540
+ "few_shots_split": null,
541
+ "few_shots_select": "random_sampling",
542
+ "generation_size": 1,
543
+ "stop_sequence": null,
544
+ "output_regex": null,
545
+ "num_samples": null,
546
+ "frozen": false,
547
+ "suite": [
548
+ "lighteval"
549
+ ],
550
+ "original_num_docs": 510,
551
+ "effective_num_docs": 510,
552
+ "trust_dataset": true,
553
+ "must_remove_duplicate_docs": null,
554
+ "version": 0
555
+ },
556
+ "lighteval|agieval:lsat-rc": {
557
+ "name": "agieval:lsat-rc",
558
+ "prompt_function": "agieval",
559
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
560
+ "hf_subset": "default",
561
+ "metric": [
562
+ "loglikelihood_acc",
563
+ "loglikelihood_acc_norm_nospace"
564
+ ],
565
+ "hf_avail_splits": [
566
+ "test"
567
+ ],
568
+ "evaluation_splits": [
569
+ "test"
570
+ ],
571
+ "few_shots_split": null,
572
+ "few_shots_select": "random_sampling",
573
+ "generation_size": 1,
574
+ "stop_sequence": null,
575
+ "output_regex": null,
576
+ "num_samples": null,
577
+ "frozen": false,
578
+ "suite": [
579
+ "lighteval"
580
+ ],
581
+ "original_num_docs": 269,
582
+ "effective_num_docs": 269,
583
+ "trust_dataset": true,
584
+ "must_remove_duplicate_docs": null,
585
+ "version": 0
586
+ },
587
+ "lighteval|agieval:sat-en": {
588
+ "name": "agieval:sat-en",
589
+ "prompt_function": "agieval",
590
+ "hf_repo": "dmayhem93/agieval-sat-en",
591
+ "hf_subset": "default",
592
+ "metric": [
593
+ "loglikelihood_acc",
594
+ "loglikelihood_acc_norm_nospace"
595
+ ],
596
+ "hf_avail_splits": [
597
+ "test"
598
+ ],
599
+ "evaluation_splits": [
600
+ "test"
601
+ ],
602
+ "few_shots_split": null,
603
+ "few_shots_select": "random_sampling",
604
+ "generation_size": 1,
605
+ "stop_sequence": null,
606
+ "output_regex": null,
607
+ "num_samples": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null,
616
+ "version": 0
617
+ },
618
+ "lighteval|agieval:sat-en-without-passage": {
619
+ "name": "agieval:sat-en-without-passage",
620
+ "prompt_function": "agieval",
621
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
622
+ "hf_subset": "default",
623
+ "metric": [
624
+ "loglikelihood_acc",
625
+ "loglikelihood_acc_norm_nospace"
626
+ ],
627
+ "hf_avail_splits": [
628
+ "test"
629
+ ],
630
+ "evaluation_splits": [
631
+ "test"
632
+ ],
633
+ "few_shots_split": null,
634
+ "few_shots_select": "random_sampling",
635
+ "generation_size": 1,
636
+ "stop_sequence": null,
637
+ "output_regex": null,
638
+ "num_samples": null,
639
+ "frozen": false,
640
+ "suite": [
641
+ "lighteval"
642
+ ],
643
+ "original_num_docs": 206,
644
+ "effective_num_docs": 206,
645
+ "trust_dataset": true,
646
+ "must_remove_duplicate_docs": null,
647
+ "version": 0
648
+ },
649
+ "lighteval|agieval:sat-math": {
650
+ "name": "agieval:sat-math",
651
+ "prompt_function": "agieval",
652
+ "hf_repo": "dmayhem93/agieval-sat-math",
653
+ "hf_subset": "default",
654
+ "metric": [
655
+ "loglikelihood_acc",
656
+ "loglikelihood_acc_norm_nospace"
657
+ ],
658
+ "hf_avail_splits": [
659
+ "test"
660
+ ],
661
+ "evaluation_splits": [
662
+ "test"
663
+ ],
664
+ "few_shots_split": null,
665
+ "few_shots_select": "random_sampling",
666
+ "generation_size": 1,
667
+ "stop_sequence": null,
668
+ "output_regex": null,
669
+ "num_samples": null,
670
+ "frozen": false,
671
+ "suite": [
672
+ "lighteval"
673
+ ],
674
+ "original_num_docs": 220,
675
+ "effective_num_docs": 220,
676
+ "trust_dataset": true,
677
+ "must_remove_duplicate_docs": null,
678
+ "version": 0
679
+ }
680
+ },
681
+ "summary_tasks": {
682
+ "lighteval|agieval:aqua-rat|0": {
683
+ "hashes": {
684
+ "hash_examples": "f09607f69e5b7525",
685
+ "hash_full_prompts": "8b913655a6fea4ab",
686
+ "hash_input_tokens": "293a61e5163aa27e",
687
+ "hash_cont_tokens": "6e9004a0164bb799"
688
+ },
689
+ "truncated": 0,
690
+ "non_truncated": 254,
691
+ "padded": 1270,
692
+ "non_padded": 0,
693
+ "effective_few_shots": 0.0,
694
+ "num_truncated_few_shots": 0
695
+ },
696
+ "lighteval|agieval:gaokao-biology|0": {
697
+ "hashes": {
698
+ "hash_examples": "f262eaf4a72db963",
699
+ "hash_full_prompts": "c7078ace868f7ee8",
700
+ "hash_input_tokens": "4abbc3b98bca27c3",
701
+ "hash_cont_tokens": "1246a476ed0cbfc5"
702
+ },
703
+ "truncated": 0,
704
+ "non_truncated": 210,
705
+ "padded": 840,
706
+ "non_padded": 0,
707
+ "effective_few_shots": 0.0,
708
+ "num_truncated_few_shots": 0
709
+ },
710
+ "lighteval|agieval:gaokao-chemistry|0": {
711
+ "hashes": {
712
+ "hash_examples": "47f2e649f58d9da5",
713
+ "hash_full_prompts": "bd066d6d8c807f39",
714
+ "hash_input_tokens": "3d9728247bf04ac3",
715
+ "hash_cont_tokens": "a9eb294028dc6ef3"
716
+ },
717
+ "truncated": 0,
718
+ "non_truncated": 207,
719
+ "padded": 831,
720
+ "non_padded": 0,
721
+ "effective_few_shots": 0.0,
722
+ "num_truncated_few_shots": 0
723
+ },
724
+ "lighteval|agieval:gaokao-chinese|0": {
725
+ "hashes": {
726
+ "hash_examples": "1010b21fde4726ab",
727
+ "hash_full_prompts": "3f53e9dd34c43d52",
728
+ "hash_input_tokens": "f2a1d4c848527f86",
729
+ "hash_cont_tokens": "0a9d93afd335aa3c"
730
+ },
731
+ "truncated": 0,
732
+ "non_truncated": 246,
733
+ "padded": 982,
734
+ "non_padded": 2,
735
+ "effective_few_shots": 0.0,
736
+ "num_truncated_few_shots": 0
737
+ },
738
+ "lighteval|agieval:gaokao-english|0": {
739
+ "hashes": {
740
+ "hash_examples": "4864e492a350ae93",
741
+ "hash_full_prompts": "59104cb8623f69e5",
742
+ "hash_input_tokens": "ff82bcaabb6cde43",
743
+ "hash_cont_tokens": "3215b688d7c88b2f"
744
+ },
745
+ "truncated": 0,
746
+ "non_truncated": 306,
747
+ "padded": 1224,
748
+ "non_padded": 0,
749
+ "effective_few_shots": 0.0,
750
+ "num_truncated_few_shots": 0
751
+ },
752
+ "lighteval|agieval:gaokao-geography|0": {
753
+ "hashes": {
754
+ "hash_examples": "ec3a021e37650e7d",
755
+ "hash_full_prompts": "d2456e0377df1973",
756
+ "hash_input_tokens": "b6593d42d60f9e65",
757
+ "hash_cont_tokens": "8fd995e0ac66be75"
758
+ },
759
+ "truncated": 0,
760
+ "non_truncated": 199,
761
+ "padded": 796,
762
+ "non_padded": 0,
763
+ "effective_few_shots": 0.0,
764
+ "num_truncated_few_shots": 0
765
+ },
766
+ "lighteval|agieval:gaokao-history|0": {
767
+ "hashes": {
768
+ "hash_examples": "b3fad1596f1ae1f9",
769
+ "hash_full_prompts": "faea8f291d9a0cd5",
770
+ "hash_input_tokens": "ffa388f05b4bfadf",
771
+ "hash_cont_tokens": "4b0a7458d3ea41a3"
772
+ },
773
+ "truncated": 0,
774
+ "non_truncated": 235,
775
+ "padded": 940,
776
+ "non_padded": 0,
777
+ "effective_few_shots": 0.0,
778
+ "num_truncated_few_shots": 0
779
+ },
780
+ "lighteval|agieval:gaokao-mathqa|0": {
781
+ "hashes": {
782
+ "hash_examples": "1d1088556861b0b0",
783
+ "hash_full_prompts": "de899bfeaaa61154",
784
+ "hash_input_tokens": "1e826e8ae60c9cf4",
785
+ "hash_cont_tokens": "9bbd22eae9aff3f4"
786
+ },
787
+ "truncated": 0,
788
+ "non_truncated": 351,
789
+ "padded": 1404,
790
+ "non_padded": 0,
791
+ "effective_few_shots": 0.0,
792
+ "num_truncated_few_shots": 0
793
+ },
794
+ "lighteval|agieval:gaokao-physics|0": {
795
+ "hashes": {
796
+ "hash_examples": "eb05f035c7bfca2f",
797
+ "hash_full_prompts": "08008e0300283edc",
798
+ "hash_input_tokens": "0a4cddbeea3c31fa",
799
+ "hash_cont_tokens": "d7e81c1c739c343d"
800
+ },
801
+ "truncated": 0,
802
+ "non_truncated": 200,
803
+ "padded": 800,
804
+ "non_padded": 0,
805
+ "effective_few_shots": 0.0,
806
+ "num_truncated_few_shots": 0
807
+ },
808
+ "lighteval|agieval:logiqa-en|0": {
809
+ "hashes": {
810
+ "hash_examples": "0a688a45f69c21e0",
811
+ "hash_full_prompts": "3405fd262d4b2d28",
812
+ "hash_input_tokens": "8672a68d080ba1fb",
813
+ "hash_cont_tokens": "c1f60bd55a215749"
814
+ },
815
+ "truncated": 0,
816
+ "non_truncated": 651,
817
+ "padded": 2604,
818
+ "non_padded": 0,
819
+ "effective_few_shots": 0.0,
820
+ "num_truncated_few_shots": 0
821
+ },
822
+ "lighteval|agieval:logiqa-zh|0": {
823
+ "hashes": {
824
+ "hash_examples": "620d6888b6012ea5",
825
+ "hash_full_prompts": "ac19dc4eaa56f5e0",
826
+ "hash_input_tokens": "24c3bd34df395ee2",
827
+ "hash_cont_tokens": "d1987256a4a8ad62"
828
+ },
829
+ "truncated": 0,
830
+ "non_truncated": 651,
831
+ "padded": 2603,
832
+ "non_padded": 1,
833
+ "effective_few_shots": 0.0,
834
+ "num_truncated_few_shots": 0
835
+ },
836
+ "lighteval|agieval:lsat-ar|0": {
837
+ "hashes": {
838
+ "hash_examples": "627c8f5ccd5da209",
839
+ "hash_full_prompts": "9aed992c4bfa8dd7",
840
+ "hash_input_tokens": "042173fcbbb85776",
841
+ "hash_cont_tokens": "f1f82716a353907a"
842
+ },
843
+ "truncated": 0,
844
+ "non_truncated": 230,
845
+ "padded": 1137,
846
+ "non_padded": 13,
847
+ "effective_few_shots": 0.0,
848
+ "num_truncated_few_shots": 0
849
+ },
850
+ "lighteval|agieval:lsat-lr|0": {
851
+ "hashes": {
852
+ "hash_examples": "794641c86de172f5",
853
+ "hash_full_prompts": "6a36e90325996129",
854
+ "hash_input_tokens": "58c268b8b0f3c5c4",
855
+ "hash_cont_tokens": "9f8685be13aa273f"
856
+ },
857
+ "truncated": 0,
858
+ "non_truncated": 510,
859
+ "padded": 2532,
860
+ "non_padded": 18,
861
+ "effective_few_shots": 0.0,
862
+ "num_truncated_few_shots": 0
863
+ },
864
+ "lighteval|agieval:lsat-rc|0": {
865
+ "hashes": {
866
+ "hash_examples": "35981ed917ea01cf",
867
+ "hash_full_prompts": "15f0f342f9572c41",
868
+ "hash_input_tokens": "1eaa7331cf1a555e",
869
+ "hash_cont_tokens": "e39cea8f1c13694c"
870
+ },
871
+ "truncated": 0,
872
+ "non_truncated": 269,
873
+ "padded": 1345,
874
+ "non_padded": 0,
875
+ "effective_few_shots": 0.0,
876
+ "num_truncated_few_shots": 0
877
+ },
878
+ "lighteval|agieval:sat-en|0": {
879
+ "hashes": {
880
+ "hash_examples": "041c39c646536a1e",
881
+ "hash_full_prompts": "163217fd603b9352",
882
+ "hash_input_tokens": "e1e9417c28fa0db0",
883
+ "hash_cont_tokens": "bc6d627ee3fe275d"
884
+ },
885
+ "truncated": 0,
886
+ "non_truncated": 206,
887
+ "padded": 821,
888
+ "non_padded": 0,
889
+ "effective_few_shots": 0.0,
890
+ "num_truncated_few_shots": 0
891
+ },
892
+ "lighteval|agieval:sat-en-without-passage|0": {
893
+ "hashes": {
894
+ "hash_examples": "e4d9284367dff68f",
895
+ "hash_full_prompts": "bdd4c7065b87de8a",
896
+ "hash_input_tokens": "70bcab139ba874e5",
897
+ "hash_cont_tokens": "71939640e1182ae0"
898
+ },
899
+ "truncated": 0,
900
+ "non_truncated": 206,
901
+ "padded": 817,
902
+ "non_padded": 4,
903
+ "effective_few_shots": 0.0,
904
+ "num_truncated_few_shots": 0
905
+ },
906
+ "lighteval|agieval:sat-math|0": {
907
+ "hashes": {
908
+ "hash_examples": "01db7291603fc1a0",
909
+ "hash_full_prompts": "63ca65b2f0baebb5",
910
+ "hash_input_tokens": "e242b0a44e256fbb",
911
+ "hash_cont_tokens": "065259bbb09c8c6d"
912
+ },
913
+ "truncated": 0,
914
+ "non_truncated": 220,
915
+ "padded": 877,
916
+ "non_padded": 3,
917
+ "effective_few_shots": 0.0,
918
+ "num_truncated_few_shots": 0
919
+ }
920
+ },
921
+ "summary_general": {
922
+ "hashes": {
923
+ "hash_examples": "da3af66181f18ddf",
924
+ "hash_full_prompts": "f7c298d03686fa0e",
925
+ "hash_input_tokens": "84e02a40c0a714da",
926
+ "hash_cont_tokens": "a8617293ec34570e"
927
+ },
928
+ "truncated": 0,
929
+ "non_truncated": 5151,
930
+ "padded": 21823,
931
+ "non_padded": 41,
932
+ "num_truncated_few_shots": 0
933
+ }
934
+ }