lewtun HF staff commited on
Commit
47a0bd2
·
verified ·
1 Parent(s): aef19f8

Upload eval_results/orpo-explorers/hf-llama3-70b-orpo-v0.0/main/agieval/results_2024-05-08T13-24-02.242534.json with huggingface_hub

Browse files
eval_results/orpo-explorers/hf-llama3-70b-orpo-v0.0/main/agieval/results_2024-05-08T13-24-02.242534.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 440093.005929755,
9
+ "end_time": 442559.229810266,
10
+ "total_evaluation_time_secondes": "2466.22388051101",
11
+ "model_name": "orpo-explorers/hf-llama3-70b-orpo-v0.0",
12
+ "model_sha": "c81afe2e88ad735df00af1e00424f09c7385319c",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "131.73 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|agieval:aqua-rat|0": {
19
+ "acc": 0.3346456692913386,
20
+ "acc_stderr": 0.029665989450923986,
21
+ "acc_norm": 0.2795275590551181,
22
+ "acc_norm_stderr": 0.02821374533845075
23
+ },
24
+ "lighteval|agieval:gaokao-biology|0": {
25
+ "acc": 0.5380952380952381,
26
+ "acc_stderr": 0.034485192220162664,
27
+ "acc_norm": 0.5047619047619047,
28
+ "acc_norm_stderr": 0.03458415464421142
29
+ },
30
+ "lighteval|agieval:gaokao-chemistry|0": {
31
+ "acc": 0.3864734299516908,
32
+ "acc_stderr": 0.033926799978224606,
33
+ "acc_norm": 0.357487922705314,
34
+ "acc_norm_stderr": 0.03339163808945076
35
+ },
36
+ "lighteval|agieval:gaokao-chinese|0": {
37
+ "acc": 0.44715447154471544,
38
+ "acc_stderr": 0.031764911338391044,
39
+ "acc_norm": 0.45121951219512196,
40
+ "acc_norm_stderr": 0.03179144179598375
41
+ },
42
+ "lighteval|agieval:gaokao-english|0": {
43
+ "acc": 0.7254901960784313,
44
+ "acc_stderr": 0.025553169991826514,
45
+ "acc_norm": 0.7516339869281046,
46
+ "acc_norm_stderr": 0.02473998135511359
47
+ },
48
+ "lighteval|agieval:gaokao-geography|0": {
49
+ "acc": 0.6834170854271356,
50
+ "acc_stderr": 0.033056286002970406,
51
+ "acc_norm": 0.7236180904522613,
52
+ "acc_norm_stderr": 0.03178168502681787
53
+ },
54
+ "lighteval|agieval:gaokao-history|0": {
55
+ "acc": 0.7148936170212766,
56
+ "acc_stderr": 0.029513196625539345,
57
+ "acc_norm": 0.7021276595744681,
58
+ "acc_norm_stderr": 0.029896145682095466
59
+ },
60
+ "lighteval|agieval:gaokao-mathqa|0": {
61
+ "acc": 0.38746438746438744,
62
+ "acc_stderr": 0.026040393672207132,
63
+ "acc_norm": 0.36182336182336183,
64
+ "acc_norm_stderr": 0.025685305229822614
65
+ },
66
+ "lighteval|agieval:gaokao-physics|0": {
67
+ "acc": 0.535,
68
+ "acc_stderr": 0.035357115664894224,
69
+ "acc_norm": 0.495,
70
+ "acc_norm_stderr": 0.035442288003096976
71
+ },
72
+ "lighteval|agieval:logiqa-en|0": {
73
+ "acc": 0.4009216589861751,
74
+ "acc_stderr": 0.019222722225450923,
75
+ "acc_norm": 0.40706605222734255,
76
+ "acc_norm_stderr": 0.01926987610639942
77
+ },
78
+ "lighteval|agieval:logiqa-zh|0": {
79
+ "acc": 0.46697388632872505,
80
+ "acc_stderr": 0.01956878502638525,
81
+ "acc_norm": 0.445468509984639,
82
+ "acc_norm_stderr": 0.019494627133439975
83
+ },
84
+ "lighteval|agieval:lsat-ar|0": {
85
+ "acc": 0.2608695652173913,
86
+ "acc_stderr": 0.02901713355938128,
87
+ "acc_norm": 0.24782608695652175,
88
+ "acc_norm_stderr": 0.028530862595410062
89
+ },
90
+ "lighteval|agieval:lsat-lr|0": {
91
+ "acc": 0.6235294117647059,
92
+ "acc_stderr": 0.021475095661076454,
93
+ "acc_norm": 0.5372549019607843,
94
+ "acc_norm_stderr": 0.022100505922784036
95
+ },
96
+ "lighteval|agieval:lsat-rc|0": {
97
+ "acc": 0.7211895910780669,
98
+ "acc_stderr": 0.027391247975710395,
99
+ "acc_norm": 0.6133828996282528,
100
+ "acc_norm_stderr": 0.029746711725453
101
+ },
102
+ "lighteval|agieval:sat-en|0": {
103
+ "acc": 0.7864077669902912,
104
+ "acc_stderr": 0.028624617667710652,
105
+ "acc_norm": 0.7718446601941747,
106
+ "acc_norm_stderr": 0.029309157873241717
107
+ },
108
+ "lighteval|agieval:sat-en-without-passage|0": {
109
+ "acc": 0.5242718446601942,
110
+ "acc_stderr": 0.03488034442356184,
111
+ "acc_norm": 0.46601941747572817,
112
+ "acc_norm_stderr": 0.03484077510348
113
+ },
114
+ "lighteval|agieval:sat-math|0": {
115
+ "acc": 0.6045454545454545,
116
+ "acc_stderr": 0.0330400509644678,
117
+ "acc_norm": 0.4772727272727273,
118
+ "acc_norm_stderr": 0.03375194708230162
119
+ },
120
+ "lighteval|agieval:_average|0": {
121
+ "acc": 0.5377260749673658,
122
+ "acc_stderr": 0.028975473673463797,
123
+ "acc_norm": 0.5054903090115191,
124
+ "acc_norm_stderr": 0.02897475580632665
125
+ },
126
+ "all": {
127
+ "acc": 0.5377260749673658,
128
+ "acc_stderr": 0.028975473673463797,
129
+ "acc_norm": 0.5054903090115191,
130
+ "acc_norm_stderr": 0.02897475580632665
131
+ }
132
+ },
133
+ "versions": {
134
+ "lighteval|agieval:aqua-rat|0": 0,
135
+ "lighteval|agieval:gaokao-biology|0": 0,
136
+ "lighteval|agieval:gaokao-chemistry|0": 0,
137
+ "lighteval|agieval:gaokao-chinese|0": 0,
138
+ "lighteval|agieval:gaokao-english|0": 0,
139
+ "lighteval|agieval:gaokao-geography|0": 0,
140
+ "lighteval|agieval:gaokao-history|0": 0,
141
+ "lighteval|agieval:gaokao-mathqa|0": 0,
142
+ "lighteval|agieval:gaokao-physics|0": 0,
143
+ "lighteval|agieval:logiqa-en|0": 0,
144
+ "lighteval|agieval:logiqa-zh|0": 0,
145
+ "lighteval|agieval:lsat-ar|0": 0,
146
+ "lighteval|agieval:lsat-lr|0": 0,
147
+ "lighteval|agieval:lsat-rc|0": 0,
148
+ "lighteval|agieval:sat-en|0": 0,
149
+ "lighteval|agieval:sat-en-without-passage|0": 0,
150
+ "lighteval|agieval:sat-math|0": 0
151
+ },
152
+ "config_tasks": {
153
+ "lighteval|agieval:aqua-rat": {
154
+ "name": "agieval:aqua-rat",
155
+ "prompt_function": "agieval",
156
+ "hf_repo": "dmayhem93/agieval-aqua-rat",
157
+ "hf_subset": "default",
158
+ "metric": [
159
+ "loglikelihood_acc",
160
+ "loglikelihood_acc_norm_nospace"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "test"
164
+ ],
165
+ "evaluation_splits": [
166
+ "test"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": "random_sampling",
170
+ "generation_size": 1,
171
+ "stop_sequence": null,
172
+ "output_regex": null,
173
+ "num_samples": null,
174
+ "frozen": false,
175
+ "suite": [
176
+ "lighteval"
177
+ ],
178
+ "original_num_docs": 254,
179
+ "effective_num_docs": 254,
180
+ "trust_dataset": true,
181
+ "must_remove_duplicate_docs": null,
182
+ "version": 0
183
+ },
184
+ "lighteval|agieval:gaokao-biology": {
185
+ "name": "agieval:gaokao-biology",
186
+ "prompt_function": "agieval",
187
+ "hf_repo": "dmayhem93/agieval-gaokao-biology",
188
+ "hf_subset": "default",
189
+ "metric": [
190
+ "loglikelihood_acc",
191
+ "loglikelihood_acc_norm_nospace"
192
+ ],
193
+ "hf_avail_splits": [
194
+ "test"
195
+ ],
196
+ "evaluation_splits": [
197
+ "test"
198
+ ],
199
+ "few_shots_split": null,
200
+ "few_shots_select": "random_sampling",
201
+ "generation_size": 1,
202
+ "stop_sequence": null,
203
+ "output_regex": null,
204
+ "num_samples": null,
205
+ "frozen": false,
206
+ "suite": [
207
+ "lighteval"
208
+ ],
209
+ "original_num_docs": 210,
210
+ "effective_num_docs": 210,
211
+ "trust_dataset": true,
212
+ "must_remove_duplicate_docs": null,
213
+ "version": 0
214
+ },
215
+ "lighteval|agieval:gaokao-chemistry": {
216
+ "name": "agieval:gaokao-chemistry",
217
+ "prompt_function": "agieval",
218
+ "hf_repo": "dmayhem93/agieval-gaokao-chemistry",
219
+ "hf_subset": "default",
220
+ "metric": [
221
+ "loglikelihood_acc",
222
+ "loglikelihood_acc_norm_nospace"
223
+ ],
224
+ "hf_avail_splits": [
225
+ "test"
226
+ ],
227
+ "evaluation_splits": [
228
+ "test"
229
+ ],
230
+ "few_shots_split": null,
231
+ "few_shots_select": "random_sampling",
232
+ "generation_size": 1,
233
+ "stop_sequence": null,
234
+ "output_regex": null,
235
+ "num_samples": null,
236
+ "frozen": false,
237
+ "suite": [
238
+ "lighteval"
239
+ ],
240
+ "original_num_docs": 207,
241
+ "effective_num_docs": 207,
242
+ "trust_dataset": true,
243
+ "must_remove_duplicate_docs": null,
244
+ "version": 0
245
+ },
246
+ "lighteval|agieval:gaokao-chinese": {
247
+ "name": "agieval:gaokao-chinese",
248
+ "prompt_function": "agieval",
249
+ "hf_repo": "dmayhem93/agieval-gaokao-chinese",
250
+ "hf_subset": "default",
251
+ "metric": [
252
+ "loglikelihood_acc",
253
+ "loglikelihood_acc_norm_nospace"
254
+ ],
255
+ "hf_avail_splits": [
256
+ "test"
257
+ ],
258
+ "evaluation_splits": [
259
+ "test"
260
+ ],
261
+ "few_shots_split": null,
262
+ "few_shots_select": "random_sampling",
263
+ "generation_size": 1,
264
+ "stop_sequence": null,
265
+ "output_regex": null,
266
+ "num_samples": null,
267
+ "frozen": false,
268
+ "suite": [
269
+ "lighteval"
270
+ ],
271
+ "original_num_docs": 246,
272
+ "effective_num_docs": 246,
273
+ "trust_dataset": true,
274
+ "must_remove_duplicate_docs": null,
275
+ "version": 0
276
+ },
277
+ "lighteval|agieval:gaokao-english": {
278
+ "name": "agieval:gaokao-english",
279
+ "prompt_function": "agieval",
280
+ "hf_repo": "dmayhem93/agieval-gaokao-english",
281
+ "hf_subset": "default",
282
+ "metric": [
283
+ "loglikelihood_acc",
284
+ "loglikelihood_acc_norm_nospace"
285
+ ],
286
+ "hf_avail_splits": [
287
+ "test"
288
+ ],
289
+ "evaluation_splits": [
290
+ "test"
291
+ ],
292
+ "few_shots_split": null,
293
+ "few_shots_select": "random_sampling",
294
+ "generation_size": 1,
295
+ "stop_sequence": null,
296
+ "output_regex": null,
297
+ "num_samples": null,
298
+ "frozen": false,
299
+ "suite": [
300
+ "lighteval"
301
+ ],
302
+ "original_num_docs": 306,
303
+ "effective_num_docs": 306,
304
+ "trust_dataset": true,
305
+ "must_remove_duplicate_docs": null,
306
+ "version": 0
307
+ },
308
+ "lighteval|agieval:gaokao-geography": {
309
+ "name": "agieval:gaokao-geography",
310
+ "prompt_function": "agieval",
311
+ "hf_repo": "dmayhem93/agieval-gaokao-geography",
312
+ "hf_subset": "default",
313
+ "metric": [
314
+ "loglikelihood_acc",
315
+ "loglikelihood_acc_norm_nospace"
316
+ ],
317
+ "hf_avail_splits": [
318
+ "test"
319
+ ],
320
+ "evaluation_splits": [
321
+ "test"
322
+ ],
323
+ "few_shots_split": null,
324
+ "few_shots_select": "random_sampling",
325
+ "generation_size": 1,
326
+ "stop_sequence": null,
327
+ "output_regex": null,
328
+ "num_samples": null,
329
+ "frozen": false,
330
+ "suite": [
331
+ "lighteval"
332
+ ],
333
+ "original_num_docs": 199,
334
+ "effective_num_docs": 199,
335
+ "trust_dataset": true,
336
+ "must_remove_duplicate_docs": null,
337
+ "version": 0
338
+ },
339
+ "lighteval|agieval:gaokao-history": {
340
+ "name": "agieval:gaokao-history",
341
+ "prompt_function": "agieval",
342
+ "hf_repo": "dmayhem93/agieval-gaokao-history",
343
+ "hf_subset": "default",
344
+ "metric": [
345
+ "loglikelihood_acc",
346
+ "loglikelihood_acc_norm_nospace"
347
+ ],
348
+ "hf_avail_splits": [
349
+ "test"
350
+ ],
351
+ "evaluation_splits": [
352
+ "test"
353
+ ],
354
+ "few_shots_split": null,
355
+ "few_shots_select": "random_sampling",
356
+ "generation_size": 1,
357
+ "stop_sequence": null,
358
+ "output_regex": null,
359
+ "num_samples": null,
360
+ "frozen": false,
361
+ "suite": [
362
+ "lighteval"
363
+ ],
364
+ "original_num_docs": 235,
365
+ "effective_num_docs": 235,
366
+ "trust_dataset": true,
367
+ "must_remove_duplicate_docs": null,
368
+ "version": 0
369
+ },
370
+ "lighteval|agieval:gaokao-mathqa": {
371
+ "name": "agieval:gaokao-mathqa",
372
+ "prompt_function": "agieval",
373
+ "hf_repo": "dmayhem93/agieval-gaokao-mathqa",
374
+ "hf_subset": "default",
375
+ "metric": [
376
+ "loglikelihood_acc",
377
+ "loglikelihood_acc_norm_nospace"
378
+ ],
379
+ "hf_avail_splits": [
380
+ "test"
381
+ ],
382
+ "evaluation_splits": [
383
+ "test"
384
+ ],
385
+ "few_shots_split": null,
386
+ "few_shots_select": "random_sampling",
387
+ "generation_size": 1,
388
+ "stop_sequence": null,
389
+ "output_regex": null,
390
+ "num_samples": null,
391
+ "frozen": false,
392
+ "suite": [
393
+ "lighteval"
394
+ ],
395
+ "original_num_docs": 351,
396
+ "effective_num_docs": 351,
397
+ "trust_dataset": true,
398
+ "must_remove_duplicate_docs": null,
399
+ "version": 0
400
+ },
401
+ "lighteval|agieval:gaokao-physics": {
402
+ "name": "agieval:gaokao-physics",
403
+ "prompt_function": "agieval",
404
+ "hf_repo": "dmayhem93/agieval-gaokao-physics",
405
+ "hf_subset": "default",
406
+ "metric": [
407
+ "loglikelihood_acc",
408
+ "loglikelihood_acc_norm_nospace"
409
+ ],
410
+ "hf_avail_splits": [
411
+ "test"
412
+ ],
413
+ "evaluation_splits": [
414
+ "test"
415
+ ],
416
+ "few_shots_split": null,
417
+ "few_shots_select": "random_sampling",
418
+ "generation_size": 1,
419
+ "stop_sequence": null,
420
+ "output_regex": null,
421
+ "num_samples": null,
422
+ "frozen": false,
423
+ "suite": [
424
+ "lighteval"
425
+ ],
426
+ "original_num_docs": 200,
427
+ "effective_num_docs": 200,
428
+ "trust_dataset": true,
429
+ "must_remove_duplicate_docs": null,
430
+ "version": 0
431
+ },
432
+ "lighteval|agieval:logiqa-en": {
433
+ "name": "agieval:logiqa-en",
434
+ "prompt_function": "agieval",
435
+ "hf_repo": "dmayhem93/agieval-logiqa-en",
436
+ "hf_subset": "default",
437
+ "metric": [
438
+ "loglikelihood_acc",
439
+ "loglikelihood_acc_norm_nospace"
440
+ ],
441
+ "hf_avail_splits": [
442
+ "test"
443
+ ],
444
+ "evaluation_splits": [
445
+ "test"
446
+ ],
447
+ "few_shots_split": null,
448
+ "few_shots_select": "random_sampling",
449
+ "generation_size": 1,
450
+ "stop_sequence": null,
451
+ "output_regex": null,
452
+ "num_samples": null,
453
+ "frozen": false,
454
+ "suite": [
455
+ "lighteval"
456
+ ],
457
+ "original_num_docs": 651,
458
+ "effective_num_docs": 651,
459
+ "trust_dataset": true,
460
+ "must_remove_duplicate_docs": null,
461
+ "version": 0
462
+ },
463
+ "lighteval|agieval:logiqa-zh": {
464
+ "name": "agieval:logiqa-zh",
465
+ "prompt_function": "agieval",
466
+ "hf_repo": "dmayhem93/agieval-logiqa-zh",
467
+ "hf_subset": "default",
468
+ "metric": [
469
+ "loglikelihood_acc",
470
+ "loglikelihood_acc_norm_nospace"
471
+ ],
472
+ "hf_avail_splits": [
473
+ "test"
474
+ ],
475
+ "evaluation_splits": [
476
+ "test"
477
+ ],
478
+ "few_shots_split": null,
479
+ "few_shots_select": "random_sampling",
480
+ "generation_size": 1,
481
+ "stop_sequence": null,
482
+ "output_regex": null,
483
+ "num_samples": null,
484
+ "frozen": false,
485
+ "suite": [
486
+ "lighteval"
487
+ ],
488
+ "original_num_docs": 651,
489
+ "effective_num_docs": 651,
490
+ "trust_dataset": true,
491
+ "must_remove_duplicate_docs": null,
492
+ "version": 0
493
+ },
494
+ "lighteval|agieval:lsat-ar": {
495
+ "name": "agieval:lsat-ar",
496
+ "prompt_function": "agieval",
497
+ "hf_repo": "dmayhem93/agieval-lsat-ar",
498
+ "hf_subset": "default",
499
+ "metric": [
500
+ "loglikelihood_acc",
501
+ "loglikelihood_acc_norm_nospace"
502
+ ],
503
+ "hf_avail_splits": [
504
+ "test"
505
+ ],
506
+ "evaluation_splits": [
507
+ "test"
508
+ ],
509
+ "few_shots_split": null,
510
+ "few_shots_select": "random_sampling",
511
+ "generation_size": 1,
512
+ "stop_sequence": null,
513
+ "output_regex": null,
514
+ "num_samples": null,
515
+ "frozen": false,
516
+ "suite": [
517
+ "lighteval"
518
+ ],
519
+ "original_num_docs": 230,
520
+ "effective_num_docs": 230,
521
+ "trust_dataset": true,
522
+ "must_remove_duplicate_docs": null,
523
+ "version": 0
524
+ },
525
+ "lighteval|agieval:lsat-lr": {
526
+ "name": "agieval:lsat-lr",
527
+ "prompt_function": "agieval",
528
+ "hf_repo": "dmayhem93/agieval-lsat-lr",
529
+ "hf_subset": "default",
530
+ "metric": [
531
+ "loglikelihood_acc",
532
+ "loglikelihood_acc_norm_nospace"
533
+ ],
534
+ "hf_avail_splits": [
535
+ "test"
536
+ ],
537
+ "evaluation_splits": [
538
+ "test"
539
+ ],
540
+ "few_shots_split": null,
541
+ "few_shots_select": "random_sampling",
542
+ "generation_size": 1,
543
+ "stop_sequence": null,
544
+ "output_regex": null,
545
+ "num_samples": null,
546
+ "frozen": false,
547
+ "suite": [
548
+ "lighteval"
549
+ ],
550
+ "original_num_docs": 510,
551
+ "effective_num_docs": 510,
552
+ "trust_dataset": true,
553
+ "must_remove_duplicate_docs": null,
554
+ "version": 0
555
+ },
556
+ "lighteval|agieval:lsat-rc": {
557
+ "name": "agieval:lsat-rc",
558
+ "prompt_function": "agieval",
559
+ "hf_repo": "dmayhem93/agieval-lsat-rc",
560
+ "hf_subset": "default",
561
+ "metric": [
562
+ "loglikelihood_acc",
563
+ "loglikelihood_acc_norm_nospace"
564
+ ],
565
+ "hf_avail_splits": [
566
+ "test"
567
+ ],
568
+ "evaluation_splits": [
569
+ "test"
570
+ ],
571
+ "few_shots_split": null,
572
+ "few_shots_select": "random_sampling",
573
+ "generation_size": 1,
574
+ "stop_sequence": null,
575
+ "output_regex": null,
576
+ "num_samples": null,
577
+ "frozen": false,
578
+ "suite": [
579
+ "lighteval"
580
+ ],
581
+ "original_num_docs": 269,
582
+ "effective_num_docs": 269,
583
+ "trust_dataset": true,
584
+ "must_remove_duplicate_docs": null,
585
+ "version": 0
586
+ },
587
+ "lighteval|agieval:sat-en": {
588
+ "name": "agieval:sat-en",
589
+ "prompt_function": "agieval",
590
+ "hf_repo": "dmayhem93/agieval-sat-en",
591
+ "hf_subset": "default",
592
+ "metric": [
593
+ "loglikelihood_acc",
594
+ "loglikelihood_acc_norm_nospace"
595
+ ],
596
+ "hf_avail_splits": [
597
+ "test"
598
+ ],
599
+ "evaluation_splits": [
600
+ "test"
601
+ ],
602
+ "few_shots_split": null,
603
+ "few_shots_select": "random_sampling",
604
+ "generation_size": 1,
605
+ "stop_sequence": null,
606
+ "output_regex": null,
607
+ "num_samples": null,
608
+ "frozen": false,
609
+ "suite": [
610
+ "lighteval"
611
+ ],
612
+ "original_num_docs": 206,
613
+ "effective_num_docs": 206,
614
+ "trust_dataset": true,
615
+ "must_remove_duplicate_docs": null,
616
+ "version": 0
617
+ },
618
+ "lighteval|agieval:sat-en-without-passage": {
619
+ "name": "agieval:sat-en-without-passage",
620
+ "prompt_function": "agieval",
621
+ "hf_repo": "dmayhem93/agieval-sat-en-without-passage",
622
+ "hf_subset": "default",
623
+ "metric": [
624
+ "loglikelihood_acc",
625
+ "loglikelihood_acc_norm_nospace"
626
+ ],
627
+ "hf_avail_splits": [
628
+ "test"
629
+ ],
630
+ "evaluation_splits": [
631
+ "test"
632
+ ],
633
+ "few_shots_split": null,
634
+ "few_shots_select": "random_sampling",
635
+ "generation_size": 1,
636
+ "stop_sequence": null,
637
+ "output_regex": null,
638
+ "num_samples": null,
639
+ "frozen": false,
640
+ "suite": [
641
+ "lighteval"
642
+ ],
643
+ "original_num_docs": 206,
644
+ "effective_num_docs": 206,
645
+ "trust_dataset": true,
646
+ "must_remove_duplicate_docs": null,
647
+ "version": 0
648
+ },
649
+ "lighteval|agieval:sat-math": {
650
+ "name": "agieval:sat-math",
651
+ "prompt_function": "agieval",
652
+ "hf_repo": "dmayhem93/agieval-sat-math",
653
+ "hf_subset": "default",
654
+ "metric": [
655
+ "loglikelihood_acc",
656
+ "loglikelihood_acc_norm_nospace"
657
+ ],
658
+ "hf_avail_splits": [
659
+ "test"
660
+ ],
661
+ "evaluation_splits": [
662
+ "test"
663
+ ],
664
+ "few_shots_split": null,
665
+ "few_shots_select": "random_sampling",
666
+ "generation_size": 1,
667
+ "stop_sequence": null,
668
+ "output_regex": null,
669
+ "num_samples": null,
670
+ "frozen": false,
671
+ "suite": [
672
+ "lighteval"
673
+ ],
674
+ "original_num_docs": 220,
675
+ "effective_num_docs": 220,
676
+ "trust_dataset": true,
677
+ "must_remove_duplicate_docs": null,
678
+ "version": 0
679
+ }
680
+ },
681
+ "summary_tasks": {
682
+ "lighteval|agieval:aqua-rat|0": {
683
+ "hashes": {
684
+ "hash_examples": "f09607f69e5b7525",
685
+ "hash_full_prompts": "45f162411ace6fc2",
686
+ "hash_input_tokens": "23aae2a0de54fdb1",
687
+ "hash_cont_tokens": "3bd5c620772b1bb0"
688
+ },
689
+ "truncated": 0,
690
+ "non_truncated": 254,
691
+ "padded": 1270,
692
+ "non_padded": 0,
693
+ "effective_few_shots": 0.0,
694
+ "num_truncated_few_shots": 0
695
+ },
696
+ "lighteval|agieval:gaokao-biology|0": {
697
+ "hashes": {
698
+ "hash_examples": "f262eaf4a72db963",
699
+ "hash_full_prompts": "9065e2c284053787",
700
+ "hash_input_tokens": "33b34cfc3f023b55",
701
+ "hash_cont_tokens": "0579b1b36a9e71ef"
702
+ },
703
+ "truncated": 0,
704
+ "non_truncated": 210,
705
+ "padded": 840,
706
+ "non_padded": 0,
707
+ "effective_few_shots": 0.0,
708
+ "num_truncated_few_shots": 0
709
+ },
710
+ "lighteval|agieval:gaokao-chemistry|0": {
711
+ "hashes": {
712
+ "hash_examples": "47f2e649f58d9da5",
713
+ "hash_full_prompts": "2590748ef4ec7437",
714
+ "hash_input_tokens": "bcd3c88c123c7893",
715
+ "hash_cont_tokens": "e79877a4f290a5f6"
716
+ },
717
+ "truncated": 0,
718
+ "non_truncated": 207,
719
+ "padded": 825,
720
+ "non_padded": 6,
721
+ "effective_few_shots": 0.0,
722
+ "num_truncated_few_shots": 0
723
+ },
724
+ "lighteval|agieval:gaokao-chinese|0": {
725
+ "hashes": {
726
+ "hash_examples": "1010b21fde4726ab",
727
+ "hash_full_prompts": "cead2911f25db59d",
728
+ "hash_input_tokens": "9cd1da7d8d28f48a",
729
+ "hash_cont_tokens": "a85942d5d1657847"
730
+ },
731
+ "truncated": 0,
732
+ "non_truncated": 246,
733
+ "padded": 981,
734
+ "non_padded": 3,
735
+ "effective_few_shots": 0.0,
736
+ "num_truncated_few_shots": 0
737
+ },
738
+ "lighteval|agieval:gaokao-english|0": {
739
+ "hashes": {
740
+ "hash_examples": "4864e492a350ae93",
741
+ "hash_full_prompts": "135d7b429332d260",
742
+ "hash_input_tokens": "b8ef3b3f819352d0",
743
+ "hash_cont_tokens": "3f5d765956a4d3cd"
744
+ },
745
+ "truncated": 0,
746
+ "non_truncated": 306,
747
+ "padded": 1224,
748
+ "non_padded": 0,
749
+ "effective_few_shots": 0.0,
750
+ "num_truncated_few_shots": 0
751
+ },
752
+ "lighteval|agieval:gaokao-geography|0": {
753
+ "hashes": {
754
+ "hash_examples": "ec3a021e37650e7d",
755
+ "hash_full_prompts": "20fed5ec555f6ec4",
756
+ "hash_input_tokens": "c6bcf260356967cc",
757
+ "hash_cont_tokens": "958639a9229da878"
758
+ },
759
+ "truncated": 0,
760
+ "non_truncated": 199,
761
+ "padded": 794,
762
+ "non_padded": 2,
763
+ "effective_few_shots": 0.0,
764
+ "num_truncated_few_shots": 0
765
+ },
766
+ "lighteval|agieval:gaokao-history|0": {
767
+ "hashes": {
768
+ "hash_examples": "b3fad1596f1ae1f9",
769
+ "hash_full_prompts": "d516152c37bfabec",
770
+ "hash_input_tokens": "e7525e91b8bcab93",
771
+ "hash_cont_tokens": "2823df28272b3169"
772
+ },
773
+ "truncated": 0,
774
+ "non_truncated": 235,
775
+ "padded": 938,
776
+ "non_padded": 2,
777
+ "effective_few_shots": 0.0,
778
+ "num_truncated_few_shots": 0
779
+ },
780
+ "lighteval|agieval:gaokao-mathqa|0": {
781
+ "hashes": {
782
+ "hash_examples": "1d1088556861b0b0",
783
+ "hash_full_prompts": "012e211321eb487c",
784
+ "hash_input_tokens": "b8da4c3dd0ce4e98",
785
+ "hash_cont_tokens": "bebfeb0c9a213a00"
786
+ },
787
+ "truncated": 0,
788
+ "non_truncated": 351,
789
+ "padded": 1403,
790
+ "non_padded": 1,
791
+ "effective_few_shots": 0.0,
792
+ "num_truncated_few_shots": 0
793
+ },
794
+ "lighteval|agieval:gaokao-physics|0": {
795
+ "hashes": {
796
+ "hash_examples": "eb05f035c7bfca2f",
797
+ "hash_full_prompts": "1b3de6a23416b97c",
798
+ "hash_input_tokens": "11d7e00208eaebda",
799
+ "hash_cont_tokens": "11fcd0ecf2781f72"
800
+ },
801
+ "truncated": 0,
802
+ "non_truncated": 200,
803
+ "padded": 792,
804
+ "non_padded": 8,
805
+ "effective_few_shots": 0.0,
806
+ "num_truncated_few_shots": 0
807
+ },
808
+ "lighteval|agieval:logiqa-en|0": {
809
+ "hashes": {
810
+ "hash_examples": "0a688a45f69c21e0",
811
+ "hash_full_prompts": "e6286df1c8938f3d",
812
+ "hash_input_tokens": "143338cb78c7e447",
813
+ "hash_cont_tokens": "ec4007d0a2baccd0"
814
+ },
815
+ "truncated": 0,
816
+ "non_truncated": 651,
817
+ "padded": 2591,
818
+ "non_padded": 13,
819
+ "effective_few_shots": 0.0,
820
+ "num_truncated_few_shots": 0
821
+ },
822
+ "lighteval|agieval:logiqa-zh|0": {
823
+ "hashes": {
824
+ "hash_examples": "620d6888b6012ea5",
825
+ "hash_full_prompts": "e576c5e991ec11ab",
826
+ "hash_input_tokens": "9bb22aebecab6f6c",
827
+ "hash_cont_tokens": "7d6fa9469c400168"
828
+ },
829
+ "truncated": 0,
830
+ "non_truncated": 651,
831
+ "padded": 2573,
832
+ "non_padded": 31,
833
+ "effective_few_shots": 0.0,
834
+ "num_truncated_few_shots": 0
835
+ },
836
+ "lighteval|agieval:lsat-ar|0": {
837
+ "hashes": {
838
+ "hash_examples": "627c8f5ccd5da209",
839
+ "hash_full_prompts": "f6c74737c1c89d2e",
840
+ "hash_input_tokens": "7281aac3ff127be9",
841
+ "hash_cont_tokens": "963fb60f601921f8"
842
+ },
843
+ "truncated": 0,
844
+ "non_truncated": 230,
845
+ "padded": 1134,
846
+ "non_padded": 16,
847
+ "effective_few_shots": 0.0,
848
+ "num_truncated_few_shots": 0
849
+ },
850
+ "lighteval|agieval:lsat-lr|0": {
851
+ "hashes": {
852
+ "hash_examples": "794641c86de172f5",
853
+ "hash_full_prompts": "08efe3e7eb75341b",
854
+ "hash_input_tokens": "e4c2fd73238bb80d",
855
+ "hash_cont_tokens": "2688b42e570332eb"
856
+ },
857
+ "truncated": 0,
858
+ "non_truncated": 510,
859
+ "padded": 2528,
860
+ "non_padded": 22,
861
+ "effective_few_shots": 0.0,
862
+ "num_truncated_few_shots": 0
863
+ },
864
+ "lighteval|agieval:lsat-rc|0": {
865
+ "hashes": {
866
+ "hash_examples": "35981ed917ea01cf",
867
+ "hash_full_prompts": "c492cc61e80db65e",
868
+ "hash_input_tokens": "2c6c15ef3c54359c",
869
+ "hash_cont_tokens": "6a5e736a7985a643"
870
+ },
871
+ "truncated": 0,
872
+ "non_truncated": 269,
873
+ "padded": 1345,
874
+ "non_padded": 0,
875
+ "effective_few_shots": 0.0,
876
+ "num_truncated_few_shots": 0
877
+ },
878
+ "lighteval|agieval:sat-en|0": {
879
+ "hashes": {
880
+ "hash_examples": "041c39c646536a1e",
881
+ "hash_full_prompts": "2bf37943376e3266",
882
+ "hash_input_tokens": "1c4a887f57721fd6",
883
+ "hash_cont_tokens": "a48397ff1dec7238"
884
+ },
885
+ "truncated": 0,
886
+ "non_truncated": 206,
887
+ "padded": 821,
888
+ "non_padded": 0,
889
+ "effective_few_shots": 0.0,
890
+ "num_truncated_few_shots": 0
891
+ },
892
+ "lighteval|agieval:sat-en-without-passage|0": {
893
+ "hashes": {
894
+ "hash_examples": "e4d9284367dff68f",
895
+ "hash_full_prompts": "e54e45a5cf33ea4c",
896
+ "hash_input_tokens": "34a9d9fffd631680",
897
+ "hash_cont_tokens": "d0dac74d66b7aeb3"
898
+ },
899
+ "truncated": 0,
900
+ "non_truncated": 206,
901
+ "padded": 814,
902
+ "non_padded": 7,
903
+ "effective_few_shots": 0.0,
904
+ "num_truncated_few_shots": 0
905
+ },
906
+ "lighteval|agieval:sat-math|0": {
907
+ "hashes": {
908
+ "hash_examples": "01db7291603fc1a0",
909
+ "hash_full_prompts": "e7eac9039959dc4e",
910
+ "hash_input_tokens": "facd30a4b8dc2e58",
911
+ "hash_cont_tokens": "1285c389d940ee00"
912
+ },
913
+ "truncated": 0,
914
+ "non_truncated": 220,
915
+ "padded": 873,
916
+ "non_padded": 7,
917
+ "effective_few_shots": 0.0,
918
+ "num_truncated_few_shots": 0
919
+ }
920
+ },
921
+ "summary_general": {
922
+ "hashes": {
923
+ "hash_examples": "da3af66181f18ddf",
924
+ "hash_full_prompts": "32b4cf577fa823b0",
925
+ "hash_input_tokens": "1c8f5afd9ce640d2",
926
+ "hash_cont_tokens": "302bf87189b05e5d"
927
+ },
928
+ "truncated": 0,
929
+ "non_truncated": 5151,
930
+ "padded": 21746,
931
+ "non_padded": 118,
932
+ "num_truncated_few_shots": 0
933
+ }
934
+ }