lewtun HF staff commited on
Commit
70e7343
·
verified ·
1 Parent(s): 1f6d2c9

Upload eval_results/orpo-explorers/argilla-mistral-orpo-OpenHermesPreferences-50k-beta-0.2/main/bbh/results_2024-05-09T19-42-29.906304.json with huggingface_hub

Browse files
eval_results/orpo-explorers/argilla-mistral-orpo-OpenHermesPreferences-50k-beta-0.2/main/bbh/results_2024-05-09T19-42-29.906304.json ADDED
@@ -0,0 +1,1000 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2423635.741185222,
9
+ "end_time": 2423731.717094719,
10
+ "total_evaluation_time_secondes": "95.97590949712321",
11
+ "model_name": "orpo-explorers/argilla-mistral-orpo-OpenHermesPreferences-50k-beta-0.2",
12
+ "model_sha": "8e456b857961e00f2e1b076756e84b97a3b88ea5",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|bigbench:causal_judgment|0": {
19
+ "acc": 0.5421052631578948,
20
+ "acc_stderr": 0.03624046284425957
21
+ },
22
+ "lighteval|bigbench:date_understanding|0": {
23
+ "acc": 0.4173441734417344,
24
+ "acc_stderr": 0.025705692903559226
25
+ },
26
+ "lighteval|bigbench:disambiguation_qa|0": {
27
+ "acc": 0.6007751937984496,
28
+ "acc_stderr": 0.03054908408103616
29
+ },
30
+ "lighteval|bigbench:geometric_shapes|0": {
31
+ "acc": 0.19166666666666668,
32
+ "acc_stderr": 0.020774056803809365
33
+ },
34
+ "lighteval|bigbench:logical_deduction_five_objects|0": {
35
+ "acc": 0.2,
36
+ "acc_stderr": 0.01790645924143384
37
+ },
38
+ "lighteval|bigbench:logical_deduction_seven_objects|0": {
39
+ "acc": 0.14285714285714285,
40
+ "acc_stderr": 0.013235458703202278
41
+ },
42
+ "lighteval|bigbench:logical_deduction_three_objects|0": {
43
+ "acc": 0.3333333333333333,
44
+ "acc_stderr": 0.027262027336984396
45
+ },
46
+ "lighteval|bigbench:movie_recommendation|0": {
47
+ "acc": 0.498,
48
+ "acc_stderr": 0.02238289498648353
49
+ },
50
+ "lighteval|bigbench:navigate|0": {
51
+ "acc": 0.524,
52
+ "acc_stderr": 0.015801065586651755
53
+ },
54
+ "lighteval|bigbench:reasoning_about_colored_objects|0": {
55
+ "acc": 0.263,
56
+ "acc_stderr": 0.009847029094655511
57
+ },
58
+ "lighteval|bigbench:ruin_names|0": {
59
+ "acc": 0.26339285714285715,
60
+ "acc_stderr": 0.02083369001657861
61
+ },
62
+ "lighteval|bigbench:salient_translation_error_detection|0": {
63
+ "acc": 0.28857715430861725,
64
+ "acc_stderr": 0.014349847898982434
65
+ },
66
+ "lighteval|bigbench:snarks|0": {
67
+ "acc": 0.4696132596685083,
68
+ "acc_stderr": 0.037198913216803256
69
+ },
70
+ "lighteval|bigbench:sports_understanding|0": {
71
+ "acc": 0.552,
72
+ "acc_stderr": 0.01573351656634783
73
+ },
74
+ "lighteval|bigbench:temporal_sequences|0": {
75
+ "acc": 0.069,
76
+ "acc_stderr": 0.008018934050315151
77
+ },
78
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": {
79
+ "acc": 0.2,
80
+ "acc_stderr": 0.011318236699485788
81
+ },
82
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": {
83
+ "acc": 0.14285714285714285,
84
+ "acc_stderr": 0.008367248752248818
85
+ },
86
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": {
87
+ "acc": 0.3333333333333333,
88
+ "acc_stderr": 0.027262027336984396
89
+ },
90
+ "lighteval|bigbench:_average|0": {
91
+ "acc": 0.3351030844758711,
92
+ "acc_stderr": 0.02015481367332344
93
+ },
94
+ "all": {
95
+ "acc": 0.3351030844758711,
96
+ "acc_stderr": 0.02015481367332344
97
+ }
98
+ },
99
+ "versions": {
100
+ "lighteval|bigbench:causal_judgment|0": 0,
101
+ "lighteval|bigbench:date_understanding|0": 0,
102
+ "lighteval|bigbench:disambiguation_qa|0": 0,
103
+ "lighteval|bigbench:geometric_shapes|0": 0,
104
+ "lighteval|bigbench:logical_deduction_five_objects|0": 0,
105
+ "lighteval|bigbench:logical_deduction_seven_objects|0": 0,
106
+ "lighteval|bigbench:logical_deduction_three_objects|0": 0,
107
+ "lighteval|bigbench:movie_recommendation|0": 0,
108
+ "lighteval|bigbench:navigate|0": 0,
109
+ "lighteval|bigbench:reasoning_about_colored_objects|0": 0,
110
+ "lighteval|bigbench:ruin_names|0": 0,
111
+ "lighteval|bigbench:salient_translation_error_detection|0": 0,
112
+ "lighteval|bigbench:snarks|0": 0,
113
+ "lighteval|bigbench:sports_understanding|0": 0,
114
+ "lighteval|bigbench:temporal_sequences|0": 0,
115
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": 0,
116
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": 0,
117
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": 0
118
+ },
119
+ "config_tasks": {
120
+ "lighteval|bigbench:causal_judgment": {
121
+ "name": "bigbench:causal_judgment",
122
+ "prompt_function": "bbh_lighteval",
123
+ "hf_repo": "lighteval/bbh",
124
+ "hf_subset": "causal_judgement",
125
+ "metric": [
126
+ "loglikelihood_acc_single_token"
127
+ ],
128
+ "hf_avail_splits": [
129
+ "train"
130
+ ],
131
+ "evaluation_splits": [
132
+ "train"
133
+ ],
134
+ "few_shots_split": null,
135
+ "few_shots_select": null,
136
+ "generation_size": -1,
137
+ "stop_sequence": [
138
+ "</s>",
139
+ "Q:",
140
+ "\n\n"
141
+ ],
142
+ "output_regex": null,
143
+ "num_samples": null,
144
+ "frozen": false,
145
+ "suite": [
146
+ "lighteval"
147
+ ],
148
+ "original_num_docs": 190,
149
+ "effective_num_docs": 190,
150
+ "trust_dataset": true,
151
+ "must_remove_duplicate_docs": null,
152
+ "version": 0
153
+ },
154
+ "lighteval|bigbench:date_understanding": {
155
+ "name": "bigbench:date_understanding",
156
+ "prompt_function": "bbh_lighteval",
157
+ "hf_repo": "lighteval/bbh",
158
+ "hf_subset": "date_understanding",
159
+ "metric": [
160
+ "loglikelihood_acc_single_token"
161
+ ],
162
+ "hf_avail_splits": [
163
+ "train"
164
+ ],
165
+ "evaluation_splits": [
166
+ "train"
167
+ ],
168
+ "few_shots_split": null,
169
+ "few_shots_select": null,
170
+ "generation_size": -1,
171
+ "stop_sequence": [
172
+ "</s>",
173
+ "Q:",
174
+ "\n\n"
175
+ ],
176
+ "output_regex": null,
177
+ "num_samples": null,
178
+ "frozen": false,
179
+ "suite": [
180
+ "lighteval"
181
+ ],
182
+ "original_num_docs": 369,
183
+ "effective_num_docs": 369,
184
+ "trust_dataset": true,
185
+ "must_remove_duplicate_docs": null,
186
+ "version": 0
187
+ },
188
+ "lighteval|bigbench:disambiguation_qa": {
189
+ "name": "bigbench:disambiguation_qa",
190
+ "prompt_function": "bbh_lighteval",
191
+ "hf_repo": "lighteval/bbh",
192
+ "hf_subset": "disambiguation_qa",
193
+ "metric": [
194
+ "loglikelihood_acc_single_token"
195
+ ],
196
+ "hf_avail_splits": [
197
+ "train"
198
+ ],
199
+ "evaluation_splits": [
200
+ "train"
201
+ ],
202
+ "few_shots_split": null,
203
+ "few_shots_select": null,
204
+ "generation_size": -1,
205
+ "stop_sequence": [
206
+ "</s>",
207
+ "Q:",
208
+ "\n\n"
209
+ ],
210
+ "output_regex": null,
211
+ "num_samples": null,
212
+ "frozen": false,
213
+ "suite": [
214
+ "lighteval"
215
+ ],
216
+ "original_num_docs": 258,
217
+ "effective_num_docs": 258,
218
+ "trust_dataset": true,
219
+ "must_remove_duplicate_docs": null,
220
+ "version": 0
221
+ },
222
+ "lighteval|bigbench:geometric_shapes": {
223
+ "name": "bigbench:geometric_shapes",
224
+ "prompt_function": "bbh_lighteval",
225
+ "hf_repo": "lighteval/bbh",
226
+ "hf_subset": "geometric_shapes",
227
+ "metric": [
228
+ "loglikelihood_acc_single_token"
229
+ ],
230
+ "hf_avail_splits": [
231
+ "train"
232
+ ],
233
+ "evaluation_splits": [
234
+ "train"
235
+ ],
236
+ "few_shots_split": null,
237
+ "few_shots_select": null,
238
+ "generation_size": -1,
239
+ "stop_sequence": [
240
+ "</s>",
241
+ "Q:",
242
+ "\n\n"
243
+ ],
244
+ "output_regex": null,
245
+ "num_samples": null,
246
+ "frozen": false,
247
+ "suite": [
248
+ "lighteval"
249
+ ],
250
+ "original_num_docs": 360,
251
+ "effective_num_docs": 360,
252
+ "trust_dataset": true,
253
+ "must_remove_duplicate_docs": null,
254
+ "version": 0
255
+ },
256
+ "lighteval|bigbench:logical_deduction_five_objects": {
257
+ "name": "bigbench:logical_deduction_five_objects",
258
+ "prompt_function": "bbh_lighteval",
259
+ "hf_repo": "lighteval/bbh",
260
+ "hf_subset": "logical_deduction_five_objects",
261
+ "metric": [
262
+ "loglikelihood_acc_single_token"
263
+ ],
264
+ "hf_avail_splits": [
265
+ "train"
266
+ ],
267
+ "evaluation_splits": [
268
+ "train"
269
+ ],
270
+ "few_shots_split": null,
271
+ "few_shots_select": null,
272
+ "generation_size": -1,
273
+ "stop_sequence": [
274
+ "</s>",
275
+ "Q:",
276
+ "\n\n"
277
+ ],
278
+ "output_regex": null,
279
+ "num_samples": null,
280
+ "frozen": false,
281
+ "suite": [
282
+ "lighteval"
283
+ ],
284
+ "original_num_docs": 500,
285
+ "effective_num_docs": 500,
286
+ "trust_dataset": true,
287
+ "must_remove_duplicate_docs": null,
288
+ "version": 0
289
+ },
290
+ "lighteval|bigbench:logical_deduction_seven_objects": {
291
+ "name": "bigbench:logical_deduction_seven_objects",
292
+ "prompt_function": "bbh_lighteval",
293
+ "hf_repo": "lighteval/bbh",
294
+ "hf_subset": "logical_deduction_seven_objects",
295
+ "metric": [
296
+ "loglikelihood_acc_single_token"
297
+ ],
298
+ "hf_avail_splits": [
299
+ "train"
300
+ ],
301
+ "evaluation_splits": [
302
+ "train"
303
+ ],
304
+ "few_shots_split": null,
305
+ "few_shots_select": null,
306
+ "generation_size": -1,
307
+ "stop_sequence": [
308
+ "</s>",
309
+ "Q:",
310
+ "\n\n"
311
+ ],
312
+ "output_regex": null,
313
+ "num_samples": null,
314
+ "frozen": false,
315
+ "suite": [
316
+ "lighteval"
317
+ ],
318
+ "original_num_docs": 700,
319
+ "effective_num_docs": 700,
320
+ "trust_dataset": true,
321
+ "must_remove_duplicate_docs": null,
322
+ "version": 0
323
+ },
324
+ "lighteval|bigbench:logical_deduction_three_objects": {
325
+ "name": "bigbench:logical_deduction_three_objects",
326
+ "prompt_function": "bbh_lighteval",
327
+ "hf_repo": "lighteval/bbh",
328
+ "hf_subset": "logical_deduction_three_objects",
329
+ "metric": [
330
+ "loglikelihood_acc_single_token"
331
+ ],
332
+ "hf_avail_splits": [
333
+ "train"
334
+ ],
335
+ "evaluation_splits": [
336
+ "train"
337
+ ],
338
+ "few_shots_split": null,
339
+ "few_shots_select": null,
340
+ "generation_size": -1,
341
+ "stop_sequence": [
342
+ "</s>",
343
+ "Q:",
344
+ "\n\n"
345
+ ],
346
+ "output_regex": null,
347
+ "num_samples": null,
348
+ "frozen": false,
349
+ "suite": [
350
+ "lighteval"
351
+ ],
352
+ "original_num_docs": 300,
353
+ "effective_num_docs": 300,
354
+ "trust_dataset": true,
355
+ "must_remove_duplicate_docs": null,
356
+ "version": 0
357
+ },
358
+ "lighteval|bigbench:movie_recommendation": {
359
+ "name": "bigbench:movie_recommendation",
360
+ "prompt_function": "bbh_lighteval",
361
+ "hf_repo": "lighteval/bbh",
362
+ "hf_subset": "movie_recommendation",
363
+ "metric": [
364
+ "loglikelihood_acc_single_token"
365
+ ],
366
+ "hf_avail_splits": [
367
+ "train"
368
+ ],
369
+ "evaluation_splits": [
370
+ "train"
371
+ ],
372
+ "few_shots_split": null,
373
+ "few_shots_select": null,
374
+ "generation_size": -1,
375
+ "stop_sequence": [
376
+ "</s>",
377
+ "Q:",
378
+ "\n\n"
379
+ ],
380
+ "output_regex": null,
381
+ "num_samples": null,
382
+ "frozen": false,
383
+ "suite": [
384
+ "lighteval"
385
+ ],
386
+ "original_num_docs": 500,
387
+ "effective_num_docs": 500,
388
+ "trust_dataset": true,
389
+ "must_remove_duplicate_docs": null,
390
+ "version": 0
391
+ },
392
+ "lighteval|bigbench:navigate": {
393
+ "name": "bigbench:navigate",
394
+ "prompt_function": "bbh_lighteval",
395
+ "hf_repo": "lighteval/bbh",
396
+ "hf_subset": "navigate",
397
+ "metric": [
398
+ "loglikelihood_acc_single_token"
399
+ ],
400
+ "hf_avail_splits": [
401
+ "train"
402
+ ],
403
+ "evaluation_splits": [
404
+ "train"
405
+ ],
406
+ "few_shots_split": null,
407
+ "few_shots_select": null,
408
+ "generation_size": -1,
409
+ "stop_sequence": [
410
+ "</s>",
411
+ "Q:",
412
+ "\n\n"
413
+ ],
414
+ "output_regex": null,
415
+ "num_samples": null,
416
+ "frozen": false,
417
+ "suite": [
418
+ "lighteval"
419
+ ],
420
+ "original_num_docs": 1000,
421
+ "effective_num_docs": 1000,
422
+ "trust_dataset": true,
423
+ "must_remove_duplicate_docs": null,
424
+ "version": 0
425
+ },
426
+ "lighteval|bigbench:reasoning_about_colored_objects": {
427
+ "name": "bigbench:reasoning_about_colored_objects",
428
+ "prompt_function": "bbh_lighteval",
429
+ "hf_repo": "lighteval/bbh",
430
+ "hf_subset": "reasoning_about_colored_objects",
431
+ "metric": [
432
+ "loglikelihood_acc_single_token"
433
+ ],
434
+ "hf_avail_splits": [
435
+ "train"
436
+ ],
437
+ "evaluation_splits": [
438
+ "train"
439
+ ],
440
+ "few_shots_split": null,
441
+ "few_shots_select": null,
442
+ "generation_size": -1,
443
+ "stop_sequence": [
444
+ "</s>",
445
+ "Q:",
446
+ "\n\n"
447
+ ],
448
+ "output_regex": null,
449
+ "num_samples": null,
450
+ "frozen": false,
451
+ "suite": [
452
+ "lighteval"
453
+ ],
454
+ "original_num_docs": 2000,
455
+ "effective_num_docs": 2000,
456
+ "trust_dataset": true,
457
+ "must_remove_duplicate_docs": null,
458
+ "version": 0
459
+ },
460
+ "lighteval|bigbench:ruin_names": {
461
+ "name": "bigbench:ruin_names",
462
+ "prompt_function": "bbh_lighteval",
463
+ "hf_repo": "lighteval/bbh",
464
+ "hf_subset": "ruin_names",
465
+ "metric": [
466
+ "loglikelihood_acc_single_token"
467
+ ],
468
+ "hf_avail_splits": [
469
+ "train"
470
+ ],
471
+ "evaluation_splits": [
472
+ "train"
473
+ ],
474
+ "few_shots_split": null,
475
+ "few_shots_select": null,
476
+ "generation_size": -1,
477
+ "stop_sequence": [
478
+ "</s>",
479
+ "Q:",
480
+ "\n\n"
481
+ ],
482
+ "output_regex": null,
483
+ "num_samples": null,
484
+ "frozen": false,
485
+ "suite": [
486
+ "lighteval"
487
+ ],
488
+ "original_num_docs": 448,
489
+ "effective_num_docs": 448,
490
+ "trust_dataset": true,
491
+ "must_remove_duplicate_docs": null,
492
+ "version": 0
493
+ },
494
+ "lighteval|bigbench:salient_translation_error_detection": {
495
+ "name": "bigbench:salient_translation_error_detection",
496
+ "prompt_function": "bbh_lighteval",
497
+ "hf_repo": "lighteval/bbh",
498
+ "hf_subset": "salient_translation_error_detection",
499
+ "metric": [
500
+ "loglikelihood_acc_single_token"
501
+ ],
502
+ "hf_avail_splits": [
503
+ "train"
504
+ ],
505
+ "evaluation_splits": [
506
+ "train"
507
+ ],
508
+ "few_shots_split": null,
509
+ "few_shots_select": null,
510
+ "generation_size": -1,
511
+ "stop_sequence": [
512
+ "</s>",
513
+ "Q:",
514
+ "\n\n"
515
+ ],
516
+ "output_regex": null,
517
+ "num_samples": null,
518
+ "frozen": false,
519
+ "suite": [
520
+ "lighteval"
521
+ ],
522
+ "original_num_docs": 998,
523
+ "effective_num_docs": 998,
524
+ "trust_dataset": true,
525
+ "must_remove_duplicate_docs": null,
526
+ "version": 0
527
+ },
528
+ "lighteval|bigbench:snarks": {
529
+ "name": "bigbench:snarks",
530
+ "prompt_function": "bbh_lighteval",
531
+ "hf_repo": "lighteval/bbh",
532
+ "hf_subset": "snarks",
533
+ "metric": [
534
+ "loglikelihood_acc_single_token"
535
+ ],
536
+ "hf_avail_splits": [
537
+ "train"
538
+ ],
539
+ "evaluation_splits": [
540
+ "train"
541
+ ],
542
+ "few_shots_split": null,
543
+ "few_shots_select": null,
544
+ "generation_size": -1,
545
+ "stop_sequence": [
546
+ "</s>",
547
+ "Q:",
548
+ "\n\n"
549
+ ],
550
+ "output_regex": null,
551
+ "num_samples": null,
552
+ "frozen": false,
553
+ "suite": [
554
+ "lighteval"
555
+ ],
556
+ "original_num_docs": 181,
557
+ "effective_num_docs": 181,
558
+ "trust_dataset": true,
559
+ "must_remove_duplicate_docs": null,
560
+ "version": 0
561
+ },
562
+ "lighteval|bigbench:sports_understanding": {
563
+ "name": "bigbench:sports_understanding",
564
+ "prompt_function": "bbh_lighteval",
565
+ "hf_repo": "lighteval/bbh",
566
+ "hf_subset": "sports_understanding",
567
+ "metric": [
568
+ "loglikelihood_acc_single_token"
569
+ ],
570
+ "hf_avail_splits": [
571
+ "train"
572
+ ],
573
+ "evaluation_splits": [
574
+ "train"
575
+ ],
576
+ "few_shots_split": null,
577
+ "few_shots_select": null,
578
+ "generation_size": -1,
579
+ "stop_sequence": [
580
+ "</s>",
581
+ "Q:",
582
+ "\n\n"
583
+ ],
584
+ "output_regex": null,
585
+ "num_samples": null,
586
+ "frozen": false,
587
+ "suite": [
588
+ "lighteval"
589
+ ],
590
+ "original_num_docs": 1000,
591
+ "effective_num_docs": 1000,
592
+ "trust_dataset": true,
593
+ "must_remove_duplicate_docs": null,
594
+ "version": 0
595
+ },
596
+ "lighteval|bigbench:temporal_sequences": {
597
+ "name": "bigbench:temporal_sequences",
598
+ "prompt_function": "bbh_lighteval",
599
+ "hf_repo": "lighteval/bbh",
600
+ "hf_subset": "temporal_sequences",
601
+ "metric": [
602
+ "loglikelihood_acc_single_token"
603
+ ],
604
+ "hf_avail_splits": [
605
+ "train"
606
+ ],
607
+ "evaluation_splits": [
608
+ "train"
609
+ ],
610
+ "few_shots_split": null,
611
+ "few_shots_select": null,
612
+ "generation_size": -1,
613
+ "stop_sequence": [
614
+ "</s>",
615
+ "Q:",
616
+ "\n\n"
617
+ ],
618
+ "output_regex": null,
619
+ "num_samples": null,
620
+ "frozen": false,
621
+ "suite": [
622
+ "lighteval"
623
+ ],
624
+ "original_num_docs": 1000,
625
+ "effective_num_docs": 1000,
626
+ "trust_dataset": true,
627
+ "must_remove_duplicate_docs": null,
628
+ "version": 0
629
+ },
630
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects": {
631
+ "name": "bigbench:tracking_shuffled_objects_five_objects",
632
+ "prompt_function": "bbh_lighteval",
633
+ "hf_repo": "lighteval/bbh",
634
+ "hf_subset": "tracking_shuffled_objects_five_objects",
635
+ "metric": [
636
+ "loglikelihood_acc_single_token"
637
+ ],
638
+ "hf_avail_splits": [
639
+ "train"
640
+ ],
641
+ "evaluation_splits": [
642
+ "train"
643
+ ],
644
+ "few_shots_split": null,
645
+ "few_shots_select": null,
646
+ "generation_size": -1,
647
+ "stop_sequence": [
648
+ "</s>",
649
+ "Q:",
650
+ "\n\n"
651
+ ],
652
+ "output_regex": null,
653
+ "num_samples": null,
654
+ "frozen": false,
655
+ "suite": [
656
+ "lighteval"
657
+ ],
658
+ "original_num_docs": 1250,
659
+ "effective_num_docs": 1250,
660
+ "trust_dataset": true,
661
+ "must_remove_duplicate_docs": null,
662
+ "version": 0
663
+ },
664
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects": {
665
+ "name": "bigbench:tracking_shuffled_objects_seven_objects",
666
+ "prompt_function": "bbh_lighteval",
667
+ "hf_repo": "lighteval/bbh",
668
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
669
+ "metric": [
670
+ "loglikelihood_acc_single_token"
671
+ ],
672
+ "hf_avail_splits": [
673
+ "train"
674
+ ],
675
+ "evaluation_splits": [
676
+ "train"
677
+ ],
678
+ "few_shots_split": null,
679
+ "few_shots_select": null,
680
+ "generation_size": -1,
681
+ "stop_sequence": [
682
+ "</s>",
683
+ "Q:",
684
+ "\n\n"
685
+ ],
686
+ "output_regex": null,
687
+ "num_samples": null,
688
+ "frozen": false,
689
+ "suite": [
690
+ "lighteval"
691
+ ],
692
+ "original_num_docs": 1750,
693
+ "effective_num_docs": 1750,
694
+ "trust_dataset": true,
695
+ "must_remove_duplicate_docs": null,
696
+ "version": 0
697
+ },
698
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects": {
699
+ "name": "bigbench:tracking_shuffled_objects_three_objects",
700
+ "prompt_function": "bbh_lighteval",
701
+ "hf_repo": "lighteval/bbh",
702
+ "hf_subset": "tracking_shuffled_objects_three_objects",
703
+ "metric": [
704
+ "loglikelihood_acc_single_token"
705
+ ],
706
+ "hf_avail_splits": [
707
+ "train"
708
+ ],
709
+ "evaluation_splits": [
710
+ "train"
711
+ ],
712
+ "few_shots_split": null,
713
+ "few_shots_select": null,
714
+ "generation_size": -1,
715
+ "stop_sequence": [
716
+ "</s>",
717
+ "Q:",
718
+ "\n\n"
719
+ ],
720
+ "output_regex": null,
721
+ "num_samples": null,
722
+ "frozen": false,
723
+ "suite": [
724
+ "lighteval"
725
+ ],
726
+ "original_num_docs": 300,
727
+ "effective_num_docs": 300,
728
+ "trust_dataset": true,
729
+ "must_remove_duplicate_docs": null,
730
+ "version": 0
731
+ }
732
+ },
733
+ "summary_tasks": {
734
+ "lighteval|bigbench:causal_judgment|0": {
735
+ "hashes": {
736
+ "hash_examples": "dfb1ae47218f2850",
737
+ "hash_full_prompts": "92b1cf75ca896127",
738
+ "hash_input_tokens": "3fbebfde3354c6ac",
739
+ "hash_cont_tokens": "5f93270a3abb23cb"
740
+ },
741
+ "truncated": 0,
742
+ "non_truncated": 190,
743
+ "padded": 189,
744
+ "non_padded": 1,
745
+ "effective_few_shots": 0.0,
746
+ "num_truncated_few_shots": 0
747
+ },
748
+ "lighteval|bigbench:date_understanding|0": {
749
+ "hashes": {
750
+ "hash_examples": "2b823c41500a6ec2",
751
+ "hash_full_prompts": "a086589baadb24a5",
752
+ "hash_input_tokens": "262880ca53810c51",
753
+ "hash_cont_tokens": "a0df85f715ef8665"
754
+ },
755
+ "truncated": 0,
756
+ "non_truncated": 369,
757
+ "padded": 369,
758
+ "non_padded": 0,
759
+ "effective_few_shots": 0.0,
760
+ "num_truncated_few_shots": 0
761
+ },
762
+ "lighteval|bigbench:disambiguation_qa|0": {
763
+ "hashes": {
764
+ "hash_examples": "2a4c3d41db198cea",
765
+ "hash_full_prompts": "407f0b9a565699a7",
766
+ "hash_input_tokens": "7159e757a625c662",
767
+ "hash_cont_tokens": "ea8c6c727a0e9657"
768
+ },
769
+ "truncated": 0,
770
+ "non_truncated": 258,
771
+ "padded": 258,
772
+ "non_padded": 0,
773
+ "effective_few_shots": 0.0,
774
+ "num_truncated_few_shots": 0
775
+ },
776
+ "lighteval|bigbench:geometric_shapes|0": {
777
+ "hashes": {
778
+ "hash_examples": "24aa261103911b72",
779
+ "hash_full_prompts": "c7a3189ee1642ab5",
780
+ "hash_input_tokens": "2bbd71a22a0967ff",
781
+ "hash_cont_tokens": "cfff7ac805aca5af"
782
+ },
783
+ "truncated": 0,
784
+ "non_truncated": 360,
785
+ "padded": 360,
786
+ "non_padded": 0,
787
+ "effective_few_shots": 0.0,
788
+ "num_truncated_few_shots": 0
789
+ },
790
+ "lighteval|bigbench:logical_deduction_five_objects|0": {
791
+ "hashes": {
792
+ "hash_examples": "cb5bdc92afc41f83",
793
+ "hash_full_prompts": "199211ceee2c8d60",
794
+ "hash_input_tokens": "0c67894c26b553e1",
795
+ "hash_cont_tokens": "1f4302f16658f321"
796
+ },
797
+ "truncated": 0,
798
+ "non_truncated": 500,
799
+ "padded": 500,
800
+ "non_padded": 0,
801
+ "effective_few_shots": 0.0,
802
+ "num_truncated_few_shots": 0
803
+ },
804
+ "lighteval|bigbench:logical_deduction_seven_objects|0": {
805
+ "hashes": {
806
+ "hash_examples": "b6805ea696739f9f",
807
+ "hash_full_prompts": "38f66304971bdfb0",
808
+ "hash_input_tokens": "fe8815876ceefc9f",
809
+ "hash_cont_tokens": "bc16fbeb3f267778"
810
+ },
811
+ "truncated": 0,
812
+ "non_truncated": 700,
813
+ "padded": 700,
814
+ "non_padded": 0,
815
+ "effective_few_shots": 0.0,
816
+ "num_truncated_few_shots": 0
817
+ },
818
+ "lighteval|bigbench:logical_deduction_three_objects|0": {
819
+ "hashes": {
820
+ "hash_examples": "0509e5712ab9bcdb",
821
+ "hash_full_prompts": "0a9439e283842405",
822
+ "hash_input_tokens": "bdaf9a4ad6d5308d",
823
+ "hash_cont_tokens": "17b92ec4d7f3b0a6"
824
+ },
825
+ "truncated": 0,
826
+ "non_truncated": 300,
827
+ "padded": 300,
828
+ "non_padded": 0,
829
+ "effective_few_shots": 0.0,
830
+ "num_truncated_few_shots": 0
831
+ },
832
+ "lighteval|bigbench:movie_recommendation|0": {
833
+ "hashes": {
834
+ "hash_examples": "530cc6f737830f45",
835
+ "hash_full_prompts": "e7d59f843d80e6ba",
836
+ "hash_input_tokens": "d107040c803cc28a",
837
+ "hash_cont_tokens": "1a79efad33e450d9"
838
+ },
839
+ "truncated": 0,
840
+ "non_truncated": 500,
841
+ "padded": 500,
842
+ "non_padded": 0,
843
+ "effective_few_shots": 0.0,
844
+ "num_truncated_few_shots": 0
845
+ },
846
+ "lighteval|bigbench:navigate|0": {
847
+ "hashes": {
848
+ "hash_examples": "7962ef85d0058b9a",
849
+ "hash_full_prompts": "d58b607419984968",
850
+ "hash_input_tokens": "f163e125b820f371",
851
+ "hash_cont_tokens": "4cba3a6b51df2f82"
852
+ },
853
+ "truncated": 0,
854
+ "non_truncated": 1000,
855
+ "padded": 988,
856
+ "non_padded": 12,
857
+ "effective_few_shots": 0.0,
858
+ "num_truncated_few_shots": 0
859
+ },
860
+ "lighteval|bigbench:reasoning_about_colored_objects|0": {
861
+ "hashes": {
862
+ "hash_examples": "39be1ab1677a651d",
863
+ "hash_full_prompts": "157347bb0e7fe3f1",
864
+ "hash_input_tokens": "562dab78f82dce11",
865
+ "hash_cont_tokens": "b460bf430f1c4128"
866
+ },
867
+ "truncated": 0,
868
+ "non_truncated": 2000,
869
+ "padded": 1969,
870
+ "non_padded": 31,
871
+ "effective_few_shots": 0.0,
872
+ "num_truncated_few_shots": 0
873
+ },
874
+ "lighteval|bigbench:ruin_names|0": {
875
+ "hashes": {
876
+ "hash_examples": "e9b96b31d2154941",
877
+ "hash_full_prompts": "9cb73d2fcaf5ee1e",
878
+ "hash_input_tokens": "07268ad2915fab08",
879
+ "hash_cont_tokens": "da425c7c151ed7c3"
880
+ },
881
+ "truncated": 0,
882
+ "non_truncated": 448,
883
+ "padded": 442,
884
+ "non_padded": 6,
885
+ "effective_few_shots": 0.0,
886
+ "num_truncated_few_shots": 0
887
+ },
888
+ "lighteval|bigbench:salient_translation_error_detection|0": {
889
+ "hashes": {
890
+ "hash_examples": "951ac59f7ad0427d",
891
+ "hash_full_prompts": "3598ce066bb83298",
892
+ "hash_input_tokens": "d514272792c1a43b",
893
+ "hash_cont_tokens": "52a851b402ce7859"
894
+ },
895
+ "truncated": 0,
896
+ "non_truncated": 998,
897
+ "padded": 998,
898
+ "non_padded": 0,
899
+ "effective_few_shots": 0.0,
900
+ "num_truncated_few_shots": 0
901
+ },
902
+ "lighteval|bigbench:snarks|0": {
903
+ "hashes": {
904
+ "hash_examples": "3a53eb9b9d758534",
905
+ "hash_full_prompts": "fb6c17d84dd479d6",
906
+ "hash_input_tokens": "a714129b2023dfb4",
907
+ "hash_cont_tokens": "fcf8b70a05755b30"
908
+ },
909
+ "truncated": 0,
910
+ "non_truncated": 181,
911
+ "padded": 178,
912
+ "non_padded": 3,
913
+ "effective_few_shots": 0.0,
914
+ "num_truncated_few_shots": 0
915
+ },
916
+ "lighteval|bigbench:sports_understanding|0": {
917
+ "hashes": {
918
+ "hash_examples": "bd65741f00770373",
919
+ "hash_full_prompts": "467a508a87ae3ce4",
920
+ "hash_input_tokens": "c1c3655837258597",
921
+ "hash_cont_tokens": "3a6f302fe5d32bc4"
922
+ },
923
+ "truncated": 0,
924
+ "non_truncated": 1000,
925
+ "padded": 1000,
926
+ "non_padded": 0,
927
+ "effective_few_shots": 0.0,
928
+ "num_truncated_few_shots": 0
929
+ },
930
+ "lighteval|bigbench:temporal_sequences|0": {
931
+ "hashes": {
932
+ "hash_examples": "1d13139f47cb2df7",
933
+ "hash_full_prompts": "c1b66e439c72477b",
934
+ "hash_input_tokens": "d47a810e1658bcc6",
935
+ "hash_cont_tokens": "156d739a1e6ce872"
936
+ },
937
+ "truncated": 0,
938
+ "non_truncated": 1000,
939
+ "padded": 1000,
940
+ "non_padded": 0,
941
+ "effective_few_shots": 0.0,
942
+ "num_truncated_few_shots": 0
943
+ },
944
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": {
945
+ "hashes": {
946
+ "hash_examples": "8770a702a9646648",
947
+ "hash_full_prompts": "8c353eaa84f712ff",
948
+ "hash_input_tokens": "e940ffaad712c055",
949
+ "hash_cont_tokens": "bb8b1af3f905c2e3"
950
+ },
951
+ "truncated": 0,
952
+ "non_truncated": 1250,
953
+ "padded": 1180,
954
+ "non_padded": 70,
955
+ "effective_few_shots": 0.0,
956
+ "num_truncated_few_shots": 0
957
+ },
958
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": {
959
+ "hashes": {
960
+ "hash_examples": "b469b7d073824a59",
961
+ "hash_full_prompts": "902f5b74467353eb",
962
+ "hash_input_tokens": "9d52e492d2f6cd34",
963
+ "hash_cont_tokens": "0453f99f28d935a3"
964
+ },
965
+ "truncated": 0,
966
+ "non_truncated": 1750,
967
+ "padded": 1701,
968
+ "non_padded": 49,
969
+ "effective_few_shots": 0.0,
970
+ "num_truncated_few_shots": 0
971
+ },
972
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": {
973
+ "hashes": {
974
+ "hash_examples": "0509e5712ab9bcdb",
975
+ "hash_full_prompts": "0a9439e283842405",
976
+ "hash_input_tokens": "5867c63542b2b703",
977
+ "hash_cont_tokens": "8062227d4df3c18a"
978
+ },
979
+ "truncated": 0,
980
+ "non_truncated": 300,
981
+ "padded": 294,
982
+ "non_padded": 6,
983
+ "effective_few_shots": 0.0,
984
+ "num_truncated_few_shots": 0
985
+ }
986
+ },
987
+ "summary_general": {
988
+ "hashes": {
989
+ "hash_examples": "51a30c4501ba4586",
990
+ "hash_full_prompts": "2f8425bef1f1b307",
991
+ "hash_input_tokens": "b84ccc3699968295",
992
+ "hash_cont_tokens": "c93c9cc4e6d49fcb"
993
+ },
994
+ "truncated": 0,
995
+ "non_truncated": 13104,
996
+ "padded": 12926,
997
+ "non_padded": 178,
998
+ "num_truncated_few_shots": 0
999
+ }
1000
+ }