lewtun HF staff commited on
Commit
ec45ed5
·
verified ·
1 Parent(s): 02352db

Upload eval_results/alvarobartt/mistral-7b-orpo-capybara-reproduction/main/bbh/results_2024-03-28T17-29-11.084565.json with huggingface_hub

Browse files
eval_results/alvarobartt/mistral-7b-orpo-capybara-reproduction/main/bbh/results_2024-03-28T17-29-11.084565.json ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1243716.753066,
9
+ "end_time": 1243936.279035164,
10
+ "total_evaluation_time_secondes": "219.52596916398033",
11
+ "model_name": "alvarobartt/mistral-7b-orpo-capybara-reproduction",
12
+ "model_sha": "0a87f28dcb8483abd79ba9c6c168fc0c78fa78af",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|bigbench:causal_judgment|0": {
19
+ "acc": 0.5105263157894737,
20
+ "acc_stderr": 0.03636158772354769
21
+ },
22
+ "lighteval|bigbench:date_understanding|0": {
23
+ "acc": 0.23848238482384823,
24
+ "acc_stderr": 0.022214892732360197
25
+ },
26
+ "lighteval|bigbench:disambiguation_qa|0": {
27
+ "acc": 0.45348837209302323,
28
+ "acc_stderr": 0.0310539049637848
29
+ },
30
+ "lighteval|bigbench:geometric_shapes|0": {
31
+ "acc": 0.10833333333333334,
32
+ "acc_stderr": 0.01640344605263584
33
+ },
34
+ "lighteval|bigbench:logical_deduction_five_objects|0": {
35
+ "acc": 0.238,
36
+ "acc_stderr": 0.019064072958198442
37
+ },
38
+ "lighteval|bigbench:logical_deduction_seven_objects|0": {
39
+ "acc": 0.15285714285714286,
40
+ "acc_stderr": 0.013610767256635325
41
+ },
42
+ "lighteval|bigbench:logical_deduction_three_objects|0": {
43
+ "acc": 0.38,
44
+ "acc_stderr": 0.028070622832789688
45
+ },
46
+ "lighteval|bigbench:movie_recommendation|0": {
47
+ "acc": 0.458,
48
+ "acc_stderr": 0.022303966774269948
49
+ },
50
+ "lighteval|bigbench:navigate|0": {
51
+ "acc": 0.488,
52
+ "acc_stderr": 0.015814743314581818
53
+ },
54
+ "lighteval|bigbench:reasoning_about_colored_objects|0": {
55
+ "acc": 0.252,
56
+ "acc_stderr": 0.009710568859432962
57
+ },
58
+ "lighteval|bigbench:ruin_names|0": {
59
+ "acc": 0.22991071428571427,
60
+ "acc_stderr": 0.019901984530139517
61
+ },
62
+ "lighteval|bigbench:salient_translation_error_detection|0": {
63
+ "acc": 0.12625250501002003,
64
+ "acc_stderr": 0.010518780548208886
65
+ },
66
+ "lighteval|bigbench:snarks|0": {
67
+ "acc": 0.5303867403314917,
68
+ "acc_stderr": 0.03719891321680327
69
+ },
70
+ "lighteval|bigbench:sports_understanding|0": {
71
+ "acc": 0.62,
72
+ "acc_stderr": 0.015356947477797587
73
+ },
74
+ "lighteval|bigbench:temporal_sequences|0": {
75
+ "acc": 0.217,
76
+ "acc_stderr": 0.01304151375727071
77
+ },
78
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": {
79
+ "acc": 0.1848,
80
+ "acc_stderr": 0.010982516129213015
81
+ },
82
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": {
83
+ "acc": 0.11314285714285714,
84
+ "acc_stderr": 0.007574352280950085
85
+ },
86
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": {
87
+ "acc": 0.38,
88
+ "acc_stderr": 0.028070622832789688
89
+ },
90
+ "lighteval|bigbench:_average|0": {
91
+ "acc": 0.3156211314259391,
92
+ "acc_stderr": 0.01984745579118942
93
+ },
94
+ "all": {
95
+ "acc": 0.3156211314259391,
96
+ "acc_stderr": 0.01984745579118942
97
+ }
98
+ },
99
+ "versions": {
100
+ "lighteval|bigbench:causal_judgment|0": 0,
101
+ "lighteval|bigbench:date_understanding|0": 0,
102
+ "lighteval|bigbench:disambiguation_qa|0": 0,
103
+ "lighteval|bigbench:geometric_shapes|0": 0,
104
+ "lighteval|bigbench:logical_deduction_five_objects|0": 0,
105
+ "lighteval|bigbench:logical_deduction_seven_objects|0": 0,
106
+ "lighteval|bigbench:logical_deduction_three_objects|0": 0,
107
+ "lighteval|bigbench:movie_recommendation|0": 0,
108
+ "lighteval|bigbench:navigate|0": 0,
109
+ "lighteval|bigbench:reasoning_about_colored_objects|0": 0,
110
+ "lighteval|bigbench:ruin_names|0": 0,
111
+ "lighteval|bigbench:salient_translation_error_detection|0": 0,
112
+ "lighteval|bigbench:snarks|0": 0,
113
+ "lighteval|bigbench:sports_understanding|0": 0,
114
+ "lighteval|bigbench:temporal_sequences|0": 0,
115
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": 0,
116
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": 0,
117
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": 0
118
+ },
119
+ "config_tasks": {
120
+ "lighteval|bigbench:causal_judgment": {
121
+ "name": "bigbench:causal_judgment",
122
+ "prompt_function": "bbh_lighteval",
123
+ "hf_repo": "lighteval/bbh",
124
+ "hf_subset": "causal_judgement",
125
+ "metric": [
126
+ "loglikelihood_acc_single_token"
127
+ ],
128
+ "hf_avail_splits": [
129
+ "train"
130
+ ],
131
+ "evaluation_splits": [
132
+ "train"
133
+ ],
134
+ "few_shots_split": null,
135
+ "few_shots_select": null,
136
+ "generation_size": -1,
137
+ "stop_sequence": [
138
+ "</s>",
139
+ "Q:",
140
+ "\n\n"
141
+ ],
142
+ "output_regex": null,
143
+ "frozen": false,
144
+ "suite": [
145
+ "lighteval"
146
+ ],
147
+ "original_num_docs": 190,
148
+ "effective_num_docs": 190,
149
+ "trust_dataset": true,
150
+ "must_remove_duplicate_docs": null
151
+ },
152
+ "lighteval|bigbench:date_understanding": {
153
+ "name": "bigbench:date_understanding",
154
+ "prompt_function": "bbh_lighteval",
155
+ "hf_repo": "lighteval/bbh",
156
+ "hf_subset": "date_understanding",
157
+ "metric": [
158
+ "loglikelihood_acc_single_token"
159
+ ],
160
+ "hf_avail_splits": [
161
+ "train"
162
+ ],
163
+ "evaluation_splits": [
164
+ "train"
165
+ ],
166
+ "few_shots_split": null,
167
+ "few_shots_select": null,
168
+ "generation_size": -1,
169
+ "stop_sequence": [
170
+ "</s>",
171
+ "Q:",
172
+ "\n\n"
173
+ ],
174
+ "output_regex": null,
175
+ "frozen": false,
176
+ "suite": [
177
+ "lighteval"
178
+ ],
179
+ "original_num_docs": 369,
180
+ "effective_num_docs": 369,
181
+ "trust_dataset": true,
182
+ "must_remove_duplicate_docs": null
183
+ },
184
+ "lighteval|bigbench:disambiguation_qa": {
185
+ "name": "bigbench:disambiguation_qa",
186
+ "prompt_function": "bbh_lighteval",
187
+ "hf_repo": "lighteval/bbh",
188
+ "hf_subset": "disambiguation_qa",
189
+ "metric": [
190
+ "loglikelihood_acc_single_token"
191
+ ],
192
+ "hf_avail_splits": [
193
+ "train"
194
+ ],
195
+ "evaluation_splits": [
196
+ "train"
197
+ ],
198
+ "few_shots_split": null,
199
+ "few_shots_select": null,
200
+ "generation_size": -1,
201
+ "stop_sequence": [
202
+ "</s>",
203
+ "Q:",
204
+ "\n\n"
205
+ ],
206
+ "output_regex": null,
207
+ "frozen": false,
208
+ "suite": [
209
+ "lighteval"
210
+ ],
211
+ "original_num_docs": 258,
212
+ "effective_num_docs": 258,
213
+ "trust_dataset": true,
214
+ "must_remove_duplicate_docs": null
215
+ },
216
+ "lighteval|bigbench:geometric_shapes": {
217
+ "name": "bigbench:geometric_shapes",
218
+ "prompt_function": "bbh_lighteval",
219
+ "hf_repo": "lighteval/bbh",
220
+ "hf_subset": "geometric_shapes",
221
+ "metric": [
222
+ "loglikelihood_acc_single_token"
223
+ ],
224
+ "hf_avail_splits": [
225
+ "train"
226
+ ],
227
+ "evaluation_splits": [
228
+ "train"
229
+ ],
230
+ "few_shots_split": null,
231
+ "few_shots_select": null,
232
+ "generation_size": -1,
233
+ "stop_sequence": [
234
+ "</s>",
235
+ "Q:",
236
+ "\n\n"
237
+ ],
238
+ "output_regex": null,
239
+ "frozen": false,
240
+ "suite": [
241
+ "lighteval"
242
+ ],
243
+ "original_num_docs": 360,
244
+ "effective_num_docs": 360,
245
+ "trust_dataset": true,
246
+ "must_remove_duplicate_docs": null
247
+ },
248
+ "lighteval|bigbench:logical_deduction_five_objects": {
249
+ "name": "bigbench:logical_deduction_five_objects",
250
+ "prompt_function": "bbh_lighteval",
251
+ "hf_repo": "lighteval/bbh",
252
+ "hf_subset": "logical_deduction_five_objects",
253
+ "metric": [
254
+ "loglikelihood_acc_single_token"
255
+ ],
256
+ "hf_avail_splits": [
257
+ "train"
258
+ ],
259
+ "evaluation_splits": [
260
+ "train"
261
+ ],
262
+ "few_shots_split": null,
263
+ "few_shots_select": null,
264
+ "generation_size": -1,
265
+ "stop_sequence": [
266
+ "</s>",
267
+ "Q:",
268
+ "\n\n"
269
+ ],
270
+ "output_regex": null,
271
+ "frozen": false,
272
+ "suite": [
273
+ "lighteval"
274
+ ],
275
+ "original_num_docs": 500,
276
+ "effective_num_docs": 500,
277
+ "trust_dataset": true,
278
+ "must_remove_duplicate_docs": null
279
+ },
280
+ "lighteval|bigbench:logical_deduction_seven_objects": {
281
+ "name": "bigbench:logical_deduction_seven_objects",
282
+ "prompt_function": "bbh_lighteval",
283
+ "hf_repo": "lighteval/bbh",
284
+ "hf_subset": "logical_deduction_seven_objects",
285
+ "metric": [
286
+ "loglikelihood_acc_single_token"
287
+ ],
288
+ "hf_avail_splits": [
289
+ "train"
290
+ ],
291
+ "evaluation_splits": [
292
+ "train"
293
+ ],
294
+ "few_shots_split": null,
295
+ "few_shots_select": null,
296
+ "generation_size": -1,
297
+ "stop_sequence": [
298
+ "</s>",
299
+ "Q:",
300
+ "\n\n"
301
+ ],
302
+ "output_regex": null,
303
+ "frozen": false,
304
+ "suite": [
305
+ "lighteval"
306
+ ],
307
+ "original_num_docs": 700,
308
+ "effective_num_docs": 700,
309
+ "trust_dataset": true,
310
+ "must_remove_duplicate_docs": null
311
+ },
312
+ "lighteval|bigbench:logical_deduction_three_objects": {
313
+ "name": "bigbench:logical_deduction_three_objects",
314
+ "prompt_function": "bbh_lighteval",
315
+ "hf_repo": "lighteval/bbh",
316
+ "hf_subset": "logical_deduction_three_objects",
317
+ "metric": [
318
+ "loglikelihood_acc_single_token"
319
+ ],
320
+ "hf_avail_splits": [
321
+ "train"
322
+ ],
323
+ "evaluation_splits": [
324
+ "train"
325
+ ],
326
+ "few_shots_split": null,
327
+ "few_shots_select": null,
328
+ "generation_size": -1,
329
+ "stop_sequence": [
330
+ "</s>",
331
+ "Q:",
332
+ "\n\n"
333
+ ],
334
+ "output_regex": null,
335
+ "frozen": false,
336
+ "suite": [
337
+ "lighteval"
338
+ ],
339
+ "original_num_docs": 300,
340
+ "effective_num_docs": 300,
341
+ "trust_dataset": true,
342
+ "must_remove_duplicate_docs": null
343
+ },
344
+ "lighteval|bigbench:movie_recommendation": {
345
+ "name": "bigbench:movie_recommendation",
346
+ "prompt_function": "bbh_lighteval",
347
+ "hf_repo": "lighteval/bbh",
348
+ "hf_subset": "movie_recommendation",
349
+ "metric": [
350
+ "loglikelihood_acc_single_token"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "train"
354
+ ],
355
+ "evaluation_splits": [
356
+ "train"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": -1,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "lighteval"
370
+ ],
371
+ "original_num_docs": 500,
372
+ "effective_num_docs": 500,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "lighteval|bigbench:navigate": {
377
+ "name": "bigbench:navigate",
378
+ "prompt_function": "bbh_lighteval",
379
+ "hf_repo": "lighteval/bbh",
380
+ "hf_subset": "navigate",
381
+ "metric": [
382
+ "loglikelihood_acc_single_token"
383
+ ],
384
+ "hf_avail_splits": [
385
+ "train"
386
+ ],
387
+ "evaluation_splits": [
388
+ "train"
389
+ ],
390
+ "few_shots_split": null,
391
+ "few_shots_select": null,
392
+ "generation_size": -1,
393
+ "stop_sequence": [
394
+ "</s>",
395
+ "Q:",
396
+ "\n\n"
397
+ ],
398
+ "output_regex": null,
399
+ "frozen": false,
400
+ "suite": [
401
+ "lighteval"
402
+ ],
403
+ "original_num_docs": 1000,
404
+ "effective_num_docs": 1000,
405
+ "trust_dataset": true,
406
+ "must_remove_duplicate_docs": null
407
+ },
408
+ "lighteval|bigbench:reasoning_about_colored_objects": {
409
+ "name": "bigbench:reasoning_about_colored_objects",
410
+ "prompt_function": "bbh_lighteval",
411
+ "hf_repo": "lighteval/bbh",
412
+ "hf_subset": "reasoning_about_colored_objects",
413
+ "metric": [
414
+ "loglikelihood_acc_single_token"
415
+ ],
416
+ "hf_avail_splits": [
417
+ "train"
418
+ ],
419
+ "evaluation_splits": [
420
+ "train"
421
+ ],
422
+ "few_shots_split": null,
423
+ "few_shots_select": null,
424
+ "generation_size": -1,
425
+ "stop_sequence": [
426
+ "</s>",
427
+ "Q:",
428
+ "\n\n"
429
+ ],
430
+ "output_regex": null,
431
+ "frozen": false,
432
+ "suite": [
433
+ "lighteval"
434
+ ],
435
+ "original_num_docs": 2000,
436
+ "effective_num_docs": 2000,
437
+ "trust_dataset": true,
438
+ "must_remove_duplicate_docs": null
439
+ },
440
+ "lighteval|bigbench:ruin_names": {
441
+ "name": "bigbench:ruin_names",
442
+ "prompt_function": "bbh_lighteval",
443
+ "hf_repo": "lighteval/bbh",
444
+ "hf_subset": "ruin_names",
445
+ "metric": [
446
+ "loglikelihood_acc_single_token"
447
+ ],
448
+ "hf_avail_splits": [
449
+ "train"
450
+ ],
451
+ "evaluation_splits": [
452
+ "train"
453
+ ],
454
+ "few_shots_split": null,
455
+ "few_shots_select": null,
456
+ "generation_size": -1,
457
+ "stop_sequence": [
458
+ "</s>",
459
+ "Q:",
460
+ "\n\n"
461
+ ],
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 448,
468
+ "effective_num_docs": 448,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|bigbench:salient_translation_error_detection": {
473
+ "name": "bigbench:salient_translation_error_detection",
474
+ "prompt_function": "bbh_lighteval",
475
+ "hf_repo": "lighteval/bbh",
476
+ "hf_subset": "salient_translation_error_detection",
477
+ "metric": [
478
+ "loglikelihood_acc_single_token"
479
+ ],
480
+ "hf_avail_splits": [
481
+ "train"
482
+ ],
483
+ "evaluation_splits": [
484
+ "train"
485
+ ],
486
+ "few_shots_split": null,
487
+ "few_shots_select": null,
488
+ "generation_size": -1,
489
+ "stop_sequence": [
490
+ "</s>",
491
+ "Q:",
492
+ "\n\n"
493
+ ],
494
+ "output_regex": null,
495
+ "frozen": false,
496
+ "suite": [
497
+ "lighteval"
498
+ ],
499
+ "original_num_docs": 998,
500
+ "effective_num_docs": 998,
501
+ "trust_dataset": true,
502
+ "must_remove_duplicate_docs": null
503
+ },
504
+ "lighteval|bigbench:snarks": {
505
+ "name": "bigbench:snarks",
506
+ "prompt_function": "bbh_lighteval",
507
+ "hf_repo": "lighteval/bbh",
508
+ "hf_subset": "snarks",
509
+ "metric": [
510
+ "loglikelihood_acc_single_token"
511
+ ],
512
+ "hf_avail_splits": [
513
+ "train"
514
+ ],
515
+ "evaluation_splits": [
516
+ "train"
517
+ ],
518
+ "few_shots_split": null,
519
+ "few_shots_select": null,
520
+ "generation_size": -1,
521
+ "stop_sequence": [
522
+ "</s>",
523
+ "Q:",
524
+ "\n\n"
525
+ ],
526
+ "output_regex": null,
527
+ "frozen": false,
528
+ "suite": [
529
+ "lighteval"
530
+ ],
531
+ "original_num_docs": 181,
532
+ "effective_num_docs": 181,
533
+ "trust_dataset": true,
534
+ "must_remove_duplicate_docs": null
535
+ },
536
+ "lighteval|bigbench:sports_understanding": {
537
+ "name": "bigbench:sports_understanding",
538
+ "prompt_function": "bbh_lighteval",
539
+ "hf_repo": "lighteval/bbh",
540
+ "hf_subset": "sports_understanding",
541
+ "metric": [
542
+ "loglikelihood_acc_single_token"
543
+ ],
544
+ "hf_avail_splits": [
545
+ "train"
546
+ ],
547
+ "evaluation_splits": [
548
+ "train"
549
+ ],
550
+ "few_shots_split": null,
551
+ "few_shots_select": null,
552
+ "generation_size": -1,
553
+ "stop_sequence": [
554
+ "</s>",
555
+ "Q:",
556
+ "\n\n"
557
+ ],
558
+ "output_regex": null,
559
+ "frozen": false,
560
+ "suite": [
561
+ "lighteval"
562
+ ],
563
+ "original_num_docs": 1000,
564
+ "effective_num_docs": 1000,
565
+ "trust_dataset": true,
566
+ "must_remove_duplicate_docs": null
567
+ },
568
+ "lighteval|bigbench:temporal_sequences": {
569
+ "name": "bigbench:temporal_sequences",
570
+ "prompt_function": "bbh_lighteval",
571
+ "hf_repo": "lighteval/bbh",
572
+ "hf_subset": "temporal_sequences",
573
+ "metric": [
574
+ "loglikelihood_acc_single_token"
575
+ ],
576
+ "hf_avail_splits": [
577
+ "train"
578
+ ],
579
+ "evaluation_splits": [
580
+ "train"
581
+ ],
582
+ "few_shots_split": null,
583
+ "few_shots_select": null,
584
+ "generation_size": -1,
585
+ "stop_sequence": [
586
+ "</s>",
587
+ "Q:",
588
+ "\n\n"
589
+ ],
590
+ "output_regex": null,
591
+ "frozen": false,
592
+ "suite": [
593
+ "lighteval"
594
+ ],
595
+ "original_num_docs": 1000,
596
+ "effective_num_docs": 1000,
597
+ "trust_dataset": true,
598
+ "must_remove_duplicate_docs": null
599
+ },
600
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects": {
601
+ "name": "bigbench:tracking_shuffled_objects_five_objects",
602
+ "prompt_function": "bbh_lighteval",
603
+ "hf_repo": "lighteval/bbh",
604
+ "hf_subset": "tracking_shuffled_objects_five_objects",
605
+ "metric": [
606
+ "loglikelihood_acc_single_token"
607
+ ],
608
+ "hf_avail_splits": [
609
+ "train"
610
+ ],
611
+ "evaluation_splits": [
612
+ "train"
613
+ ],
614
+ "few_shots_split": null,
615
+ "few_shots_select": null,
616
+ "generation_size": -1,
617
+ "stop_sequence": [
618
+ "</s>",
619
+ "Q:",
620
+ "\n\n"
621
+ ],
622
+ "output_regex": null,
623
+ "frozen": false,
624
+ "suite": [
625
+ "lighteval"
626
+ ],
627
+ "original_num_docs": 1250,
628
+ "effective_num_docs": 1250,
629
+ "trust_dataset": true,
630
+ "must_remove_duplicate_docs": null
631
+ },
632
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects": {
633
+ "name": "bigbench:tracking_shuffled_objects_seven_objects",
634
+ "prompt_function": "bbh_lighteval",
635
+ "hf_repo": "lighteval/bbh",
636
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
637
+ "metric": [
638
+ "loglikelihood_acc_single_token"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "train"
642
+ ],
643
+ "evaluation_splits": [
644
+ "train"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": -1,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "lighteval"
658
+ ],
659
+ "original_num_docs": 1750,
660
+ "effective_num_docs": 1750,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects": {
665
+ "name": "bigbench:tracking_shuffled_objects_three_objects",
666
+ "prompt_function": "bbh_lighteval",
667
+ "hf_repo": "lighteval/bbh",
668
+ "hf_subset": "tracking_shuffled_objects_three_objects",
669
+ "metric": [
670
+ "loglikelihood_acc_single_token"
671
+ ],
672
+ "hf_avail_splits": [
673
+ "train"
674
+ ],
675
+ "evaluation_splits": [
676
+ "train"
677
+ ],
678
+ "few_shots_split": null,
679
+ "few_shots_select": null,
680
+ "generation_size": -1,
681
+ "stop_sequence": [
682
+ "</s>",
683
+ "Q:",
684
+ "\n\n"
685
+ ],
686
+ "output_regex": null,
687
+ "frozen": false,
688
+ "suite": [
689
+ "lighteval"
690
+ ],
691
+ "original_num_docs": 300,
692
+ "effective_num_docs": 300,
693
+ "trust_dataset": true,
694
+ "must_remove_duplicate_docs": null
695
+ }
696
+ },
697
+ "summary_tasks": {
698
+ "lighteval|bigbench:causal_judgment|0": {
699
+ "hashes": {
700
+ "hash_examples": "dfb1ae47218f2850",
701
+ "hash_full_prompts": "7292c47f5bf2ba48",
702
+ "hash_input_tokens": "c78e9b0699fa360e",
703
+ "hash_cont_tokens": "ac670c3ea513a639"
704
+ },
705
+ "truncated": 0,
706
+ "non_truncated": 190,
707
+ "padded": 189,
708
+ "non_padded": 1,
709
+ "effective_few_shots": 0.0,
710
+ "num_truncated_few_shots": 0
711
+ },
712
+ "lighteval|bigbench:date_understanding|0": {
713
+ "hashes": {
714
+ "hash_examples": "2b823c41500a6ec2",
715
+ "hash_full_prompts": "4db646afa4176c07",
716
+ "hash_input_tokens": "0df10f15a8309e51",
717
+ "hash_cont_tokens": "e7711b87d7f90d38"
718
+ },
719
+ "truncated": 0,
720
+ "non_truncated": 369,
721
+ "padded": 369,
722
+ "non_padded": 0,
723
+ "effective_few_shots": 0.0,
724
+ "num_truncated_few_shots": 0
725
+ },
726
+ "lighteval|bigbench:disambiguation_qa|0": {
727
+ "hashes": {
728
+ "hash_examples": "2a4c3d41db198cea",
729
+ "hash_full_prompts": "12d668cf5edc9542",
730
+ "hash_input_tokens": "9e1cf05ed5d4ec2c",
731
+ "hash_cont_tokens": "de89f8a6e5dac00c"
732
+ },
733
+ "truncated": 0,
734
+ "non_truncated": 258,
735
+ "padded": 258,
736
+ "non_padded": 0,
737
+ "effective_few_shots": 0.0,
738
+ "num_truncated_few_shots": 0
739
+ },
740
+ "lighteval|bigbench:geometric_shapes|0": {
741
+ "hashes": {
742
+ "hash_examples": "24aa261103911b72",
743
+ "hash_full_prompts": "51dfb12a121e7a69",
744
+ "hash_input_tokens": "4dbad2d1516faab7",
745
+ "hash_cont_tokens": "e51eec73c3eb26c9"
746
+ },
747
+ "truncated": 0,
748
+ "non_truncated": 360,
749
+ "padded": 360,
750
+ "non_padded": 0,
751
+ "effective_few_shots": 0.0,
752
+ "num_truncated_few_shots": 0
753
+ },
754
+ "lighteval|bigbench:logical_deduction_five_objects|0": {
755
+ "hashes": {
756
+ "hash_examples": "cb5bdc92afc41f83",
757
+ "hash_full_prompts": "b6e4a71663bc3e1c",
758
+ "hash_input_tokens": "cd98f4f02be4310c",
759
+ "hash_cont_tokens": "4c9e9d2d14981c58"
760
+ },
761
+ "truncated": 0,
762
+ "non_truncated": 500,
763
+ "padded": 500,
764
+ "non_padded": 0,
765
+ "effective_few_shots": 0.0,
766
+ "num_truncated_few_shots": 0
767
+ },
768
+ "lighteval|bigbench:logical_deduction_seven_objects|0": {
769
+ "hashes": {
770
+ "hash_examples": "b6805ea696739f9f",
771
+ "hash_full_prompts": "d0c82c066345c294",
772
+ "hash_input_tokens": "66459a902e8496a2",
773
+ "hash_cont_tokens": "1745fa6fd92f0e0d"
774
+ },
775
+ "truncated": 0,
776
+ "non_truncated": 700,
777
+ "padded": 700,
778
+ "non_padded": 0,
779
+ "effective_few_shots": 0.0,
780
+ "num_truncated_few_shots": 0
781
+ },
782
+ "lighteval|bigbench:logical_deduction_three_objects|0": {
783
+ "hashes": {
784
+ "hash_examples": "0509e5712ab9bcdb",
785
+ "hash_full_prompts": "396c1e56901b46ed",
786
+ "hash_input_tokens": "d91f9cfdcb0d2966",
787
+ "hash_cont_tokens": "2b5b679169d7bcf1"
788
+ },
789
+ "truncated": 0,
790
+ "non_truncated": 300,
791
+ "padded": 300,
792
+ "non_padded": 0,
793
+ "effective_few_shots": 0.0,
794
+ "num_truncated_few_shots": 0
795
+ },
796
+ "lighteval|bigbench:movie_recommendation|0": {
797
+ "hashes": {
798
+ "hash_examples": "530cc6f737830f45",
799
+ "hash_full_prompts": "e821384b2a44e36b",
800
+ "hash_input_tokens": "2a3916667c449290",
801
+ "hash_cont_tokens": "be520838bf2427bc"
802
+ },
803
+ "truncated": 0,
804
+ "non_truncated": 500,
805
+ "padded": 500,
806
+ "non_padded": 0,
807
+ "effective_few_shots": 0.0,
808
+ "num_truncated_few_shots": 0
809
+ },
810
+ "lighteval|bigbench:navigate|0": {
811
+ "hashes": {
812
+ "hash_examples": "7962ef85d0058b9a",
813
+ "hash_full_prompts": "43248e6945903d81",
814
+ "hash_input_tokens": "9cdcffe4384159de",
815
+ "hash_cont_tokens": "04e3a57b821a3dd8"
816
+ },
817
+ "truncated": 0,
818
+ "non_truncated": 1000,
819
+ "padded": 988,
820
+ "non_padded": 12,
821
+ "effective_few_shots": 0.0,
822
+ "num_truncated_few_shots": 0
823
+ },
824
+ "lighteval|bigbench:reasoning_about_colored_objects|0": {
825
+ "hashes": {
826
+ "hash_examples": "39be1ab1677a651d",
827
+ "hash_full_prompts": "7f7a503aaa70068f",
828
+ "hash_input_tokens": "2087b98ecbbf0dc1",
829
+ "hash_cont_tokens": "3fe982d2154a001a"
830
+ },
831
+ "truncated": 0,
832
+ "non_truncated": 2000,
833
+ "padded": 1969,
834
+ "non_padded": 31,
835
+ "effective_few_shots": 0.0,
836
+ "num_truncated_few_shots": 0
837
+ },
838
+ "lighteval|bigbench:ruin_names|0": {
839
+ "hashes": {
840
+ "hash_examples": "e9b96b31d2154941",
841
+ "hash_full_prompts": "ae8931c806192844",
842
+ "hash_input_tokens": "b8bf918e87322d16",
843
+ "hash_cont_tokens": "046bbbbddb05b429"
844
+ },
845
+ "truncated": 0,
846
+ "non_truncated": 448,
847
+ "padded": 442,
848
+ "non_padded": 6,
849
+ "effective_few_shots": 0.0,
850
+ "num_truncated_few_shots": 0
851
+ },
852
+ "lighteval|bigbench:salient_translation_error_detection|0": {
853
+ "hashes": {
854
+ "hash_examples": "951ac59f7ad0427d",
855
+ "hash_full_prompts": "643d82c4ce3fab01",
856
+ "hash_input_tokens": "988b9ae225ddd064",
857
+ "hash_cont_tokens": "e78fb6d09071e0f6"
858
+ },
859
+ "truncated": 0,
860
+ "non_truncated": 998,
861
+ "padded": 998,
862
+ "non_padded": 0,
863
+ "effective_few_shots": 0.0,
864
+ "num_truncated_few_shots": 0
865
+ },
866
+ "lighteval|bigbench:snarks|0": {
867
+ "hashes": {
868
+ "hash_examples": "3a53eb9b9d758534",
869
+ "hash_full_prompts": "b12bcea4b9bc9027",
870
+ "hash_input_tokens": "2955b17fa2f88f37",
871
+ "hash_cont_tokens": "f5cb71a436613293"
872
+ },
873
+ "truncated": 0,
874
+ "non_truncated": 181,
875
+ "padded": 178,
876
+ "non_padded": 3,
877
+ "effective_few_shots": 0.0,
878
+ "num_truncated_few_shots": 0
879
+ },
880
+ "lighteval|bigbench:sports_understanding|0": {
881
+ "hashes": {
882
+ "hash_examples": "bd65741f00770373",
883
+ "hash_full_prompts": "39d7688aa2d209e1",
884
+ "hash_input_tokens": "73898bc71644c9e9",
885
+ "hash_cont_tokens": "02230fac16464d15"
886
+ },
887
+ "truncated": 0,
888
+ "non_truncated": 1000,
889
+ "padded": 1000,
890
+ "non_padded": 0,
891
+ "effective_few_shots": 0.0,
892
+ "num_truncated_few_shots": 0
893
+ },
894
+ "lighteval|bigbench:temporal_sequences|0": {
895
+ "hashes": {
896
+ "hash_examples": "1d13139f47cb2df7",
897
+ "hash_full_prompts": "1a874610f00343dc",
898
+ "hash_input_tokens": "4bad0abbf7b942f4",
899
+ "hash_cont_tokens": "88c86d8bfb960c7d"
900
+ },
901
+ "truncated": 0,
902
+ "non_truncated": 1000,
903
+ "padded": 1000,
904
+ "non_padded": 0,
905
+ "effective_few_shots": 0.0,
906
+ "num_truncated_few_shots": 0
907
+ },
908
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": {
909
+ "hashes": {
910
+ "hash_examples": "8770a702a9646648",
911
+ "hash_full_prompts": "392b486c4039dca8",
912
+ "hash_input_tokens": "7a612bb0d0973402",
913
+ "hash_cont_tokens": "7cf11d867348e0b1"
914
+ },
915
+ "truncated": 0,
916
+ "non_truncated": 1250,
917
+ "padded": 1180,
918
+ "non_padded": 70,
919
+ "effective_few_shots": 0.0,
920
+ "num_truncated_few_shots": 0
921
+ },
922
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": {
923
+ "hashes": {
924
+ "hash_examples": "b469b7d073824a59",
925
+ "hash_full_prompts": "1bad8a693cc74da1",
926
+ "hash_input_tokens": "527bc1f26eee48bd",
927
+ "hash_cont_tokens": "f76ba63a583d749e"
928
+ },
929
+ "truncated": 0,
930
+ "non_truncated": 1750,
931
+ "padded": 1701,
932
+ "non_padded": 49,
933
+ "effective_few_shots": 0.0,
934
+ "num_truncated_few_shots": 0
935
+ },
936
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": {
937
+ "hashes": {
938
+ "hash_examples": "0509e5712ab9bcdb",
939
+ "hash_full_prompts": "396c1e56901b46ed",
940
+ "hash_input_tokens": "5a7e115c908210d7",
941
+ "hash_cont_tokens": "b2cce0a4a2edc859"
942
+ },
943
+ "truncated": 0,
944
+ "non_truncated": 300,
945
+ "padded": 294,
946
+ "non_padded": 6,
947
+ "effective_few_shots": 0.0,
948
+ "num_truncated_few_shots": 0
949
+ }
950
+ },
951
+ "summary_general": {
952
+ "hashes": {
953
+ "hash_examples": "51a30c4501ba4586",
954
+ "hash_full_prompts": "96a511cab844bc38",
955
+ "hash_input_tokens": "fbcd498a47d6bc76",
956
+ "hash_cont_tokens": "2f0ff7c19ccc0d8e"
957
+ },
958
+ "truncated": 0,
959
+ "non_truncated": 13104,
960
+ "padded": 12926,
961
+ "non_padded": 178,
962
+ "num_truncated_few_shots": 0
963
+ }
964
+ }