lewtun HF staff commited on
Commit
1303040
·
verified ·
1 Parent(s): d703db3

Upload eval_results/databricks/dbrx-base/main/bbh/results_2024-03-30T20-19-23.953419.json with huggingface_hub

Browse files
eval_results/databricks/dbrx-base/main/bbh/results_2024-03-30T20-19-23.953419.json ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1850515.428046803,
9
+ "end_time": 1852176.333653204,
10
+ "total_evaluation_time_secondes": "1660.90560640092",
11
+ "model_name": "databricks/dbrx-base",
12
+ "model_sha": "4e1f4c3e5452762b62b8cf3c2eee863c52da7903",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "245.12 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "lighteval|bigbench:causal_judgment|0": {
19
+ "acc": 0.5526315789473685,
20
+ "acc_stderr": 0.036167593207172444
21
+ },
22
+ "lighteval|bigbench:date_understanding|0": {
23
+ "acc": 0.6043360433604336,
24
+ "acc_stderr": 0.025490513477091623
25
+ },
26
+ "lighteval|bigbench:disambiguation_qa|0": {
27
+ "acc": 0.313953488372093,
28
+ "acc_stderr": 0.028949620503220728
29
+ },
30
+ "lighteval|bigbench:geometric_shapes|0": {
31
+ "acc": 0.15,
32
+ "acc_stderr": 0.018845508837455664
33
+ },
34
+ "lighteval|bigbench:logical_deduction_five_objects|0": {
35
+ "acc": 0.4,
36
+ "acc_stderr": 0.021930844120728505
37
+ },
38
+ "lighteval|bigbench:logical_deduction_seven_objects|0": {
39
+ "acc": 0.24,
40
+ "acc_stderr": 0.016153767548426002
41
+ },
42
+ "lighteval|bigbench:logical_deduction_three_objects|0": {
43
+ "acc": 0.3933333333333333,
44
+ "acc_stderr": 0.028250090846760875
45
+ },
46
+ "lighteval|bigbench:movie_recommendation|0": {
47
+ "acc": 0.682,
48
+ "acc_stderr": 0.020847571620814007
49
+ },
50
+ "lighteval|bigbench:navigate|0": {
51
+ "acc": 0.501,
52
+ "acc_stderr": 0.015819268290576817
53
+ },
54
+ "lighteval|bigbench:reasoning_about_colored_objects|0": {
55
+ "acc": 0.3675,
56
+ "acc_stderr": 0.010783321149233223
57
+ },
58
+ "lighteval|bigbench:ruin_names|0": {
59
+ "acc": 0.6785714285714286,
60
+ "acc_stderr": 0.022089519157170168
61
+ },
62
+ "lighteval|bigbench:salient_translation_error_detection|0": {
63
+ "acc": 0.2755511022044088,
64
+ "acc_stderr": 0.014150030472244257
65
+ },
66
+ "lighteval|bigbench:snarks|0": {
67
+ "acc": 0.5414364640883977,
68
+ "acc_stderr": 0.03713960295993206
69
+ },
70
+ "lighteval|bigbench:sports_understanding|0": {
71
+ "acc": 0.693,
72
+ "acc_stderr": 0.014593284892852623
73
+ },
74
+ "lighteval|bigbench:temporal_sequences|0": {
75
+ "acc": 0.938,
76
+ "acc_stderr": 0.0076298239962803065
77
+ },
78
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": {
79
+ "acc": 0.1784,
80
+ "acc_stderr": 0.01083294207993783
81
+ },
82
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": {
83
+ "acc": 0.1382857142857143,
84
+ "acc_stderr": 0.008254207843183564
85
+ },
86
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": {
87
+ "acc": 0.39666666666666667,
88
+ "acc_stderr": 0.02829149642514497
89
+ },
90
+ "lighteval|bigbench:_average|0": {
91
+ "acc": 0.44692587887943575,
92
+ "acc_stderr": 0.020345500412679202
93
+ },
94
+ "all": {
95
+ "acc": 0.44692587887943575,
96
+ "acc_stderr": 0.020345500412679202
97
+ }
98
+ },
99
+ "versions": {
100
+ "lighteval|bigbench:causal_judgment|0": 0,
101
+ "lighteval|bigbench:date_understanding|0": 0,
102
+ "lighteval|bigbench:disambiguation_qa|0": 0,
103
+ "lighteval|bigbench:geometric_shapes|0": 0,
104
+ "lighteval|bigbench:logical_deduction_five_objects|0": 0,
105
+ "lighteval|bigbench:logical_deduction_seven_objects|0": 0,
106
+ "lighteval|bigbench:logical_deduction_three_objects|0": 0,
107
+ "lighteval|bigbench:movie_recommendation|0": 0,
108
+ "lighteval|bigbench:navigate|0": 0,
109
+ "lighteval|bigbench:reasoning_about_colored_objects|0": 0,
110
+ "lighteval|bigbench:ruin_names|0": 0,
111
+ "lighteval|bigbench:salient_translation_error_detection|0": 0,
112
+ "lighteval|bigbench:snarks|0": 0,
113
+ "lighteval|bigbench:sports_understanding|0": 0,
114
+ "lighteval|bigbench:temporal_sequences|0": 0,
115
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": 0,
116
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": 0,
117
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": 0
118
+ },
119
+ "config_tasks": {
120
+ "lighteval|bigbench:causal_judgment": {
121
+ "name": "bigbench:causal_judgment",
122
+ "prompt_function": "bbh_lighteval",
123
+ "hf_repo": "lighteval/bbh",
124
+ "hf_subset": "causal_judgement",
125
+ "metric": [
126
+ "loglikelihood_acc_single_token"
127
+ ],
128
+ "hf_avail_splits": [
129
+ "train"
130
+ ],
131
+ "evaluation_splits": [
132
+ "train"
133
+ ],
134
+ "few_shots_split": null,
135
+ "few_shots_select": null,
136
+ "generation_size": -1,
137
+ "stop_sequence": [
138
+ "</s>",
139
+ "Q:",
140
+ "\n\n"
141
+ ],
142
+ "output_regex": null,
143
+ "frozen": false,
144
+ "suite": [
145
+ "lighteval"
146
+ ],
147
+ "original_num_docs": 190,
148
+ "effective_num_docs": 190,
149
+ "trust_dataset": true,
150
+ "must_remove_duplicate_docs": null
151
+ },
152
+ "lighteval|bigbench:date_understanding": {
153
+ "name": "bigbench:date_understanding",
154
+ "prompt_function": "bbh_lighteval",
155
+ "hf_repo": "lighteval/bbh",
156
+ "hf_subset": "date_understanding",
157
+ "metric": [
158
+ "loglikelihood_acc_single_token"
159
+ ],
160
+ "hf_avail_splits": [
161
+ "train"
162
+ ],
163
+ "evaluation_splits": [
164
+ "train"
165
+ ],
166
+ "few_shots_split": null,
167
+ "few_shots_select": null,
168
+ "generation_size": -1,
169
+ "stop_sequence": [
170
+ "</s>",
171
+ "Q:",
172
+ "\n\n"
173
+ ],
174
+ "output_regex": null,
175
+ "frozen": false,
176
+ "suite": [
177
+ "lighteval"
178
+ ],
179
+ "original_num_docs": 369,
180
+ "effective_num_docs": 369,
181
+ "trust_dataset": true,
182
+ "must_remove_duplicate_docs": null
183
+ },
184
+ "lighteval|bigbench:disambiguation_qa": {
185
+ "name": "bigbench:disambiguation_qa",
186
+ "prompt_function": "bbh_lighteval",
187
+ "hf_repo": "lighteval/bbh",
188
+ "hf_subset": "disambiguation_qa",
189
+ "metric": [
190
+ "loglikelihood_acc_single_token"
191
+ ],
192
+ "hf_avail_splits": [
193
+ "train"
194
+ ],
195
+ "evaluation_splits": [
196
+ "train"
197
+ ],
198
+ "few_shots_split": null,
199
+ "few_shots_select": null,
200
+ "generation_size": -1,
201
+ "stop_sequence": [
202
+ "</s>",
203
+ "Q:",
204
+ "\n\n"
205
+ ],
206
+ "output_regex": null,
207
+ "frozen": false,
208
+ "suite": [
209
+ "lighteval"
210
+ ],
211
+ "original_num_docs": 258,
212
+ "effective_num_docs": 258,
213
+ "trust_dataset": true,
214
+ "must_remove_duplicate_docs": null
215
+ },
216
+ "lighteval|bigbench:geometric_shapes": {
217
+ "name": "bigbench:geometric_shapes",
218
+ "prompt_function": "bbh_lighteval",
219
+ "hf_repo": "lighteval/bbh",
220
+ "hf_subset": "geometric_shapes",
221
+ "metric": [
222
+ "loglikelihood_acc_single_token"
223
+ ],
224
+ "hf_avail_splits": [
225
+ "train"
226
+ ],
227
+ "evaluation_splits": [
228
+ "train"
229
+ ],
230
+ "few_shots_split": null,
231
+ "few_shots_select": null,
232
+ "generation_size": -1,
233
+ "stop_sequence": [
234
+ "</s>",
235
+ "Q:",
236
+ "\n\n"
237
+ ],
238
+ "output_regex": null,
239
+ "frozen": false,
240
+ "suite": [
241
+ "lighteval"
242
+ ],
243
+ "original_num_docs": 360,
244
+ "effective_num_docs": 360,
245
+ "trust_dataset": true,
246
+ "must_remove_duplicate_docs": null
247
+ },
248
+ "lighteval|bigbench:logical_deduction_five_objects": {
249
+ "name": "bigbench:logical_deduction_five_objects",
250
+ "prompt_function": "bbh_lighteval",
251
+ "hf_repo": "lighteval/bbh",
252
+ "hf_subset": "logical_deduction_five_objects",
253
+ "metric": [
254
+ "loglikelihood_acc_single_token"
255
+ ],
256
+ "hf_avail_splits": [
257
+ "train"
258
+ ],
259
+ "evaluation_splits": [
260
+ "train"
261
+ ],
262
+ "few_shots_split": null,
263
+ "few_shots_select": null,
264
+ "generation_size": -1,
265
+ "stop_sequence": [
266
+ "</s>",
267
+ "Q:",
268
+ "\n\n"
269
+ ],
270
+ "output_regex": null,
271
+ "frozen": false,
272
+ "suite": [
273
+ "lighteval"
274
+ ],
275
+ "original_num_docs": 500,
276
+ "effective_num_docs": 500,
277
+ "trust_dataset": true,
278
+ "must_remove_duplicate_docs": null
279
+ },
280
+ "lighteval|bigbench:logical_deduction_seven_objects": {
281
+ "name": "bigbench:logical_deduction_seven_objects",
282
+ "prompt_function": "bbh_lighteval",
283
+ "hf_repo": "lighteval/bbh",
284
+ "hf_subset": "logical_deduction_seven_objects",
285
+ "metric": [
286
+ "loglikelihood_acc_single_token"
287
+ ],
288
+ "hf_avail_splits": [
289
+ "train"
290
+ ],
291
+ "evaluation_splits": [
292
+ "train"
293
+ ],
294
+ "few_shots_split": null,
295
+ "few_shots_select": null,
296
+ "generation_size": -1,
297
+ "stop_sequence": [
298
+ "</s>",
299
+ "Q:",
300
+ "\n\n"
301
+ ],
302
+ "output_regex": null,
303
+ "frozen": false,
304
+ "suite": [
305
+ "lighteval"
306
+ ],
307
+ "original_num_docs": 700,
308
+ "effective_num_docs": 700,
309
+ "trust_dataset": true,
310
+ "must_remove_duplicate_docs": null
311
+ },
312
+ "lighteval|bigbench:logical_deduction_three_objects": {
313
+ "name": "bigbench:logical_deduction_three_objects",
314
+ "prompt_function": "bbh_lighteval",
315
+ "hf_repo": "lighteval/bbh",
316
+ "hf_subset": "logical_deduction_three_objects",
317
+ "metric": [
318
+ "loglikelihood_acc_single_token"
319
+ ],
320
+ "hf_avail_splits": [
321
+ "train"
322
+ ],
323
+ "evaluation_splits": [
324
+ "train"
325
+ ],
326
+ "few_shots_split": null,
327
+ "few_shots_select": null,
328
+ "generation_size": -1,
329
+ "stop_sequence": [
330
+ "</s>",
331
+ "Q:",
332
+ "\n\n"
333
+ ],
334
+ "output_regex": null,
335
+ "frozen": false,
336
+ "suite": [
337
+ "lighteval"
338
+ ],
339
+ "original_num_docs": 300,
340
+ "effective_num_docs": 300,
341
+ "trust_dataset": true,
342
+ "must_remove_duplicate_docs": null
343
+ },
344
+ "lighteval|bigbench:movie_recommendation": {
345
+ "name": "bigbench:movie_recommendation",
346
+ "prompt_function": "bbh_lighteval",
347
+ "hf_repo": "lighteval/bbh",
348
+ "hf_subset": "movie_recommendation",
349
+ "metric": [
350
+ "loglikelihood_acc_single_token"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "train"
354
+ ],
355
+ "evaluation_splits": [
356
+ "train"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": -1,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "lighteval"
370
+ ],
371
+ "original_num_docs": 500,
372
+ "effective_num_docs": 500,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "lighteval|bigbench:navigate": {
377
+ "name": "bigbench:navigate",
378
+ "prompt_function": "bbh_lighteval",
379
+ "hf_repo": "lighteval/bbh",
380
+ "hf_subset": "navigate",
381
+ "metric": [
382
+ "loglikelihood_acc_single_token"
383
+ ],
384
+ "hf_avail_splits": [
385
+ "train"
386
+ ],
387
+ "evaluation_splits": [
388
+ "train"
389
+ ],
390
+ "few_shots_split": null,
391
+ "few_shots_select": null,
392
+ "generation_size": -1,
393
+ "stop_sequence": [
394
+ "</s>",
395
+ "Q:",
396
+ "\n\n"
397
+ ],
398
+ "output_regex": null,
399
+ "frozen": false,
400
+ "suite": [
401
+ "lighteval"
402
+ ],
403
+ "original_num_docs": 1000,
404
+ "effective_num_docs": 1000,
405
+ "trust_dataset": true,
406
+ "must_remove_duplicate_docs": null
407
+ },
408
+ "lighteval|bigbench:reasoning_about_colored_objects": {
409
+ "name": "bigbench:reasoning_about_colored_objects",
410
+ "prompt_function": "bbh_lighteval",
411
+ "hf_repo": "lighteval/bbh",
412
+ "hf_subset": "reasoning_about_colored_objects",
413
+ "metric": [
414
+ "loglikelihood_acc_single_token"
415
+ ],
416
+ "hf_avail_splits": [
417
+ "train"
418
+ ],
419
+ "evaluation_splits": [
420
+ "train"
421
+ ],
422
+ "few_shots_split": null,
423
+ "few_shots_select": null,
424
+ "generation_size": -1,
425
+ "stop_sequence": [
426
+ "</s>",
427
+ "Q:",
428
+ "\n\n"
429
+ ],
430
+ "output_regex": null,
431
+ "frozen": false,
432
+ "suite": [
433
+ "lighteval"
434
+ ],
435
+ "original_num_docs": 2000,
436
+ "effective_num_docs": 2000,
437
+ "trust_dataset": true,
438
+ "must_remove_duplicate_docs": null
439
+ },
440
+ "lighteval|bigbench:ruin_names": {
441
+ "name": "bigbench:ruin_names",
442
+ "prompt_function": "bbh_lighteval",
443
+ "hf_repo": "lighteval/bbh",
444
+ "hf_subset": "ruin_names",
445
+ "metric": [
446
+ "loglikelihood_acc_single_token"
447
+ ],
448
+ "hf_avail_splits": [
449
+ "train"
450
+ ],
451
+ "evaluation_splits": [
452
+ "train"
453
+ ],
454
+ "few_shots_split": null,
455
+ "few_shots_select": null,
456
+ "generation_size": -1,
457
+ "stop_sequence": [
458
+ "</s>",
459
+ "Q:",
460
+ "\n\n"
461
+ ],
462
+ "output_regex": null,
463
+ "frozen": false,
464
+ "suite": [
465
+ "lighteval"
466
+ ],
467
+ "original_num_docs": 448,
468
+ "effective_num_docs": 448,
469
+ "trust_dataset": true,
470
+ "must_remove_duplicate_docs": null
471
+ },
472
+ "lighteval|bigbench:salient_translation_error_detection": {
473
+ "name": "bigbench:salient_translation_error_detection",
474
+ "prompt_function": "bbh_lighteval",
475
+ "hf_repo": "lighteval/bbh",
476
+ "hf_subset": "salient_translation_error_detection",
477
+ "metric": [
478
+ "loglikelihood_acc_single_token"
479
+ ],
480
+ "hf_avail_splits": [
481
+ "train"
482
+ ],
483
+ "evaluation_splits": [
484
+ "train"
485
+ ],
486
+ "few_shots_split": null,
487
+ "few_shots_select": null,
488
+ "generation_size": -1,
489
+ "stop_sequence": [
490
+ "</s>",
491
+ "Q:",
492
+ "\n\n"
493
+ ],
494
+ "output_regex": null,
495
+ "frozen": false,
496
+ "suite": [
497
+ "lighteval"
498
+ ],
499
+ "original_num_docs": 998,
500
+ "effective_num_docs": 998,
501
+ "trust_dataset": true,
502
+ "must_remove_duplicate_docs": null
503
+ },
504
+ "lighteval|bigbench:snarks": {
505
+ "name": "bigbench:snarks",
506
+ "prompt_function": "bbh_lighteval",
507
+ "hf_repo": "lighteval/bbh",
508
+ "hf_subset": "snarks",
509
+ "metric": [
510
+ "loglikelihood_acc_single_token"
511
+ ],
512
+ "hf_avail_splits": [
513
+ "train"
514
+ ],
515
+ "evaluation_splits": [
516
+ "train"
517
+ ],
518
+ "few_shots_split": null,
519
+ "few_shots_select": null,
520
+ "generation_size": -1,
521
+ "stop_sequence": [
522
+ "</s>",
523
+ "Q:",
524
+ "\n\n"
525
+ ],
526
+ "output_regex": null,
527
+ "frozen": false,
528
+ "suite": [
529
+ "lighteval"
530
+ ],
531
+ "original_num_docs": 181,
532
+ "effective_num_docs": 181,
533
+ "trust_dataset": true,
534
+ "must_remove_duplicate_docs": null
535
+ },
536
+ "lighteval|bigbench:sports_understanding": {
537
+ "name": "bigbench:sports_understanding",
538
+ "prompt_function": "bbh_lighteval",
539
+ "hf_repo": "lighteval/bbh",
540
+ "hf_subset": "sports_understanding",
541
+ "metric": [
542
+ "loglikelihood_acc_single_token"
543
+ ],
544
+ "hf_avail_splits": [
545
+ "train"
546
+ ],
547
+ "evaluation_splits": [
548
+ "train"
549
+ ],
550
+ "few_shots_split": null,
551
+ "few_shots_select": null,
552
+ "generation_size": -1,
553
+ "stop_sequence": [
554
+ "</s>",
555
+ "Q:",
556
+ "\n\n"
557
+ ],
558
+ "output_regex": null,
559
+ "frozen": false,
560
+ "suite": [
561
+ "lighteval"
562
+ ],
563
+ "original_num_docs": 1000,
564
+ "effective_num_docs": 1000,
565
+ "trust_dataset": true,
566
+ "must_remove_duplicate_docs": null
567
+ },
568
+ "lighteval|bigbench:temporal_sequences": {
569
+ "name": "bigbench:temporal_sequences",
570
+ "prompt_function": "bbh_lighteval",
571
+ "hf_repo": "lighteval/bbh",
572
+ "hf_subset": "temporal_sequences",
573
+ "metric": [
574
+ "loglikelihood_acc_single_token"
575
+ ],
576
+ "hf_avail_splits": [
577
+ "train"
578
+ ],
579
+ "evaluation_splits": [
580
+ "train"
581
+ ],
582
+ "few_shots_split": null,
583
+ "few_shots_select": null,
584
+ "generation_size": -1,
585
+ "stop_sequence": [
586
+ "</s>",
587
+ "Q:",
588
+ "\n\n"
589
+ ],
590
+ "output_regex": null,
591
+ "frozen": false,
592
+ "suite": [
593
+ "lighteval"
594
+ ],
595
+ "original_num_docs": 1000,
596
+ "effective_num_docs": 1000,
597
+ "trust_dataset": true,
598
+ "must_remove_duplicate_docs": null
599
+ },
600
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects": {
601
+ "name": "bigbench:tracking_shuffled_objects_five_objects",
602
+ "prompt_function": "bbh_lighteval",
603
+ "hf_repo": "lighteval/bbh",
604
+ "hf_subset": "tracking_shuffled_objects_five_objects",
605
+ "metric": [
606
+ "loglikelihood_acc_single_token"
607
+ ],
608
+ "hf_avail_splits": [
609
+ "train"
610
+ ],
611
+ "evaluation_splits": [
612
+ "train"
613
+ ],
614
+ "few_shots_split": null,
615
+ "few_shots_select": null,
616
+ "generation_size": -1,
617
+ "stop_sequence": [
618
+ "</s>",
619
+ "Q:",
620
+ "\n\n"
621
+ ],
622
+ "output_regex": null,
623
+ "frozen": false,
624
+ "suite": [
625
+ "lighteval"
626
+ ],
627
+ "original_num_docs": 1250,
628
+ "effective_num_docs": 1250,
629
+ "trust_dataset": true,
630
+ "must_remove_duplicate_docs": null
631
+ },
632
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects": {
633
+ "name": "bigbench:tracking_shuffled_objects_seven_objects",
634
+ "prompt_function": "bbh_lighteval",
635
+ "hf_repo": "lighteval/bbh",
636
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
637
+ "metric": [
638
+ "loglikelihood_acc_single_token"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "train"
642
+ ],
643
+ "evaluation_splits": [
644
+ "train"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": -1,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "lighteval"
658
+ ],
659
+ "original_num_docs": 1750,
660
+ "effective_num_docs": 1750,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects": {
665
+ "name": "bigbench:tracking_shuffled_objects_three_objects",
666
+ "prompt_function": "bbh_lighteval",
667
+ "hf_repo": "lighteval/bbh",
668
+ "hf_subset": "tracking_shuffled_objects_three_objects",
669
+ "metric": [
670
+ "loglikelihood_acc_single_token"
671
+ ],
672
+ "hf_avail_splits": [
673
+ "train"
674
+ ],
675
+ "evaluation_splits": [
676
+ "train"
677
+ ],
678
+ "few_shots_split": null,
679
+ "few_shots_select": null,
680
+ "generation_size": -1,
681
+ "stop_sequence": [
682
+ "</s>",
683
+ "Q:",
684
+ "\n\n"
685
+ ],
686
+ "output_regex": null,
687
+ "frozen": false,
688
+ "suite": [
689
+ "lighteval"
690
+ ],
691
+ "original_num_docs": 300,
692
+ "effective_num_docs": 300,
693
+ "trust_dataset": true,
694
+ "must_remove_duplicate_docs": null
695
+ }
696
+ },
697
+ "summary_tasks": {
698
+ "lighteval|bigbench:causal_judgment|0": {
699
+ "hashes": {
700
+ "hash_examples": "dfb1ae47218f2850",
701
+ "hash_full_prompts": "dfb1ae47218f2850",
702
+ "hash_input_tokens": "b988c34f02a78344",
703
+ "hash_cont_tokens": "571d3106dbe55a63"
704
+ },
705
+ "truncated": 0,
706
+ "non_truncated": 190,
707
+ "padded": 189,
708
+ "non_padded": 1,
709
+ "effective_few_shots": 0.0,
710
+ "num_truncated_few_shots": 0
711
+ },
712
+ "lighteval|bigbench:date_understanding|0": {
713
+ "hashes": {
714
+ "hash_examples": "2b823c41500a6ec2",
715
+ "hash_full_prompts": "2b823c41500a6ec2",
716
+ "hash_input_tokens": "8a9740c8c479b27e",
717
+ "hash_cont_tokens": "3925b98e70694109"
718
+ },
719
+ "truncated": 0,
720
+ "non_truncated": 369,
721
+ "padded": 369,
722
+ "non_padded": 0,
723
+ "effective_few_shots": 0.0,
724
+ "num_truncated_few_shots": 0
725
+ },
726
+ "lighteval|bigbench:disambiguation_qa|0": {
727
+ "hashes": {
728
+ "hash_examples": "2a4c3d41db198cea",
729
+ "hash_full_prompts": "2a4c3d41db198cea",
730
+ "hash_input_tokens": "971d53f05bc79941",
731
+ "hash_cont_tokens": "442c543045afc30c"
732
+ },
733
+ "truncated": 0,
734
+ "non_truncated": 258,
735
+ "padded": 255,
736
+ "non_padded": 3,
737
+ "effective_few_shots": 0.0,
738
+ "num_truncated_few_shots": 0
739
+ },
740
+ "lighteval|bigbench:geometric_shapes|0": {
741
+ "hashes": {
742
+ "hash_examples": "24aa261103911b72",
743
+ "hash_full_prompts": "24aa261103911b72",
744
+ "hash_input_tokens": "54d3f7cfb339f217",
745
+ "hash_cont_tokens": "e6270b3f75fd8364"
746
+ },
747
+ "truncated": 0,
748
+ "non_truncated": 360,
749
+ "padded": 360,
750
+ "non_padded": 0,
751
+ "effective_few_shots": 0.0,
752
+ "num_truncated_few_shots": 0
753
+ },
754
+ "lighteval|bigbench:logical_deduction_five_objects|0": {
755
+ "hashes": {
756
+ "hash_examples": "cb5bdc92afc41f83",
757
+ "hash_full_prompts": "cb5bdc92afc41f83",
758
+ "hash_input_tokens": "818ff8bf12580516",
759
+ "hash_cont_tokens": "403379beb0f27bab"
760
+ },
761
+ "truncated": 0,
762
+ "non_truncated": 500,
763
+ "padded": 500,
764
+ "non_padded": 0,
765
+ "effective_few_shots": 0.0,
766
+ "num_truncated_few_shots": 0
767
+ },
768
+ "lighteval|bigbench:logical_deduction_seven_objects|0": {
769
+ "hashes": {
770
+ "hash_examples": "b6805ea696739f9f",
771
+ "hash_full_prompts": "b6805ea696739f9f",
772
+ "hash_input_tokens": "bd1f1e154f0e4570",
773
+ "hash_cont_tokens": "2ab02dd654f1e434"
774
+ },
775
+ "truncated": 0,
776
+ "non_truncated": 700,
777
+ "padded": 700,
778
+ "non_padded": 0,
779
+ "effective_few_shots": 0.0,
780
+ "num_truncated_few_shots": 0
781
+ },
782
+ "lighteval|bigbench:logical_deduction_three_objects|0": {
783
+ "hashes": {
784
+ "hash_examples": "0509e5712ab9bcdb",
785
+ "hash_full_prompts": "0509e5712ab9bcdb",
786
+ "hash_input_tokens": "ea5d60ad52932480",
787
+ "hash_cont_tokens": "b2d8116b6d4720fe"
788
+ },
789
+ "truncated": 0,
790
+ "non_truncated": 300,
791
+ "padded": 264,
792
+ "non_padded": 36,
793
+ "effective_few_shots": 0.0,
794
+ "num_truncated_few_shots": 0
795
+ },
796
+ "lighteval|bigbench:movie_recommendation|0": {
797
+ "hashes": {
798
+ "hash_examples": "530cc6f737830f45",
799
+ "hash_full_prompts": "530cc6f737830f45",
800
+ "hash_input_tokens": "51f8e1d213962bc0",
801
+ "hash_cont_tokens": "337988ba0f6b6159"
802
+ },
803
+ "truncated": 0,
804
+ "non_truncated": 500,
805
+ "padded": 495,
806
+ "non_padded": 5,
807
+ "effective_few_shots": 0.0,
808
+ "num_truncated_few_shots": 0
809
+ },
810
+ "lighteval|bigbench:navigate|0": {
811
+ "hashes": {
812
+ "hash_examples": "7962ef85d0058b9a",
813
+ "hash_full_prompts": "7962ef85d0058b9a",
814
+ "hash_input_tokens": "f7bfb6cee99fdb77",
815
+ "hash_cont_tokens": "135e45e3e01fd667"
816
+ },
817
+ "truncated": 0,
818
+ "non_truncated": 1000,
819
+ "padded": 990,
820
+ "non_padded": 10,
821
+ "effective_few_shots": 0.0,
822
+ "num_truncated_few_shots": 0
823
+ },
824
+ "lighteval|bigbench:reasoning_about_colored_objects|0": {
825
+ "hashes": {
826
+ "hash_examples": "39be1ab1677a651d",
827
+ "hash_full_prompts": "39be1ab1677a651d",
828
+ "hash_input_tokens": "bf06e15e459f6940",
829
+ "hash_cont_tokens": "c887bb4e668fb298"
830
+ },
831
+ "truncated": 0,
832
+ "non_truncated": 2000,
833
+ "padded": 1986,
834
+ "non_padded": 14,
835
+ "effective_few_shots": 0.0,
836
+ "num_truncated_few_shots": 0
837
+ },
838
+ "lighteval|bigbench:ruin_names|0": {
839
+ "hashes": {
840
+ "hash_examples": "e9b96b31d2154941",
841
+ "hash_full_prompts": "e9b96b31d2154941",
842
+ "hash_input_tokens": "94188f6de28af148",
843
+ "hash_cont_tokens": "19d0b304bf4664c1"
844
+ },
845
+ "truncated": 0,
846
+ "non_truncated": 448,
847
+ "padded": 439,
848
+ "non_padded": 9,
849
+ "effective_few_shots": 0.0,
850
+ "num_truncated_few_shots": 0
851
+ },
852
+ "lighteval|bigbench:salient_translation_error_detection|0": {
853
+ "hashes": {
854
+ "hash_examples": "951ac59f7ad0427d",
855
+ "hash_full_prompts": "951ac59f7ad0427d",
856
+ "hash_input_tokens": "56106a4f397c186a",
857
+ "hash_cont_tokens": "04046100381b58a8"
858
+ },
859
+ "truncated": 0,
860
+ "non_truncated": 998,
861
+ "padded": 998,
862
+ "non_padded": 0,
863
+ "effective_few_shots": 0.0,
864
+ "num_truncated_few_shots": 0
865
+ },
866
+ "lighteval|bigbench:snarks|0": {
867
+ "hashes": {
868
+ "hash_examples": "3a53eb9b9d758534",
869
+ "hash_full_prompts": "3a53eb9b9d758534",
870
+ "hash_input_tokens": "518be57c5be4521c",
871
+ "hash_cont_tokens": "4b4988d6f536f2c4"
872
+ },
873
+ "truncated": 0,
874
+ "non_truncated": 181,
875
+ "padded": 172,
876
+ "non_padded": 9,
877
+ "effective_few_shots": 0.0,
878
+ "num_truncated_few_shots": 0
879
+ },
880
+ "lighteval|bigbench:sports_understanding|0": {
881
+ "hashes": {
882
+ "hash_examples": "bd65741f00770373",
883
+ "hash_full_prompts": "bd65741f00770373",
884
+ "hash_input_tokens": "7c6a25ac517dfee1",
885
+ "hash_cont_tokens": "dc942e2d909c04d4"
886
+ },
887
+ "truncated": 0,
888
+ "non_truncated": 1000,
889
+ "padded": 1000,
890
+ "non_padded": 0,
891
+ "effective_few_shots": 0.0,
892
+ "num_truncated_few_shots": 0
893
+ },
894
+ "lighteval|bigbench:temporal_sequences|0": {
895
+ "hashes": {
896
+ "hash_examples": "1d13139f47cb2df7",
897
+ "hash_full_prompts": "1d13139f47cb2df7",
898
+ "hash_input_tokens": "311ec4715efb4e31",
899
+ "hash_cont_tokens": "5482aa5b7d0700b0"
900
+ },
901
+ "truncated": 0,
902
+ "non_truncated": 1000,
903
+ "padded": 992,
904
+ "non_padded": 8,
905
+ "effective_few_shots": 0.0,
906
+ "num_truncated_few_shots": 0
907
+ },
908
+ "lighteval|bigbench:tracking_shuffled_objects_five_objects|0": {
909
+ "hashes": {
910
+ "hash_examples": "8770a702a9646648",
911
+ "hash_full_prompts": "8770a702a9646648",
912
+ "hash_input_tokens": "8560633c07ed6feb",
913
+ "hash_cont_tokens": "98f95fb7447217d1"
914
+ },
915
+ "truncated": 0,
916
+ "non_truncated": 1250,
917
+ "padded": 1250,
918
+ "non_padded": 0,
919
+ "effective_few_shots": 0.0,
920
+ "num_truncated_few_shots": 0
921
+ },
922
+ "lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": {
923
+ "hashes": {
924
+ "hash_examples": "b469b7d073824a59",
925
+ "hash_full_prompts": "b469b7d073824a59",
926
+ "hash_input_tokens": "0cbedd7da5509eea",
927
+ "hash_cont_tokens": "05f35788054043f3"
928
+ },
929
+ "truncated": 0,
930
+ "non_truncated": 1750,
931
+ "padded": 1750,
932
+ "non_padded": 0,
933
+ "effective_few_shots": 0.0,
934
+ "num_truncated_few_shots": 0
935
+ },
936
+ "lighteval|bigbench:tracking_shuffled_objects_three_objects|0": {
937
+ "hashes": {
938
+ "hash_examples": "0509e5712ab9bcdb",
939
+ "hash_full_prompts": "0509e5712ab9bcdb",
940
+ "hash_input_tokens": "ea5d60ad52932480",
941
+ "hash_cont_tokens": "d366642d17153e4a"
942
+ },
943
+ "truncated": 0,
944
+ "non_truncated": 300,
945
+ "padded": 264,
946
+ "non_padded": 36,
947
+ "effective_few_shots": 0.0,
948
+ "num_truncated_few_shots": 0
949
+ }
950
+ },
951
+ "summary_general": {
952
+ "hashes": {
953
+ "hash_examples": "51a30c4501ba4586",
954
+ "hash_full_prompts": "51a30c4501ba4586",
955
+ "hash_input_tokens": "8405c531364ac520",
956
+ "hash_cont_tokens": "374be527beaf3fbb"
957
+ },
958
+ "truncated": 0,
959
+ "non_truncated": 13104,
960
+ "padded": 12973,
961
+ "non_padded": 131,
962
+ "num_truncated_few_shots": 0
963
+ }
964
+ }