lewtun HF staff commited on
Commit
acad715
·
verified ·
1 Parent(s): d64ecd4

Upload eval_results/HuggingFaceH4/zephyr-7b-beta-ift/v1.5/bbh/results_2024-03-19T09-27-11.722655.json with huggingface_hub

Browse files
eval_results/HuggingFaceH4/zephyr-7b-beta-ift/v1.5/bbh/results_2024-03-19T09-27-11.722655.json ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2786134.983495885,
9
+ "end_time": 2786411.547436556,
10
+ "total_evaluation_time_secondes": "276.5639406708069",
11
+ "model_name": "HuggingFaceH4/zephyr-7b-beta-ift",
12
+ "model_sha": "861515aa7e4d110e576effa7a4e7686ee0c6d807",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "harness|bbh:causal_judgment|3": {
19
+ "em": 0.43315508021390375,
20
+ "em_stderr": 0.03633267411102587,
21
+ "qem": 0.43315508021390375,
22
+ "qem_stderr": 0.03633267411102587,
23
+ "pem": 0.5614973262032086,
24
+ "pem_stderr": 0.03638341809400995,
25
+ "pqem": 0.5614973262032086,
26
+ "pqem_stderr": 0.03638341809400995,
27
+ "perfect_em": 0.43315508021390375,
28
+ "perfect_em_stderr": 0.03633267411102587
29
+ },
30
+ "harness|bbh:date_understanding|3": {
31
+ "em": 0.5,
32
+ "em_stderr": 0.031686212526223896,
33
+ "qem": 0.5,
34
+ "qem_stderr": 0.031686212526223896,
35
+ "pem": 0.5,
36
+ "pem_stderr": 0.031686212526223896,
37
+ "pqem": 0.584,
38
+ "pqem_stderr": 0.031235856237014553,
39
+ "perfect_em": 0.5,
40
+ "perfect_em_stderr": 0.031686212526223896
41
+ },
42
+ "harness|bbh:disambiguation_qa|3": {
43
+ "em": 0.612,
44
+ "em_stderr": 0.03088103874899392,
45
+ "qem": 0.612,
46
+ "qem_stderr": 0.03088103874899392,
47
+ "pem": 0.612,
48
+ "pem_stderr": 0.03088103874899392,
49
+ "pqem": 0.716,
50
+ "pqem_stderr": 0.02857695873043741,
51
+ "perfect_em": 0.612,
52
+ "perfect_em_stderr": 0.03088103874899392
53
+ },
54
+ "harness|bbh:geometric_shapes|3": {
55
+ "em": 0.24,
56
+ "em_stderr": 0.027065293652239007,
57
+ "qem": 0.24,
58
+ "qem_stderr": 0.027065293652239007,
59
+ "pem": 0.24,
60
+ "pem_stderr": 0.027065293652239007,
61
+ "pqem": 0.24,
62
+ "pqem_stderr": 0.027065293652239007,
63
+ "perfect_em": 0.24,
64
+ "perfect_em_stderr": 0.027065293652239007
65
+ },
66
+ "harness|bbh:logical_deduction_five_objects|3": {
67
+ "em": 0.356,
68
+ "em_stderr": 0.030343680657153215,
69
+ "qem": 0.356,
70
+ "qem_stderr": 0.030343680657153215,
71
+ "pem": 0.356,
72
+ "pem_stderr": 0.030343680657153215,
73
+ "pqem": 0.448,
74
+ "pqem_stderr": 0.03151438761115355,
75
+ "perfect_em": 0.356,
76
+ "perfect_em_stderr": 0.030343680657153215
77
+ },
78
+ "harness|bbh:logical_deduction_seven_objects|3": {
79
+ "em": 0.264,
80
+ "em_stderr": 0.027934518957690908,
81
+ "qem": 0.264,
82
+ "qem_stderr": 0.027934518957690908,
83
+ "pem": 0.264,
84
+ "pem_stderr": 0.027934518957690908,
85
+ "pqem": 0.368,
86
+ "pqem_stderr": 0.03056207062099316,
87
+ "perfect_em": 0.264,
88
+ "perfect_em_stderr": 0.027934518957690908
89
+ },
90
+ "harness|bbh:logical_deduction_three_objects|3": {
91
+ "em": 0.532,
92
+ "em_stderr": 0.031621252575725504,
93
+ "qem": 0.532,
94
+ "qem_stderr": 0.031621252575725504,
95
+ "pem": 0.532,
96
+ "pem_stderr": 0.031621252575725504,
97
+ "pqem": 0.74,
98
+ "pqem_stderr": 0.027797315752644304,
99
+ "perfect_em": 0.532,
100
+ "perfect_em_stderr": 0.031621252575725504
101
+ },
102
+ "harness|bbh:movie_recommendation|3": {
103
+ "em": 0.5261044176706827,
104
+ "em_stderr": 0.0317067307000546,
105
+ "qem": 0.5261044176706827,
106
+ "qem_stderr": 0.0317067307000546,
107
+ "pem": 0.5542168674698795,
108
+ "pem_stderr": 0.0315628233794839,
109
+ "pqem": 0.6265060240963856,
110
+ "pqem_stderr": 0.030716985021643353,
111
+ "perfect_em": 0.5261044176706827,
112
+ "perfect_em_stderr": 0.0317067307000546
113
+ },
114
+ "harness|bbh:navigate|3": {
115
+ "em": 0.588,
116
+ "em_stderr": 0.0311915960260229,
117
+ "qem": 0.588,
118
+ "qem_stderr": 0.0311915960260229,
119
+ "pem": 0.588,
120
+ "pem_stderr": 0.0311915960260229,
121
+ "pqem": 0.588,
122
+ "pqem_stderr": 0.0311915960260229,
123
+ "perfect_em": 0.588,
124
+ "perfect_em_stderr": 0.0311915960260229
125
+ },
126
+ "harness|bbh:reasoning_about_colored_objects|3": {
127
+ "em": 0.176,
128
+ "em_stderr": 0.024133497525457123,
129
+ "qem": 0.176,
130
+ "qem_stderr": 0.024133497525457123,
131
+ "pem": 0.336,
132
+ "pem_stderr": 0.029933259094191516,
133
+ "pqem": 0.48,
134
+ "pqem_stderr": 0.031660853408495185,
135
+ "perfect_em": 0.176,
136
+ "perfect_em_stderr": 0.024133497525457123
137
+ },
138
+ "harness|bbh:ruin_names|3": {
139
+ "em": 0.49193548387096775,
140
+ "em_stderr": 0.031810099711288585,
141
+ "qem": 0.49193548387096775,
142
+ "qem_stderr": 0.031810099711288585,
143
+ "pem": 0.4959677419354839,
144
+ "pem_stderr": 0.0318132035898842,
145
+ "pqem": 0.5725806451612904,
146
+ "pqem_stderr": 0.031477261888285146,
147
+ "perfect_em": 0.49193548387096775,
148
+ "perfect_em_stderr": 0.031810099711288585
149
+ },
150
+ "harness|bbh:salient_translation_error_detection|3": {
151
+ "em": 0.344,
152
+ "em_stderr": 0.030104503392316392,
153
+ "qem": 0.344,
154
+ "qem_stderr": 0.030104503392316392,
155
+ "pem": 0.344,
156
+ "pem_stderr": 0.030104503392316392,
157
+ "pqem": 0.484,
158
+ "pqem_stderr": 0.03166998503010742,
159
+ "perfect_em": 0.344,
160
+ "perfect_em_stderr": 0.030104503392316392
161
+ },
162
+ "harness|bbh:snarks|3": {
163
+ "em": 0.5786516853932584,
164
+ "em_stderr": 0.037114414059601884,
165
+ "qem": 0.5786516853932584,
166
+ "qem_stderr": 0.037114414059601884,
167
+ "pem": 0.5786516853932584,
168
+ "pem_stderr": 0.037114414059601884,
169
+ "pqem": 0.6067415730337079,
170
+ "pqem_stderr": 0.036715907095165826,
171
+ "perfect_em": 0.5786516853932584,
172
+ "perfect_em_stderr": 0.037114414059601884
173
+ },
174
+ "harness|bbh:sports_understanding|3": {
175
+ "em": 0.256,
176
+ "em_stderr": 0.0276571087182049,
177
+ "qem": 0.256,
178
+ "qem_stderr": 0.0276571087182049,
179
+ "pem": 0.792,
180
+ "pem_stderr": 0.025721398901416392,
181
+ "pqem": 0.792,
182
+ "pqem_stderr": 0.025721398901416392,
183
+ "perfect_em": 0.256,
184
+ "perfect_em_stderr": 0.0276571087182049
185
+ },
186
+ "harness|bbh:temporal_sequences|3": {
187
+ "em": 0.112,
188
+ "em_stderr": 0.019985536939171433,
189
+ "qem": 0.112,
190
+ "qem_stderr": 0.019985536939171433,
191
+ "pem": 0.112,
192
+ "pem_stderr": 0.019985536939171433,
193
+ "pqem": 0.36,
194
+ "pqem_stderr": 0.03041876402517498,
195
+ "perfect_em": 0.112,
196
+ "perfect_em_stderr": 0.019985536939171433
197
+ },
198
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
199
+ "em": 0.164,
200
+ "em_stderr": 0.023465261002076757,
201
+ "qem": 0.164,
202
+ "qem_stderr": 0.023465261002076757,
203
+ "pem": 0.164,
204
+ "pem_stderr": 0.023465261002076757,
205
+ "pqem": 0.36,
206
+ "pqem_stderr": 0.030418764025174995,
207
+ "perfect_em": 0.164,
208
+ "perfect_em_stderr": 0.023465261002076757
209
+ },
210
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
211
+ "em": 0.1,
212
+ "em_stderr": 0.01901172751573437,
213
+ "qem": 0.1,
214
+ "qem_stderr": 0.01901172751573437,
215
+ "pem": 0.108,
216
+ "pem_stderr": 0.019669559381568755,
217
+ "pqem": 0.24,
218
+ "pqem_stderr": 0.027065293652239003,
219
+ "perfect_em": 0.1,
220
+ "perfect_em_stderr": 0.01901172751573437
221
+ },
222
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
223
+ "em": 0.376,
224
+ "em_stderr": 0.030696336267394594,
225
+ "qem": 0.376,
226
+ "qem_stderr": 0.030696336267394594,
227
+ "pem": 0.396,
228
+ "pem_stderr": 0.03099319785457785,
229
+ "pqem": 0.704,
230
+ "pqem_stderr": 0.028928939388379638,
231
+ "perfect_em": 0.376,
232
+ "perfect_em_stderr": 0.030696336267394594
233
+ },
234
+ "harness|bbh:_average|3": {
235
+ "em": 0.3694359259527118,
236
+ "em_stderr": 0.029041193504798654,
237
+ "qem": 0.3694359259527118,
238
+ "qem_stderr": 0.029041193504798654,
239
+ "pem": 0.41857409005565727,
240
+ "pem_stderr": 0.0293038982684638,
241
+ "pqem": 0.5261847538052551,
242
+ "pqem_stderr": 0.030506724953366482,
243
+ "perfect_em": 0.3694359259527118,
244
+ "perfect_em_stderr": 0.029041193504798654
245
+ }
246
+ },
247
+ "versions": {
248
+ "harness|bbh:causal_judgment|3": 0,
249
+ "harness|bbh:date_understanding|3": 0,
250
+ "harness|bbh:disambiguation_qa|3": 0,
251
+ "harness|bbh:geometric_shapes|3": 0,
252
+ "harness|bbh:logical_deduction_five_objects|3": 0,
253
+ "harness|bbh:logical_deduction_seven_objects|3": 0,
254
+ "harness|bbh:logical_deduction_three_objects|3": 0,
255
+ "harness|bbh:movie_recommendation|3": 0,
256
+ "harness|bbh:navigate|3": 0,
257
+ "harness|bbh:reasoning_about_colored_objects|3": 0,
258
+ "harness|bbh:ruin_names|3": 0,
259
+ "harness|bbh:salient_translation_error_detection|3": 0,
260
+ "harness|bbh:snarks|3": 0,
261
+ "harness|bbh:sports_understanding|3": 0,
262
+ "harness|bbh:temporal_sequences|3": 0,
263
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
264
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
265
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": 0
266
+ },
267
+ "config_tasks": {
268
+ "harness|bbh:causal_judgment": {
269
+ "name": "bbh:causal_judgment",
270
+ "prompt_function": "bbh_causal_judgment",
271
+ "hf_repo": "lukaemon/bbh",
272
+ "hf_subset": "causal_judgement",
273
+ "metric": [
274
+ "exact_match",
275
+ "quasi_exact_match",
276
+ "prefix_exact_match",
277
+ "prefix_quasi_exact_match",
278
+ "perfect_exact_match"
279
+ ],
280
+ "hf_avail_splits": [
281
+ "test"
282
+ ],
283
+ "evaluation_splits": [
284
+ "test"
285
+ ],
286
+ "few_shots_split": null,
287
+ "few_shots_select": null,
288
+ "generation_size": 20,
289
+ "stop_sequence": [
290
+ "</s>",
291
+ "Q:",
292
+ "\n\n"
293
+ ],
294
+ "output_regex": null,
295
+ "frozen": false,
296
+ "suite": [
297
+ "harness"
298
+ ],
299
+ "original_num_docs": 187,
300
+ "effective_num_docs": 187,
301
+ "trust_dataset": true,
302
+ "must_remove_duplicate_docs": null
303
+ },
304
+ "harness|bbh:date_understanding": {
305
+ "name": "bbh:date_understanding",
306
+ "prompt_function": "bbh_date_understanding",
307
+ "hf_repo": "lukaemon/bbh",
308
+ "hf_subset": "date_understanding",
309
+ "metric": [
310
+ "exact_match",
311
+ "quasi_exact_match",
312
+ "prefix_exact_match",
313
+ "prefix_quasi_exact_match",
314
+ "perfect_exact_match"
315
+ ],
316
+ "hf_avail_splits": [
317
+ "test"
318
+ ],
319
+ "evaluation_splits": [
320
+ "test"
321
+ ],
322
+ "few_shots_split": null,
323
+ "few_shots_select": null,
324
+ "generation_size": 20,
325
+ "stop_sequence": [
326
+ "</s>",
327
+ "Q:",
328
+ "\n\n"
329
+ ],
330
+ "output_regex": null,
331
+ "frozen": false,
332
+ "suite": [
333
+ "harness"
334
+ ],
335
+ "original_num_docs": 250,
336
+ "effective_num_docs": 250,
337
+ "trust_dataset": true,
338
+ "must_remove_duplicate_docs": null
339
+ },
340
+ "harness|bbh:disambiguation_qa": {
341
+ "name": "bbh:disambiguation_qa",
342
+ "prompt_function": "bbh_disambiguation_qa",
343
+ "hf_repo": "lukaemon/bbh",
344
+ "hf_subset": "disambiguation_qa",
345
+ "metric": [
346
+ "exact_match",
347
+ "quasi_exact_match",
348
+ "prefix_exact_match",
349
+ "prefix_quasi_exact_match",
350
+ "perfect_exact_match"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "test"
354
+ ],
355
+ "evaluation_splits": [
356
+ "test"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": 20,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "harness"
370
+ ],
371
+ "original_num_docs": 250,
372
+ "effective_num_docs": 250,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "harness|bbh:geometric_shapes": {
377
+ "name": "bbh:geometric_shapes",
378
+ "prompt_function": "bbh_geometric_shapes",
379
+ "hf_repo": "lukaemon/bbh",
380
+ "hf_subset": "geometric_shapes",
381
+ "metric": [
382
+ "exact_match",
383
+ "quasi_exact_match",
384
+ "prefix_exact_match",
385
+ "prefix_quasi_exact_match",
386
+ "perfect_exact_match"
387
+ ],
388
+ "hf_avail_splits": [
389
+ "test"
390
+ ],
391
+ "evaluation_splits": [
392
+ "test"
393
+ ],
394
+ "few_shots_split": null,
395
+ "few_shots_select": null,
396
+ "generation_size": 20,
397
+ "stop_sequence": [
398
+ "</s>",
399
+ "Q:",
400
+ "\n\n"
401
+ ],
402
+ "output_regex": null,
403
+ "frozen": false,
404
+ "suite": [
405
+ "harness"
406
+ ],
407
+ "original_num_docs": 250,
408
+ "effective_num_docs": 250,
409
+ "trust_dataset": true,
410
+ "must_remove_duplicate_docs": null
411
+ },
412
+ "harness|bbh:logical_deduction_five_objects": {
413
+ "name": "bbh:logical_deduction_five_objects",
414
+ "prompt_function": "bbh_logical_deduction_five_objects",
415
+ "hf_repo": "lukaemon/bbh",
416
+ "hf_subset": "logical_deduction_five_objects",
417
+ "metric": [
418
+ "exact_match",
419
+ "quasi_exact_match",
420
+ "prefix_exact_match",
421
+ "prefix_quasi_exact_match",
422
+ "perfect_exact_match"
423
+ ],
424
+ "hf_avail_splits": [
425
+ "test"
426
+ ],
427
+ "evaluation_splits": [
428
+ "test"
429
+ ],
430
+ "few_shots_split": null,
431
+ "few_shots_select": null,
432
+ "generation_size": 20,
433
+ "stop_sequence": [
434
+ "</s>",
435
+ "Q:",
436
+ "\n\n"
437
+ ],
438
+ "output_regex": null,
439
+ "frozen": false,
440
+ "suite": [
441
+ "harness"
442
+ ],
443
+ "original_num_docs": 250,
444
+ "effective_num_docs": 250,
445
+ "trust_dataset": true,
446
+ "must_remove_duplicate_docs": null
447
+ },
448
+ "harness|bbh:logical_deduction_seven_objects": {
449
+ "name": "bbh:logical_deduction_seven_objects",
450
+ "prompt_function": "bbh_logical_deduction_seven_objects",
451
+ "hf_repo": "lukaemon/bbh",
452
+ "hf_subset": "logical_deduction_seven_objects",
453
+ "metric": [
454
+ "exact_match",
455
+ "quasi_exact_match",
456
+ "prefix_exact_match",
457
+ "prefix_quasi_exact_match",
458
+ "perfect_exact_match"
459
+ ],
460
+ "hf_avail_splits": [
461
+ "test"
462
+ ],
463
+ "evaluation_splits": [
464
+ "test"
465
+ ],
466
+ "few_shots_split": null,
467
+ "few_shots_select": null,
468
+ "generation_size": 20,
469
+ "stop_sequence": [
470
+ "</s>",
471
+ "Q:",
472
+ "\n\n"
473
+ ],
474
+ "output_regex": null,
475
+ "frozen": false,
476
+ "suite": [
477
+ "harness"
478
+ ],
479
+ "original_num_docs": 250,
480
+ "effective_num_docs": 250,
481
+ "trust_dataset": true,
482
+ "must_remove_duplicate_docs": null
483
+ },
484
+ "harness|bbh:logical_deduction_three_objects": {
485
+ "name": "bbh:logical_deduction_three_objects",
486
+ "prompt_function": "bbh_logical_deduction_three_objects",
487
+ "hf_repo": "lukaemon/bbh",
488
+ "hf_subset": "logical_deduction_three_objects",
489
+ "metric": [
490
+ "exact_match",
491
+ "quasi_exact_match",
492
+ "prefix_exact_match",
493
+ "prefix_quasi_exact_match",
494
+ "perfect_exact_match"
495
+ ],
496
+ "hf_avail_splits": [
497
+ "test"
498
+ ],
499
+ "evaluation_splits": [
500
+ "test"
501
+ ],
502
+ "few_shots_split": null,
503
+ "few_shots_select": null,
504
+ "generation_size": 20,
505
+ "stop_sequence": [
506
+ "</s>",
507
+ "Q:",
508
+ "\n\n"
509
+ ],
510
+ "output_regex": null,
511
+ "frozen": false,
512
+ "suite": [
513
+ "harness"
514
+ ],
515
+ "original_num_docs": 250,
516
+ "effective_num_docs": 250,
517
+ "trust_dataset": true,
518
+ "must_remove_duplicate_docs": null
519
+ },
520
+ "harness|bbh:movie_recommendation": {
521
+ "name": "bbh:movie_recommendation",
522
+ "prompt_function": "bbh_movie_recommendation",
523
+ "hf_repo": "lukaemon/bbh",
524
+ "hf_subset": "movie_recommendation",
525
+ "metric": [
526
+ "exact_match",
527
+ "quasi_exact_match",
528
+ "prefix_exact_match",
529
+ "prefix_quasi_exact_match",
530
+ "perfect_exact_match"
531
+ ],
532
+ "hf_avail_splits": [
533
+ "test"
534
+ ],
535
+ "evaluation_splits": [
536
+ "test"
537
+ ],
538
+ "few_shots_split": null,
539
+ "few_shots_select": null,
540
+ "generation_size": 20,
541
+ "stop_sequence": [
542
+ "</s>",
543
+ "Q:",
544
+ "\n\n"
545
+ ],
546
+ "output_regex": null,
547
+ "frozen": false,
548
+ "suite": [
549
+ "harness"
550
+ ],
551
+ "original_num_docs": 249,
552
+ "effective_num_docs": 249,
553
+ "trust_dataset": true,
554
+ "must_remove_duplicate_docs": null
555
+ },
556
+ "harness|bbh:navigate": {
557
+ "name": "bbh:navigate",
558
+ "prompt_function": "bbh_navigate",
559
+ "hf_repo": "lukaemon/bbh",
560
+ "hf_subset": "navigate",
561
+ "metric": [
562
+ "exact_match",
563
+ "quasi_exact_match",
564
+ "prefix_exact_match",
565
+ "prefix_quasi_exact_match",
566
+ "perfect_exact_match"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": null,
576
+ "generation_size": 20,
577
+ "stop_sequence": [
578
+ "</s>",
579
+ "Q:",
580
+ "\n\n"
581
+ ],
582
+ "output_regex": null,
583
+ "frozen": false,
584
+ "suite": [
585
+ "harness"
586
+ ],
587
+ "original_num_docs": 250,
588
+ "effective_num_docs": 250,
589
+ "trust_dataset": true,
590
+ "must_remove_duplicate_docs": null
591
+ },
592
+ "harness|bbh:reasoning_about_colored_objects": {
593
+ "name": "bbh:reasoning_about_colored_objects",
594
+ "prompt_function": "bbh_reasoning_about_colored_objects",
595
+ "hf_repo": "lukaemon/bbh",
596
+ "hf_subset": "reasoning_about_colored_objects",
597
+ "metric": [
598
+ "exact_match",
599
+ "quasi_exact_match",
600
+ "prefix_exact_match",
601
+ "prefix_quasi_exact_match",
602
+ "perfect_exact_match"
603
+ ],
604
+ "hf_avail_splits": [
605
+ "test"
606
+ ],
607
+ "evaluation_splits": [
608
+ "test"
609
+ ],
610
+ "few_shots_split": null,
611
+ "few_shots_select": null,
612
+ "generation_size": 20,
613
+ "stop_sequence": [
614
+ "</s>",
615
+ "Q:",
616
+ "\n\n"
617
+ ],
618
+ "output_regex": null,
619
+ "frozen": false,
620
+ "suite": [
621
+ "harness"
622
+ ],
623
+ "original_num_docs": 250,
624
+ "effective_num_docs": 250,
625
+ "trust_dataset": true,
626
+ "must_remove_duplicate_docs": null
627
+ },
628
+ "harness|bbh:ruin_names": {
629
+ "name": "bbh:ruin_names",
630
+ "prompt_function": "bbh_ruin_names",
631
+ "hf_repo": "lukaemon/bbh",
632
+ "hf_subset": "ruin_names",
633
+ "metric": [
634
+ "exact_match",
635
+ "quasi_exact_match",
636
+ "prefix_exact_match",
637
+ "prefix_quasi_exact_match",
638
+ "perfect_exact_match"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "test"
642
+ ],
643
+ "evaluation_splits": [
644
+ "test"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": 20,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "harness"
658
+ ],
659
+ "original_num_docs": 248,
660
+ "effective_num_docs": 248,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "harness|bbh:salient_translation_error_detection": {
665
+ "name": "bbh:salient_translation_error_detection",
666
+ "prompt_function": "bbh_salient_translation_error_detection",
667
+ "hf_repo": "lukaemon/bbh",
668
+ "hf_subset": "salient_translation_error_detection",
669
+ "metric": [
670
+ "exact_match",
671
+ "quasi_exact_match",
672
+ "prefix_exact_match",
673
+ "prefix_quasi_exact_match",
674
+ "perfect_exact_match"
675
+ ],
676
+ "hf_avail_splits": [
677
+ "test"
678
+ ],
679
+ "evaluation_splits": [
680
+ "test"
681
+ ],
682
+ "few_shots_split": null,
683
+ "few_shots_select": null,
684
+ "generation_size": 20,
685
+ "stop_sequence": [
686
+ "</s>",
687
+ "Q:",
688
+ "\n\n"
689
+ ],
690
+ "output_regex": null,
691
+ "frozen": false,
692
+ "suite": [
693
+ "harness"
694
+ ],
695
+ "original_num_docs": 250,
696
+ "effective_num_docs": 250,
697
+ "trust_dataset": true,
698
+ "must_remove_duplicate_docs": null
699
+ },
700
+ "harness|bbh:snarks": {
701
+ "name": "bbh:snarks",
702
+ "prompt_function": "bbh_snarks",
703
+ "hf_repo": "lukaemon/bbh",
704
+ "hf_subset": "snarks",
705
+ "metric": [
706
+ "exact_match",
707
+ "quasi_exact_match",
708
+ "prefix_exact_match",
709
+ "prefix_quasi_exact_match",
710
+ "perfect_exact_match"
711
+ ],
712
+ "hf_avail_splits": [
713
+ "test"
714
+ ],
715
+ "evaluation_splits": [
716
+ "test"
717
+ ],
718
+ "few_shots_split": null,
719
+ "few_shots_select": null,
720
+ "generation_size": 20,
721
+ "stop_sequence": [
722
+ "</s>",
723
+ "Q:",
724
+ "\n\n"
725
+ ],
726
+ "output_regex": null,
727
+ "frozen": false,
728
+ "suite": [
729
+ "harness"
730
+ ],
731
+ "original_num_docs": 178,
732
+ "effective_num_docs": 178,
733
+ "trust_dataset": true,
734
+ "must_remove_duplicate_docs": null
735
+ },
736
+ "harness|bbh:sports_understanding": {
737
+ "name": "bbh:sports_understanding",
738
+ "prompt_function": "bbh_sports_understanding",
739
+ "hf_repo": "lukaemon/bbh",
740
+ "hf_subset": "sports_understanding",
741
+ "metric": [
742
+ "exact_match",
743
+ "quasi_exact_match",
744
+ "prefix_exact_match",
745
+ "prefix_quasi_exact_match",
746
+ "perfect_exact_match"
747
+ ],
748
+ "hf_avail_splits": [
749
+ "test"
750
+ ],
751
+ "evaluation_splits": [
752
+ "test"
753
+ ],
754
+ "few_shots_split": null,
755
+ "few_shots_select": null,
756
+ "generation_size": 20,
757
+ "stop_sequence": [
758
+ "</s>",
759
+ "Q:",
760
+ "\n\n"
761
+ ],
762
+ "output_regex": null,
763
+ "frozen": false,
764
+ "suite": [
765
+ "harness"
766
+ ],
767
+ "original_num_docs": 250,
768
+ "effective_num_docs": 250,
769
+ "trust_dataset": true,
770
+ "must_remove_duplicate_docs": null
771
+ },
772
+ "harness|bbh:temporal_sequences": {
773
+ "name": "bbh:temporal_sequences",
774
+ "prompt_function": "bbh_temporal_sequences",
775
+ "hf_repo": "lukaemon/bbh",
776
+ "hf_subset": "temporal_sequences",
777
+ "metric": [
778
+ "exact_match",
779
+ "quasi_exact_match",
780
+ "prefix_exact_match",
781
+ "prefix_quasi_exact_match",
782
+ "perfect_exact_match"
783
+ ],
784
+ "hf_avail_splits": [
785
+ "test"
786
+ ],
787
+ "evaluation_splits": [
788
+ "test"
789
+ ],
790
+ "few_shots_split": null,
791
+ "few_shots_select": null,
792
+ "generation_size": 20,
793
+ "stop_sequence": [
794
+ "</s>",
795
+ "Q:",
796
+ "\n\n"
797
+ ],
798
+ "output_regex": null,
799
+ "frozen": false,
800
+ "suite": [
801
+ "harness"
802
+ ],
803
+ "original_num_docs": 250,
804
+ "effective_num_docs": 250,
805
+ "trust_dataset": true,
806
+ "must_remove_duplicate_docs": null
807
+ },
808
+ "harness|bbh:tracking_shuffled_objects_five_objects": {
809
+ "name": "bbh:tracking_shuffled_objects_five_objects",
810
+ "prompt_function": "bbh_tracking_shuffled_objects_five_objects",
811
+ "hf_repo": "lukaemon/bbh",
812
+ "hf_subset": "tracking_shuffled_objects_five_objects",
813
+ "metric": [
814
+ "exact_match",
815
+ "quasi_exact_match",
816
+ "prefix_exact_match",
817
+ "prefix_quasi_exact_match",
818
+ "perfect_exact_match"
819
+ ],
820
+ "hf_avail_splits": [
821
+ "test"
822
+ ],
823
+ "evaluation_splits": [
824
+ "test"
825
+ ],
826
+ "few_shots_split": null,
827
+ "few_shots_select": null,
828
+ "generation_size": 20,
829
+ "stop_sequence": [
830
+ "</s>",
831
+ "Q:",
832
+ "\n\n"
833
+ ],
834
+ "output_regex": null,
835
+ "frozen": false,
836
+ "suite": [
837
+ "harness"
838
+ ],
839
+ "original_num_docs": 250,
840
+ "effective_num_docs": 250,
841
+ "trust_dataset": true,
842
+ "must_remove_duplicate_docs": null
843
+ },
844
+ "harness|bbh:tracking_shuffled_objects_seven_objects": {
845
+ "name": "bbh:tracking_shuffled_objects_seven_objects",
846
+ "prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
847
+ "hf_repo": "lukaemon/bbh",
848
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
849
+ "metric": [
850
+ "exact_match",
851
+ "quasi_exact_match",
852
+ "prefix_exact_match",
853
+ "prefix_quasi_exact_match",
854
+ "perfect_exact_match"
855
+ ],
856
+ "hf_avail_splits": [
857
+ "test"
858
+ ],
859
+ "evaluation_splits": [
860
+ "test"
861
+ ],
862
+ "few_shots_split": null,
863
+ "few_shots_select": null,
864
+ "generation_size": 20,
865
+ "stop_sequence": [
866
+ "</s>",
867
+ "Q:",
868
+ "\n\n"
869
+ ],
870
+ "output_regex": null,
871
+ "frozen": false,
872
+ "suite": [
873
+ "harness"
874
+ ],
875
+ "original_num_docs": 250,
876
+ "effective_num_docs": 250,
877
+ "trust_dataset": true,
878
+ "must_remove_duplicate_docs": null
879
+ },
880
+ "harness|bbh:tracking_shuffled_objects_three_objects": {
881
+ "name": "bbh:tracking_shuffled_objects_three_objects",
882
+ "prompt_function": "bbh_tracking_shuffled_objects_three_objects",
883
+ "hf_repo": "lukaemon/bbh",
884
+ "hf_subset": "tracking_shuffled_objects_three_objects",
885
+ "metric": [
886
+ "exact_match",
887
+ "quasi_exact_match",
888
+ "prefix_exact_match",
889
+ "prefix_quasi_exact_match",
890
+ "perfect_exact_match"
891
+ ],
892
+ "hf_avail_splits": [
893
+ "test"
894
+ ],
895
+ "evaluation_splits": [
896
+ "test"
897
+ ],
898
+ "few_shots_split": null,
899
+ "few_shots_select": null,
900
+ "generation_size": 20,
901
+ "stop_sequence": [
902
+ "</s>",
903
+ "Q:",
904
+ "\n\n"
905
+ ],
906
+ "output_regex": null,
907
+ "frozen": false,
908
+ "suite": [
909
+ "harness"
910
+ ],
911
+ "original_num_docs": 250,
912
+ "effective_num_docs": 250,
913
+ "trust_dataset": true,
914
+ "must_remove_duplicate_docs": null
915
+ }
916
+ },
917
+ "summary_tasks": {
918
+ "harness|bbh:causal_judgment|3": {
919
+ "hashes": {
920
+ "hash_examples": "63218f5ae055ab2b",
921
+ "hash_full_prompts": "7303fa1d0fe0b29a",
922
+ "hash_input_tokens": "79663e73bb5ce6ac",
923
+ "hash_cont_tokens": "2ae104aa8f463042"
924
+ },
925
+ "truncated": 187,
926
+ "non_truncated": 0,
927
+ "padded": 0,
928
+ "non_padded": 187,
929
+ "effective_few_shots": 3.0,
930
+ "num_truncated_few_shots": 0
931
+ },
932
+ "harness|bbh:date_understanding|3": {
933
+ "hashes": {
934
+ "hash_examples": "f145c7a06def3c8e",
935
+ "hash_full_prompts": "69e60d10afa5a6f1",
936
+ "hash_input_tokens": "e9bd5760c58a1104",
937
+ "hash_cont_tokens": "d2ac72662c379649"
938
+ },
939
+ "truncated": 250,
940
+ "non_truncated": 0,
941
+ "padded": 0,
942
+ "non_padded": 250,
943
+ "effective_few_shots": 3.0,
944
+ "num_truncated_few_shots": 0
945
+ },
946
+ "harness|bbh:disambiguation_qa|3": {
947
+ "hashes": {
948
+ "hash_examples": "19677fd1773f7eb9",
949
+ "hash_full_prompts": "ae0a8fd428f9aee3",
950
+ "hash_input_tokens": "b3625dcc25d708b2",
951
+ "hash_cont_tokens": "af88004d58a4a6e9"
952
+ },
953
+ "truncated": 250,
954
+ "non_truncated": 0,
955
+ "padded": 0,
956
+ "non_padded": 250,
957
+ "effective_few_shots": 3.0,
958
+ "num_truncated_few_shots": 0
959
+ },
960
+ "harness|bbh:geometric_shapes|3": {
961
+ "hashes": {
962
+ "hash_examples": "76c7b11a13cc72a9",
963
+ "hash_full_prompts": "76633257f67207f9",
964
+ "hash_input_tokens": "c16e8768d8c9056f",
965
+ "hash_cont_tokens": "5d48a92cb57803c0"
966
+ },
967
+ "truncated": 250,
968
+ "non_truncated": 0,
969
+ "padded": 0,
970
+ "non_padded": 250,
971
+ "effective_few_shots": 3.0,
972
+ "num_truncated_few_shots": 0
973
+ },
974
+ "harness|bbh:logical_deduction_five_objects|3": {
975
+ "hashes": {
976
+ "hash_examples": "0e958c856332a745",
977
+ "hash_full_prompts": "3c96645848786efd",
978
+ "hash_input_tokens": "915443ee37f164dc",
979
+ "hash_cont_tokens": "b04e092b2e3dc0a2"
980
+ },
981
+ "truncated": 250,
982
+ "non_truncated": 0,
983
+ "padded": 0,
984
+ "non_padded": 250,
985
+ "effective_few_shots": 3.0,
986
+ "num_truncated_few_shots": 0
987
+ },
988
+ "harness|bbh:logical_deduction_seven_objects|3": {
989
+ "hashes": {
990
+ "hash_examples": "ab9de25a5eb40d09",
991
+ "hash_full_prompts": "185c5851c101ee66",
992
+ "hash_input_tokens": "66d532c31ef57236",
993
+ "hash_cont_tokens": "a3aa21e4f1c88a0f"
994
+ },
995
+ "truncated": 250,
996
+ "non_truncated": 0,
997
+ "padded": 0,
998
+ "non_padded": 250,
999
+ "effective_few_shots": 3.0,
1000
+ "num_truncated_few_shots": 0
1001
+ },
1002
+ "harness|bbh:logical_deduction_three_objects|3": {
1003
+ "hashes": {
1004
+ "hash_examples": "3c6bf52517714218",
1005
+ "hash_full_prompts": "8ba2d94357e589d0",
1006
+ "hash_input_tokens": "d51c6ad06efbf88b",
1007
+ "hash_cont_tokens": "6ef2fb2696cb3189"
1008
+ },
1009
+ "truncated": 250,
1010
+ "non_truncated": 0,
1011
+ "padded": 0,
1012
+ "non_padded": 250,
1013
+ "effective_few_shots": 3.0,
1014
+ "num_truncated_few_shots": 0
1015
+ },
1016
+ "harness|bbh:movie_recommendation|3": {
1017
+ "hashes": {
1018
+ "hash_examples": "2d9dc4975935d31a",
1019
+ "hash_full_prompts": "a411e216d0f5f626",
1020
+ "hash_input_tokens": "e17a3080d43ae54f",
1021
+ "hash_cont_tokens": "915fcaefe88e41d0"
1022
+ },
1023
+ "truncated": 249,
1024
+ "non_truncated": 0,
1025
+ "padded": 0,
1026
+ "non_padded": 249,
1027
+ "effective_few_shots": 3.0,
1028
+ "num_truncated_few_shots": 0
1029
+ },
1030
+ "harness|bbh:navigate|3": {
1031
+ "hashes": {
1032
+ "hash_examples": "ba91dcdb9a064255",
1033
+ "hash_full_prompts": "ebb3084ecc78a46a",
1034
+ "hash_input_tokens": "90854b0ca565c8f5",
1035
+ "hash_cont_tokens": "1552b672978b1896"
1036
+ },
1037
+ "truncated": 250,
1038
+ "non_truncated": 0,
1039
+ "padded": 0,
1040
+ "non_padded": 250,
1041
+ "effective_few_shots": 3.0,
1042
+ "num_truncated_few_shots": 0
1043
+ },
1044
+ "harness|bbh:reasoning_about_colored_objects|3": {
1045
+ "hashes": {
1046
+ "hash_examples": "a6ba328c4c3385d2",
1047
+ "hash_full_prompts": "38328d016a4ebef3",
1048
+ "hash_input_tokens": "b45b5a8a531e8bf5",
1049
+ "hash_cont_tokens": "00c57b64676b3af7"
1050
+ },
1051
+ "truncated": 250,
1052
+ "non_truncated": 0,
1053
+ "padded": 0,
1054
+ "non_padded": 250,
1055
+ "effective_few_shots": 3.0,
1056
+ "num_truncated_few_shots": 0
1057
+ },
1058
+ "harness|bbh:ruin_names|3": {
1059
+ "hashes": {
1060
+ "hash_examples": "2ef28d5f2d4fdd25",
1061
+ "hash_full_prompts": "9c7d0493c37182d6",
1062
+ "hash_input_tokens": "627b6058879c9350",
1063
+ "hash_cont_tokens": "d77427847a7d37ee"
1064
+ },
1065
+ "truncated": 248,
1066
+ "non_truncated": 0,
1067
+ "padded": 0,
1068
+ "non_padded": 248,
1069
+ "effective_few_shots": 3.0,
1070
+ "num_truncated_few_shots": 0
1071
+ },
1072
+ "harness|bbh:salient_translation_error_detection|3": {
1073
+ "hashes": {
1074
+ "hash_examples": "c13f25ec8ffed496",
1075
+ "hash_full_prompts": "edccd4061b168b78",
1076
+ "hash_input_tokens": "7d4d7e481ad8766b",
1077
+ "hash_cont_tokens": "ae056dc903a003cd"
1078
+ },
1079
+ "truncated": 250,
1080
+ "non_truncated": 0,
1081
+ "padded": 0,
1082
+ "non_padded": 250,
1083
+ "effective_few_shots": 3.0,
1084
+ "num_truncated_few_shots": 0
1085
+ },
1086
+ "harness|bbh:snarks|3": {
1087
+ "hashes": {
1088
+ "hash_examples": "5f6db7bff7f6f22e",
1089
+ "hash_full_prompts": "31cafd95ab850a44",
1090
+ "hash_input_tokens": "616900bacd0ba7ca",
1091
+ "hash_cont_tokens": "0d6053ff9c4cd010"
1092
+ },
1093
+ "truncated": 178,
1094
+ "non_truncated": 0,
1095
+ "padded": 0,
1096
+ "non_padded": 178,
1097
+ "effective_few_shots": 3.0,
1098
+ "num_truncated_few_shots": 0
1099
+ },
1100
+ "harness|bbh:sports_understanding|3": {
1101
+ "hashes": {
1102
+ "hash_examples": "042afbe5d9c1f02d",
1103
+ "hash_full_prompts": "3d46581e9bbec2d0",
1104
+ "hash_input_tokens": "8e9e99c22dd3a8d2",
1105
+ "hash_cont_tokens": "59a36612e096323c"
1106
+ },
1107
+ "truncated": 250,
1108
+ "non_truncated": 0,
1109
+ "padded": 0,
1110
+ "non_padded": 250,
1111
+ "effective_few_shots": 3.0,
1112
+ "num_truncated_few_shots": 0
1113
+ },
1114
+ "harness|bbh:temporal_sequences|3": {
1115
+ "hashes": {
1116
+ "hash_examples": "803a05f352eb6afc",
1117
+ "hash_full_prompts": "4a54db144a5dd222",
1118
+ "hash_input_tokens": "24789970b2290dd3",
1119
+ "hash_cont_tokens": "966eba878f6e19f6"
1120
+ },
1121
+ "truncated": 250,
1122
+ "non_truncated": 0,
1123
+ "padded": 0,
1124
+ "non_padded": 250,
1125
+ "effective_few_shots": 3.0,
1126
+ "num_truncated_few_shots": 0
1127
+ },
1128
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
1129
+ "hashes": {
1130
+ "hash_examples": "2bbac6db7ab0d527",
1131
+ "hash_full_prompts": "e3079106787cc311",
1132
+ "hash_input_tokens": "9036045cff895b08",
1133
+ "hash_cont_tokens": "f1d19d0b07292987"
1134
+ },
1135
+ "truncated": 250,
1136
+ "non_truncated": 0,
1137
+ "padded": 0,
1138
+ "non_padded": 250,
1139
+ "effective_few_shots": 3.0,
1140
+ "num_truncated_few_shots": 0
1141
+ },
1142
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
1143
+ "hashes": {
1144
+ "hash_examples": "845caf093ac2b58c",
1145
+ "hash_full_prompts": "6364e5b860590ec8",
1146
+ "hash_input_tokens": "7100c488aa0764ff",
1147
+ "hash_cont_tokens": "3d4a342951cc3896"
1148
+ },
1149
+ "truncated": 250,
1150
+ "non_truncated": 0,
1151
+ "padded": 0,
1152
+ "non_padded": 250,
1153
+ "effective_few_shots": 3.0,
1154
+ "num_truncated_few_shots": 0
1155
+ },
1156
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
1157
+ "hashes": {
1158
+ "hash_examples": "9004f14d5a32b9a8",
1159
+ "hash_full_prompts": "01aef56c4d1fe9fe",
1160
+ "hash_input_tokens": "b9690a5d32a586fc",
1161
+ "hash_cont_tokens": "4d169d3a16a49180"
1162
+ },
1163
+ "truncated": 250,
1164
+ "non_truncated": 0,
1165
+ "padded": 0,
1166
+ "non_padded": 250,
1167
+ "effective_few_shots": 3.0,
1168
+ "num_truncated_few_shots": 0
1169
+ }
1170
+ },
1171
+ "summary_general": {
1172
+ "hashes": {
1173
+ "hash_examples": "4ff1e3dc5703575d",
1174
+ "hash_full_prompts": "1cbeab0a00117cb8",
1175
+ "hash_input_tokens": "3608679dab4ce40e",
1176
+ "hash_cont_tokens": "dc02f96cd028fd57"
1177
+ },
1178
+ "truncated": 4362,
1179
+ "non_truncated": 0,
1180
+ "padded": 0,
1181
+ "non_padded": 4362,
1182
+ "num_truncated_few_shots": 0
1183
+ }
1184
+ }