lewtun HF staff commited on
Commit
fdb6355
·
verified ·
1 Parent(s): 92d255d

Upload eval_results/HuggingFaceH4/zephyr-7b-beta-ift/v1.0/bbh/results_2024-03-19T09-26-07.598400.json with huggingface_hub

Browse files
eval_results/HuggingFaceH4/zephyr-7b-beta-ift/v1.0/bbh/results_2024-03-19T09-26-07.598400.json ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 1097191.0494034,
9
+ "end_time": 1097416.541857378,
10
+ "total_evaluation_time_secondes": "225.49245397816412",
11
+ "model_name": "HuggingFaceH4/zephyr-7b-beta-ift",
12
+ "model_sha": "e62f1cbe19b0b3c5c32f6cc85b7345d63852aad3",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "harness|bbh:causal_judgment|3": {
19
+ "em": 0.5401069518716578,
20
+ "em_stderr": 0.036543642520475754,
21
+ "qem": 0.5401069518716578,
22
+ "qem_stderr": 0.036543642520475754,
23
+ "pem": 0.5508021390374331,
24
+ "pem_stderr": 0.0364720501817238,
25
+ "pqem": 0.5508021390374331,
26
+ "pqem_stderr": 0.0364720501817238,
27
+ "perfect_em": 0.5401069518716578,
28
+ "perfect_em_stderr": 0.036543642520475754
29
+ },
30
+ "harness|bbh:date_understanding|3": {
31
+ "em": 0.48,
32
+ "em_stderr": 0.031660853408495185,
33
+ "qem": 0.48,
34
+ "qem_stderr": 0.031660853408495185,
35
+ "pem": 0.48,
36
+ "pem_stderr": 0.031660853408495185,
37
+ "pqem": 0.56,
38
+ "pqem_stderr": 0.03145724452223564,
39
+ "perfect_em": 0.48,
40
+ "perfect_em_stderr": 0.031660853408495185
41
+ },
42
+ "harness|bbh:disambiguation_qa|3": {
43
+ "em": 0.632,
44
+ "em_stderr": 0.030562070620993163,
45
+ "qem": 0.632,
46
+ "qem_stderr": 0.030562070620993163,
47
+ "pem": 0.632,
48
+ "pem_stderr": 0.030562070620993163,
49
+ "pqem": 0.688,
50
+ "pqem_stderr": 0.029361067575219817,
51
+ "perfect_em": 0.632,
52
+ "perfect_em_stderr": 0.030562070620993163
53
+ },
54
+ "harness|bbh:geometric_shapes|3": {
55
+ "em": 0.272,
56
+ "em_stderr": 0.028200088296310006,
57
+ "qem": 0.272,
58
+ "qem_stderr": 0.028200088296310006,
59
+ "pem": 0.272,
60
+ "pem_stderr": 0.028200088296310006,
61
+ "pqem": 0.272,
62
+ "pqem_stderr": 0.028200088296310006,
63
+ "perfect_em": 0.272,
64
+ "perfect_em_stderr": 0.028200088296310006
65
+ },
66
+ "harness|bbh:logical_deduction_five_objects|3": {
67
+ "em": 0.352,
68
+ "em_stderr": 0.03026628805735994,
69
+ "qem": 0.352,
70
+ "qem_stderr": 0.03026628805735994,
71
+ "pem": 0.352,
72
+ "pem_stderr": 0.03026628805735994,
73
+ "pqem": 0.42,
74
+ "pqem_stderr": 0.03127799950463661,
75
+ "perfect_em": 0.352,
76
+ "perfect_em_stderr": 0.03026628805735994
77
+ },
78
+ "harness|bbh:logical_deduction_seven_objects|3": {
79
+ "em": 0.292,
80
+ "em_stderr": 0.028814320402205645,
81
+ "qem": 0.292,
82
+ "qem_stderr": 0.028814320402205645,
83
+ "pem": 0.292,
84
+ "pem_stderr": 0.028814320402205645,
85
+ "pqem": 0.396,
86
+ "pqem_stderr": 0.030993197854577853,
87
+ "perfect_em": 0.292,
88
+ "perfect_em_stderr": 0.028814320402205645
89
+ },
90
+ "harness|bbh:logical_deduction_three_objects|3": {
91
+ "em": 0.532,
92
+ "em_stderr": 0.031621252575725504,
93
+ "qem": 0.532,
94
+ "qem_stderr": 0.031621252575725504,
95
+ "pem": 0.532,
96
+ "pem_stderr": 0.031621252575725504,
97
+ "pqem": 0.696,
98
+ "pqem_stderr": 0.029150213374159673,
99
+ "perfect_em": 0.532,
100
+ "perfect_em_stderr": 0.031621252575725504
101
+ },
102
+ "harness|bbh:movie_recommendation|3": {
103
+ "em": 0.6746987951807228,
104
+ "em_stderr": 0.029748971991075932,
105
+ "qem": 0.6746987951807228,
106
+ "qem_stderr": 0.029748971991075932,
107
+ "pem": 0.678714859437751,
108
+ "pem_stderr": 0.029652625884384973,
109
+ "pqem": 0.7429718875502008,
110
+ "pqem_stderr": 0.027749212562228797,
111
+ "perfect_em": 0.6746987951807228,
112
+ "perfect_em_stderr": 0.029748971991075932
113
+ },
114
+ "harness|bbh:navigate|3": {
115
+ "em": 0.516,
116
+ "em_stderr": 0.03166998503010742,
117
+ "qem": 0.516,
118
+ "qem_stderr": 0.03166998503010742,
119
+ "pem": 0.516,
120
+ "pem_stderr": 0.03166998503010742,
121
+ "pqem": 0.516,
122
+ "pqem_stderr": 0.03166998503010742,
123
+ "perfect_em": 0.516,
124
+ "perfect_em_stderr": 0.03166998503010742
125
+ },
126
+ "harness|bbh:reasoning_about_colored_objects|3": {
127
+ "em": 0.352,
128
+ "em_stderr": 0.03026628805735994,
129
+ "qem": 0.352,
130
+ "qem_stderr": 0.03026628805735994,
131
+ "pem": 0.368,
132
+ "pem_stderr": 0.030562070620993163,
133
+ "pqem": 0.512,
134
+ "pqem_stderr": 0.03167708558254708,
135
+ "perfect_em": 0.352,
136
+ "perfect_em_stderr": 0.03026628805735994
137
+ },
138
+ "harness|bbh:ruin_names|3": {
139
+ "em": 0.38306451612903225,
140
+ "em_stderr": 0.03093195435222337,
141
+ "qem": 0.38306451612903225,
142
+ "qem_stderr": 0.03093195435222337,
143
+ "pem": 0.38306451612903225,
144
+ "pem_stderr": 0.03093195435222337,
145
+ "pqem": 0.46774193548387094,
146
+ "pqem_stderr": 0.03174795841398569,
147
+ "perfect_em": 0.38306451612903225,
148
+ "perfect_em_stderr": 0.03093195435222337
149
+ },
150
+ "harness|bbh:salient_translation_error_detection|3": {
151
+ "em": 0.324,
152
+ "em_stderr": 0.02965829492454557,
153
+ "qem": 0.324,
154
+ "qem_stderr": 0.02965829492454557,
155
+ "pem": 0.324,
156
+ "pem_stderr": 0.02965829492454557,
157
+ "pqem": 0.452,
158
+ "pqem_stderr": 0.03153986449255662,
159
+ "perfect_em": 0.324,
160
+ "perfect_em_stderr": 0.02965829492454557
161
+ },
162
+ "harness|bbh:snarks|3": {
163
+ "em": 0.4943820224719101,
164
+ "em_stderr": 0.03757992900475981,
165
+ "qem": 0.4943820224719101,
166
+ "qem_stderr": 0.03757992900475981,
167
+ "pem": 0.4943820224719101,
168
+ "pem_stderr": 0.03757992900475981,
169
+ "pqem": 0.5786516853932584,
170
+ "pqem_stderr": 0.03711441405960187,
171
+ "perfect_em": 0.4943820224719101,
172
+ "perfect_em_stderr": 0.03757992900475981
173
+ },
174
+ "harness|bbh:sports_understanding|3": {
175
+ "em": 0.412,
176
+ "em_stderr": 0.0311915960260229,
177
+ "qem": 0.412,
178
+ "qem_stderr": 0.0311915960260229,
179
+ "pem": 0.776,
180
+ "pem_stderr": 0.026421361687347905,
181
+ "pqem": 0.776,
182
+ "pqem_stderr": 0.026421361687347905,
183
+ "perfect_em": 0.412,
184
+ "perfect_em_stderr": 0.0311915960260229
185
+ },
186
+ "harness|bbh:temporal_sequences|3": {
187
+ "em": 0.1,
188
+ "em_stderr": 0.01901172751573437,
189
+ "qem": 0.1,
190
+ "qem_stderr": 0.01901172751573437,
191
+ "pem": 0.1,
192
+ "pem_stderr": 0.01901172751573437,
193
+ "pqem": 0.36,
194
+ "pqem_stderr": 0.03041876402517497,
195
+ "perfect_em": 0.1,
196
+ "perfect_em_stderr": 0.01901172751573437
197
+ },
198
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
199
+ "em": 0.184,
200
+ "em_stderr": 0.02455581299422256,
201
+ "qem": 0.184,
202
+ "qem_stderr": 0.02455581299422256,
203
+ "pem": 0.184,
204
+ "pem_stderr": 0.02455581299422256,
205
+ "pqem": 0.376,
206
+ "pqem_stderr": 0.030696336267394583,
207
+ "perfect_em": 0.184,
208
+ "perfect_em_stderr": 0.02455581299422256
209
+ },
210
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
211
+ "em": 0.108,
212
+ "em_stderr": 0.01966955938156875,
213
+ "qem": 0.108,
214
+ "qem_stderr": 0.01966955938156875,
215
+ "pem": 0.108,
216
+ "pem_stderr": 0.01966955938156875,
217
+ "pqem": 0.236,
218
+ "pqem_stderr": 0.026909337594953852,
219
+ "perfect_em": 0.108,
220
+ "perfect_em_stderr": 0.01966955938156875
221
+ },
222
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
223
+ "em": 0.356,
224
+ "em_stderr": 0.030343680657153222,
225
+ "qem": 0.356,
226
+ "qem_stderr": 0.030343680657153222,
227
+ "pem": 0.356,
228
+ "pem_stderr": 0.030343680657153222,
229
+ "pqem": 0.664,
230
+ "pqem_stderr": 0.029933259094191523,
231
+ "perfect_em": 0.356,
232
+ "perfect_em_stderr": 0.030343680657153222
233
+ },
234
+ "harness|bbh:_average|3": {
235
+ "em": 0.38912512698074014,
236
+ "em_stderr": 0.029572017545352165,
237
+ "qem": 0.38912512698074014,
238
+ "qem_stderr": 0.029572017545352165,
239
+ "pem": 0.41105352983756255,
240
+ "pem_stderr": 0.029314106977547463,
241
+ "pqem": 0.5146759804147092,
242
+ "pqem_stderr": 0.03071052445105299,
243
+ "perfect_em": 0.38912512698074014,
244
+ "perfect_em_stderr": 0.029572017545352165
245
+ }
246
+ },
247
+ "versions": {
248
+ "harness|bbh:causal_judgment|3": 0,
249
+ "harness|bbh:date_understanding|3": 0,
250
+ "harness|bbh:disambiguation_qa|3": 0,
251
+ "harness|bbh:geometric_shapes|3": 0,
252
+ "harness|bbh:logical_deduction_five_objects|3": 0,
253
+ "harness|bbh:logical_deduction_seven_objects|3": 0,
254
+ "harness|bbh:logical_deduction_three_objects|3": 0,
255
+ "harness|bbh:movie_recommendation|3": 0,
256
+ "harness|bbh:navigate|3": 0,
257
+ "harness|bbh:reasoning_about_colored_objects|3": 0,
258
+ "harness|bbh:ruin_names|3": 0,
259
+ "harness|bbh:salient_translation_error_detection|3": 0,
260
+ "harness|bbh:snarks|3": 0,
261
+ "harness|bbh:sports_understanding|3": 0,
262
+ "harness|bbh:temporal_sequences|3": 0,
263
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
264
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
265
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": 0
266
+ },
267
+ "config_tasks": {
268
+ "harness|bbh:causal_judgment": {
269
+ "name": "bbh:causal_judgment",
270
+ "prompt_function": "bbh_causal_judgment",
271
+ "hf_repo": "lukaemon/bbh",
272
+ "hf_subset": "causal_judgement",
273
+ "metric": [
274
+ "exact_match",
275
+ "quasi_exact_match",
276
+ "prefix_exact_match",
277
+ "prefix_quasi_exact_match",
278
+ "perfect_exact_match"
279
+ ],
280
+ "hf_avail_splits": [
281
+ "test"
282
+ ],
283
+ "evaluation_splits": [
284
+ "test"
285
+ ],
286
+ "few_shots_split": null,
287
+ "few_shots_select": null,
288
+ "generation_size": 20,
289
+ "stop_sequence": [
290
+ "</s>",
291
+ "Q:",
292
+ "\n\n"
293
+ ],
294
+ "output_regex": null,
295
+ "frozen": false,
296
+ "suite": [
297
+ "harness"
298
+ ],
299
+ "original_num_docs": 187,
300
+ "effective_num_docs": 187,
301
+ "trust_dataset": true,
302
+ "must_remove_duplicate_docs": null
303
+ },
304
+ "harness|bbh:date_understanding": {
305
+ "name": "bbh:date_understanding",
306
+ "prompt_function": "bbh_date_understanding",
307
+ "hf_repo": "lukaemon/bbh",
308
+ "hf_subset": "date_understanding",
309
+ "metric": [
310
+ "exact_match",
311
+ "quasi_exact_match",
312
+ "prefix_exact_match",
313
+ "prefix_quasi_exact_match",
314
+ "perfect_exact_match"
315
+ ],
316
+ "hf_avail_splits": [
317
+ "test"
318
+ ],
319
+ "evaluation_splits": [
320
+ "test"
321
+ ],
322
+ "few_shots_split": null,
323
+ "few_shots_select": null,
324
+ "generation_size": 20,
325
+ "stop_sequence": [
326
+ "</s>",
327
+ "Q:",
328
+ "\n\n"
329
+ ],
330
+ "output_regex": null,
331
+ "frozen": false,
332
+ "suite": [
333
+ "harness"
334
+ ],
335
+ "original_num_docs": 250,
336
+ "effective_num_docs": 250,
337
+ "trust_dataset": true,
338
+ "must_remove_duplicate_docs": null
339
+ },
340
+ "harness|bbh:disambiguation_qa": {
341
+ "name": "bbh:disambiguation_qa",
342
+ "prompt_function": "bbh_disambiguation_qa",
343
+ "hf_repo": "lukaemon/bbh",
344
+ "hf_subset": "disambiguation_qa",
345
+ "metric": [
346
+ "exact_match",
347
+ "quasi_exact_match",
348
+ "prefix_exact_match",
349
+ "prefix_quasi_exact_match",
350
+ "perfect_exact_match"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "test"
354
+ ],
355
+ "evaluation_splits": [
356
+ "test"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": 20,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "harness"
370
+ ],
371
+ "original_num_docs": 250,
372
+ "effective_num_docs": 250,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "harness|bbh:geometric_shapes": {
377
+ "name": "bbh:geometric_shapes",
378
+ "prompt_function": "bbh_geometric_shapes",
379
+ "hf_repo": "lukaemon/bbh",
380
+ "hf_subset": "geometric_shapes",
381
+ "metric": [
382
+ "exact_match",
383
+ "quasi_exact_match",
384
+ "prefix_exact_match",
385
+ "prefix_quasi_exact_match",
386
+ "perfect_exact_match"
387
+ ],
388
+ "hf_avail_splits": [
389
+ "test"
390
+ ],
391
+ "evaluation_splits": [
392
+ "test"
393
+ ],
394
+ "few_shots_split": null,
395
+ "few_shots_select": null,
396
+ "generation_size": 20,
397
+ "stop_sequence": [
398
+ "</s>",
399
+ "Q:",
400
+ "\n\n"
401
+ ],
402
+ "output_regex": null,
403
+ "frozen": false,
404
+ "suite": [
405
+ "harness"
406
+ ],
407
+ "original_num_docs": 250,
408
+ "effective_num_docs": 250,
409
+ "trust_dataset": true,
410
+ "must_remove_duplicate_docs": null
411
+ },
412
+ "harness|bbh:logical_deduction_five_objects": {
413
+ "name": "bbh:logical_deduction_five_objects",
414
+ "prompt_function": "bbh_logical_deduction_five_objects",
415
+ "hf_repo": "lukaemon/bbh",
416
+ "hf_subset": "logical_deduction_five_objects",
417
+ "metric": [
418
+ "exact_match",
419
+ "quasi_exact_match",
420
+ "prefix_exact_match",
421
+ "prefix_quasi_exact_match",
422
+ "perfect_exact_match"
423
+ ],
424
+ "hf_avail_splits": [
425
+ "test"
426
+ ],
427
+ "evaluation_splits": [
428
+ "test"
429
+ ],
430
+ "few_shots_split": null,
431
+ "few_shots_select": null,
432
+ "generation_size": 20,
433
+ "stop_sequence": [
434
+ "</s>",
435
+ "Q:",
436
+ "\n\n"
437
+ ],
438
+ "output_regex": null,
439
+ "frozen": false,
440
+ "suite": [
441
+ "harness"
442
+ ],
443
+ "original_num_docs": 250,
444
+ "effective_num_docs": 250,
445
+ "trust_dataset": true,
446
+ "must_remove_duplicate_docs": null
447
+ },
448
+ "harness|bbh:logical_deduction_seven_objects": {
449
+ "name": "bbh:logical_deduction_seven_objects",
450
+ "prompt_function": "bbh_logical_deduction_seven_objects",
451
+ "hf_repo": "lukaemon/bbh",
452
+ "hf_subset": "logical_deduction_seven_objects",
453
+ "metric": [
454
+ "exact_match",
455
+ "quasi_exact_match",
456
+ "prefix_exact_match",
457
+ "prefix_quasi_exact_match",
458
+ "perfect_exact_match"
459
+ ],
460
+ "hf_avail_splits": [
461
+ "test"
462
+ ],
463
+ "evaluation_splits": [
464
+ "test"
465
+ ],
466
+ "few_shots_split": null,
467
+ "few_shots_select": null,
468
+ "generation_size": 20,
469
+ "stop_sequence": [
470
+ "</s>",
471
+ "Q:",
472
+ "\n\n"
473
+ ],
474
+ "output_regex": null,
475
+ "frozen": false,
476
+ "suite": [
477
+ "harness"
478
+ ],
479
+ "original_num_docs": 250,
480
+ "effective_num_docs": 250,
481
+ "trust_dataset": true,
482
+ "must_remove_duplicate_docs": null
483
+ },
484
+ "harness|bbh:logical_deduction_three_objects": {
485
+ "name": "bbh:logical_deduction_three_objects",
486
+ "prompt_function": "bbh_logical_deduction_three_objects",
487
+ "hf_repo": "lukaemon/bbh",
488
+ "hf_subset": "logical_deduction_three_objects",
489
+ "metric": [
490
+ "exact_match",
491
+ "quasi_exact_match",
492
+ "prefix_exact_match",
493
+ "prefix_quasi_exact_match",
494
+ "perfect_exact_match"
495
+ ],
496
+ "hf_avail_splits": [
497
+ "test"
498
+ ],
499
+ "evaluation_splits": [
500
+ "test"
501
+ ],
502
+ "few_shots_split": null,
503
+ "few_shots_select": null,
504
+ "generation_size": 20,
505
+ "stop_sequence": [
506
+ "</s>",
507
+ "Q:",
508
+ "\n\n"
509
+ ],
510
+ "output_regex": null,
511
+ "frozen": false,
512
+ "suite": [
513
+ "harness"
514
+ ],
515
+ "original_num_docs": 250,
516
+ "effective_num_docs": 250,
517
+ "trust_dataset": true,
518
+ "must_remove_duplicate_docs": null
519
+ },
520
+ "harness|bbh:movie_recommendation": {
521
+ "name": "bbh:movie_recommendation",
522
+ "prompt_function": "bbh_movie_recommendation",
523
+ "hf_repo": "lukaemon/bbh",
524
+ "hf_subset": "movie_recommendation",
525
+ "metric": [
526
+ "exact_match",
527
+ "quasi_exact_match",
528
+ "prefix_exact_match",
529
+ "prefix_quasi_exact_match",
530
+ "perfect_exact_match"
531
+ ],
532
+ "hf_avail_splits": [
533
+ "test"
534
+ ],
535
+ "evaluation_splits": [
536
+ "test"
537
+ ],
538
+ "few_shots_split": null,
539
+ "few_shots_select": null,
540
+ "generation_size": 20,
541
+ "stop_sequence": [
542
+ "</s>",
543
+ "Q:",
544
+ "\n\n"
545
+ ],
546
+ "output_regex": null,
547
+ "frozen": false,
548
+ "suite": [
549
+ "harness"
550
+ ],
551
+ "original_num_docs": 249,
552
+ "effective_num_docs": 249,
553
+ "trust_dataset": true,
554
+ "must_remove_duplicate_docs": null
555
+ },
556
+ "harness|bbh:navigate": {
557
+ "name": "bbh:navigate",
558
+ "prompt_function": "bbh_navigate",
559
+ "hf_repo": "lukaemon/bbh",
560
+ "hf_subset": "navigate",
561
+ "metric": [
562
+ "exact_match",
563
+ "quasi_exact_match",
564
+ "prefix_exact_match",
565
+ "prefix_quasi_exact_match",
566
+ "perfect_exact_match"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": null,
576
+ "generation_size": 20,
577
+ "stop_sequence": [
578
+ "</s>",
579
+ "Q:",
580
+ "\n\n"
581
+ ],
582
+ "output_regex": null,
583
+ "frozen": false,
584
+ "suite": [
585
+ "harness"
586
+ ],
587
+ "original_num_docs": 250,
588
+ "effective_num_docs": 250,
589
+ "trust_dataset": true,
590
+ "must_remove_duplicate_docs": null
591
+ },
592
+ "harness|bbh:reasoning_about_colored_objects": {
593
+ "name": "bbh:reasoning_about_colored_objects",
594
+ "prompt_function": "bbh_reasoning_about_colored_objects",
595
+ "hf_repo": "lukaemon/bbh",
596
+ "hf_subset": "reasoning_about_colored_objects",
597
+ "metric": [
598
+ "exact_match",
599
+ "quasi_exact_match",
600
+ "prefix_exact_match",
601
+ "prefix_quasi_exact_match",
602
+ "perfect_exact_match"
603
+ ],
604
+ "hf_avail_splits": [
605
+ "test"
606
+ ],
607
+ "evaluation_splits": [
608
+ "test"
609
+ ],
610
+ "few_shots_split": null,
611
+ "few_shots_select": null,
612
+ "generation_size": 20,
613
+ "stop_sequence": [
614
+ "</s>",
615
+ "Q:",
616
+ "\n\n"
617
+ ],
618
+ "output_regex": null,
619
+ "frozen": false,
620
+ "suite": [
621
+ "harness"
622
+ ],
623
+ "original_num_docs": 250,
624
+ "effective_num_docs": 250,
625
+ "trust_dataset": true,
626
+ "must_remove_duplicate_docs": null
627
+ },
628
+ "harness|bbh:ruin_names": {
629
+ "name": "bbh:ruin_names",
630
+ "prompt_function": "bbh_ruin_names",
631
+ "hf_repo": "lukaemon/bbh",
632
+ "hf_subset": "ruin_names",
633
+ "metric": [
634
+ "exact_match",
635
+ "quasi_exact_match",
636
+ "prefix_exact_match",
637
+ "prefix_quasi_exact_match",
638
+ "perfect_exact_match"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "test"
642
+ ],
643
+ "evaluation_splits": [
644
+ "test"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": 20,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "harness"
658
+ ],
659
+ "original_num_docs": 248,
660
+ "effective_num_docs": 248,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "harness|bbh:salient_translation_error_detection": {
665
+ "name": "bbh:salient_translation_error_detection",
666
+ "prompt_function": "bbh_salient_translation_error_detection",
667
+ "hf_repo": "lukaemon/bbh",
668
+ "hf_subset": "salient_translation_error_detection",
669
+ "metric": [
670
+ "exact_match",
671
+ "quasi_exact_match",
672
+ "prefix_exact_match",
673
+ "prefix_quasi_exact_match",
674
+ "perfect_exact_match"
675
+ ],
676
+ "hf_avail_splits": [
677
+ "test"
678
+ ],
679
+ "evaluation_splits": [
680
+ "test"
681
+ ],
682
+ "few_shots_split": null,
683
+ "few_shots_select": null,
684
+ "generation_size": 20,
685
+ "stop_sequence": [
686
+ "</s>",
687
+ "Q:",
688
+ "\n\n"
689
+ ],
690
+ "output_regex": null,
691
+ "frozen": false,
692
+ "suite": [
693
+ "harness"
694
+ ],
695
+ "original_num_docs": 250,
696
+ "effective_num_docs": 250,
697
+ "trust_dataset": true,
698
+ "must_remove_duplicate_docs": null
699
+ },
700
+ "harness|bbh:snarks": {
701
+ "name": "bbh:snarks",
702
+ "prompt_function": "bbh_snarks",
703
+ "hf_repo": "lukaemon/bbh",
704
+ "hf_subset": "snarks",
705
+ "metric": [
706
+ "exact_match",
707
+ "quasi_exact_match",
708
+ "prefix_exact_match",
709
+ "prefix_quasi_exact_match",
710
+ "perfect_exact_match"
711
+ ],
712
+ "hf_avail_splits": [
713
+ "test"
714
+ ],
715
+ "evaluation_splits": [
716
+ "test"
717
+ ],
718
+ "few_shots_split": null,
719
+ "few_shots_select": null,
720
+ "generation_size": 20,
721
+ "stop_sequence": [
722
+ "</s>",
723
+ "Q:",
724
+ "\n\n"
725
+ ],
726
+ "output_regex": null,
727
+ "frozen": false,
728
+ "suite": [
729
+ "harness"
730
+ ],
731
+ "original_num_docs": 178,
732
+ "effective_num_docs": 178,
733
+ "trust_dataset": true,
734
+ "must_remove_duplicate_docs": null
735
+ },
736
+ "harness|bbh:sports_understanding": {
737
+ "name": "bbh:sports_understanding",
738
+ "prompt_function": "bbh_sports_understanding",
739
+ "hf_repo": "lukaemon/bbh",
740
+ "hf_subset": "sports_understanding",
741
+ "metric": [
742
+ "exact_match",
743
+ "quasi_exact_match",
744
+ "prefix_exact_match",
745
+ "prefix_quasi_exact_match",
746
+ "perfect_exact_match"
747
+ ],
748
+ "hf_avail_splits": [
749
+ "test"
750
+ ],
751
+ "evaluation_splits": [
752
+ "test"
753
+ ],
754
+ "few_shots_split": null,
755
+ "few_shots_select": null,
756
+ "generation_size": 20,
757
+ "stop_sequence": [
758
+ "</s>",
759
+ "Q:",
760
+ "\n\n"
761
+ ],
762
+ "output_regex": null,
763
+ "frozen": false,
764
+ "suite": [
765
+ "harness"
766
+ ],
767
+ "original_num_docs": 250,
768
+ "effective_num_docs": 250,
769
+ "trust_dataset": true,
770
+ "must_remove_duplicate_docs": null
771
+ },
772
+ "harness|bbh:temporal_sequences": {
773
+ "name": "bbh:temporal_sequences",
774
+ "prompt_function": "bbh_temporal_sequences",
775
+ "hf_repo": "lukaemon/bbh",
776
+ "hf_subset": "temporal_sequences",
777
+ "metric": [
778
+ "exact_match",
779
+ "quasi_exact_match",
780
+ "prefix_exact_match",
781
+ "prefix_quasi_exact_match",
782
+ "perfect_exact_match"
783
+ ],
784
+ "hf_avail_splits": [
785
+ "test"
786
+ ],
787
+ "evaluation_splits": [
788
+ "test"
789
+ ],
790
+ "few_shots_split": null,
791
+ "few_shots_select": null,
792
+ "generation_size": 20,
793
+ "stop_sequence": [
794
+ "</s>",
795
+ "Q:",
796
+ "\n\n"
797
+ ],
798
+ "output_regex": null,
799
+ "frozen": false,
800
+ "suite": [
801
+ "harness"
802
+ ],
803
+ "original_num_docs": 250,
804
+ "effective_num_docs": 250,
805
+ "trust_dataset": true,
806
+ "must_remove_duplicate_docs": null
807
+ },
808
+ "harness|bbh:tracking_shuffled_objects_five_objects": {
809
+ "name": "bbh:tracking_shuffled_objects_five_objects",
810
+ "prompt_function": "bbh_tracking_shuffled_objects_five_objects",
811
+ "hf_repo": "lukaemon/bbh",
812
+ "hf_subset": "tracking_shuffled_objects_five_objects",
813
+ "metric": [
814
+ "exact_match",
815
+ "quasi_exact_match",
816
+ "prefix_exact_match",
817
+ "prefix_quasi_exact_match",
818
+ "perfect_exact_match"
819
+ ],
820
+ "hf_avail_splits": [
821
+ "test"
822
+ ],
823
+ "evaluation_splits": [
824
+ "test"
825
+ ],
826
+ "few_shots_split": null,
827
+ "few_shots_select": null,
828
+ "generation_size": 20,
829
+ "stop_sequence": [
830
+ "</s>",
831
+ "Q:",
832
+ "\n\n"
833
+ ],
834
+ "output_regex": null,
835
+ "frozen": false,
836
+ "suite": [
837
+ "harness"
838
+ ],
839
+ "original_num_docs": 250,
840
+ "effective_num_docs": 250,
841
+ "trust_dataset": true,
842
+ "must_remove_duplicate_docs": null
843
+ },
844
+ "harness|bbh:tracking_shuffled_objects_seven_objects": {
845
+ "name": "bbh:tracking_shuffled_objects_seven_objects",
846
+ "prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
847
+ "hf_repo": "lukaemon/bbh",
848
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
849
+ "metric": [
850
+ "exact_match",
851
+ "quasi_exact_match",
852
+ "prefix_exact_match",
853
+ "prefix_quasi_exact_match",
854
+ "perfect_exact_match"
855
+ ],
856
+ "hf_avail_splits": [
857
+ "test"
858
+ ],
859
+ "evaluation_splits": [
860
+ "test"
861
+ ],
862
+ "few_shots_split": null,
863
+ "few_shots_select": null,
864
+ "generation_size": 20,
865
+ "stop_sequence": [
866
+ "</s>",
867
+ "Q:",
868
+ "\n\n"
869
+ ],
870
+ "output_regex": null,
871
+ "frozen": false,
872
+ "suite": [
873
+ "harness"
874
+ ],
875
+ "original_num_docs": 250,
876
+ "effective_num_docs": 250,
877
+ "trust_dataset": true,
878
+ "must_remove_duplicate_docs": null
879
+ },
880
+ "harness|bbh:tracking_shuffled_objects_three_objects": {
881
+ "name": "bbh:tracking_shuffled_objects_three_objects",
882
+ "prompt_function": "bbh_tracking_shuffled_objects_three_objects",
883
+ "hf_repo": "lukaemon/bbh",
884
+ "hf_subset": "tracking_shuffled_objects_three_objects",
885
+ "metric": [
886
+ "exact_match",
887
+ "quasi_exact_match",
888
+ "prefix_exact_match",
889
+ "prefix_quasi_exact_match",
890
+ "perfect_exact_match"
891
+ ],
892
+ "hf_avail_splits": [
893
+ "test"
894
+ ],
895
+ "evaluation_splits": [
896
+ "test"
897
+ ],
898
+ "few_shots_split": null,
899
+ "few_shots_select": null,
900
+ "generation_size": 20,
901
+ "stop_sequence": [
902
+ "</s>",
903
+ "Q:",
904
+ "\n\n"
905
+ ],
906
+ "output_regex": null,
907
+ "frozen": false,
908
+ "suite": [
909
+ "harness"
910
+ ],
911
+ "original_num_docs": 250,
912
+ "effective_num_docs": 250,
913
+ "trust_dataset": true,
914
+ "must_remove_duplicate_docs": null
915
+ }
916
+ },
917
+ "summary_tasks": {
918
+ "harness|bbh:causal_judgment|3": {
919
+ "hashes": {
920
+ "hash_examples": "63218f5ae055ab2b",
921
+ "hash_full_prompts": "7303fa1d0fe0b29a",
922
+ "hash_input_tokens": "79663e73bb5ce6ac",
923
+ "hash_cont_tokens": "420a15c0cad30087"
924
+ },
925
+ "truncated": 187,
926
+ "non_truncated": 0,
927
+ "padded": 0,
928
+ "non_padded": 187,
929
+ "effective_few_shots": 3.0,
930
+ "num_truncated_few_shots": 0
931
+ },
932
+ "harness|bbh:date_understanding|3": {
933
+ "hashes": {
934
+ "hash_examples": "f145c7a06def3c8e",
935
+ "hash_full_prompts": "69e60d10afa5a6f1",
936
+ "hash_input_tokens": "e9bd5760c58a1104",
937
+ "hash_cont_tokens": "94b26277f7a2ae47"
938
+ },
939
+ "truncated": 250,
940
+ "non_truncated": 0,
941
+ "padded": 0,
942
+ "non_padded": 250,
943
+ "effective_few_shots": 3.0,
944
+ "num_truncated_few_shots": 0
945
+ },
946
+ "harness|bbh:disambiguation_qa|3": {
947
+ "hashes": {
948
+ "hash_examples": "19677fd1773f7eb9",
949
+ "hash_full_prompts": "ae0a8fd428f9aee3",
950
+ "hash_input_tokens": "b3625dcc25d708b2",
951
+ "hash_cont_tokens": "60387bf1e01e58b3"
952
+ },
953
+ "truncated": 250,
954
+ "non_truncated": 0,
955
+ "padded": 0,
956
+ "non_padded": 250,
957
+ "effective_few_shots": 3.0,
958
+ "num_truncated_few_shots": 0
959
+ },
960
+ "harness|bbh:geometric_shapes|3": {
961
+ "hashes": {
962
+ "hash_examples": "76c7b11a13cc72a9",
963
+ "hash_full_prompts": "76633257f67207f9",
964
+ "hash_input_tokens": "c16e8768d8c9056f",
965
+ "hash_cont_tokens": "ed3c78442ba23c15"
966
+ },
967
+ "truncated": 250,
968
+ "non_truncated": 0,
969
+ "padded": 0,
970
+ "non_padded": 250,
971
+ "effective_few_shots": 3.0,
972
+ "num_truncated_few_shots": 0
973
+ },
974
+ "harness|bbh:logical_deduction_five_objects|3": {
975
+ "hashes": {
976
+ "hash_examples": "0e958c856332a745",
977
+ "hash_full_prompts": "3c96645848786efd",
978
+ "hash_input_tokens": "915443ee37f164dc",
979
+ "hash_cont_tokens": "03c4483827bed168"
980
+ },
981
+ "truncated": 250,
982
+ "non_truncated": 0,
983
+ "padded": 0,
984
+ "non_padded": 250,
985
+ "effective_few_shots": 3.0,
986
+ "num_truncated_few_shots": 0
987
+ },
988
+ "harness|bbh:logical_deduction_seven_objects|3": {
989
+ "hashes": {
990
+ "hash_examples": "ab9de25a5eb40d09",
991
+ "hash_full_prompts": "185c5851c101ee66",
992
+ "hash_input_tokens": "66d532c31ef57236",
993
+ "hash_cont_tokens": "cf115b8fc21ee1a7"
994
+ },
995
+ "truncated": 250,
996
+ "non_truncated": 0,
997
+ "padded": 0,
998
+ "non_padded": 250,
999
+ "effective_few_shots": 3.0,
1000
+ "num_truncated_few_shots": 0
1001
+ },
1002
+ "harness|bbh:logical_deduction_three_objects|3": {
1003
+ "hashes": {
1004
+ "hash_examples": "3c6bf52517714218",
1005
+ "hash_full_prompts": "8ba2d94357e589d0",
1006
+ "hash_input_tokens": "d51c6ad06efbf88b",
1007
+ "hash_cont_tokens": "42a72ecda3007e8a"
1008
+ },
1009
+ "truncated": 250,
1010
+ "non_truncated": 0,
1011
+ "padded": 0,
1012
+ "non_padded": 250,
1013
+ "effective_few_shots": 3.0,
1014
+ "num_truncated_few_shots": 0
1015
+ },
1016
+ "harness|bbh:movie_recommendation|3": {
1017
+ "hashes": {
1018
+ "hash_examples": "2d9dc4975935d31a",
1019
+ "hash_full_prompts": "a411e216d0f5f626",
1020
+ "hash_input_tokens": "e17a3080d43ae54f",
1021
+ "hash_cont_tokens": "2e10ac5dbca97cbe"
1022
+ },
1023
+ "truncated": 249,
1024
+ "non_truncated": 0,
1025
+ "padded": 0,
1026
+ "non_padded": 249,
1027
+ "effective_few_shots": 3.0,
1028
+ "num_truncated_few_shots": 0
1029
+ },
1030
+ "harness|bbh:navigate|3": {
1031
+ "hashes": {
1032
+ "hash_examples": "ba91dcdb9a064255",
1033
+ "hash_full_prompts": "ebb3084ecc78a46a",
1034
+ "hash_input_tokens": "90854b0ca565c8f5",
1035
+ "hash_cont_tokens": "d693194d05b4303d"
1036
+ },
1037
+ "truncated": 250,
1038
+ "non_truncated": 0,
1039
+ "padded": 0,
1040
+ "non_padded": 250,
1041
+ "effective_few_shots": 3.0,
1042
+ "num_truncated_few_shots": 0
1043
+ },
1044
+ "harness|bbh:reasoning_about_colored_objects|3": {
1045
+ "hashes": {
1046
+ "hash_examples": "a6ba328c4c3385d2",
1047
+ "hash_full_prompts": "38328d016a4ebef3",
1048
+ "hash_input_tokens": "b45b5a8a531e8bf5",
1049
+ "hash_cont_tokens": "0cbe08c7a0fab1bb"
1050
+ },
1051
+ "truncated": 250,
1052
+ "non_truncated": 0,
1053
+ "padded": 0,
1054
+ "non_padded": 250,
1055
+ "effective_few_shots": 3.0,
1056
+ "num_truncated_few_shots": 0
1057
+ },
1058
+ "harness|bbh:ruin_names|3": {
1059
+ "hashes": {
1060
+ "hash_examples": "2ef28d5f2d4fdd25",
1061
+ "hash_full_prompts": "9c7d0493c37182d6",
1062
+ "hash_input_tokens": "627b6058879c9350",
1063
+ "hash_cont_tokens": "fcf3761bb1cf561f"
1064
+ },
1065
+ "truncated": 248,
1066
+ "non_truncated": 0,
1067
+ "padded": 0,
1068
+ "non_padded": 248,
1069
+ "effective_few_shots": 3.0,
1070
+ "num_truncated_few_shots": 0
1071
+ },
1072
+ "harness|bbh:salient_translation_error_detection|3": {
1073
+ "hashes": {
1074
+ "hash_examples": "c13f25ec8ffed496",
1075
+ "hash_full_prompts": "edccd4061b168b78",
1076
+ "hash_input_tokens": "7d4d7e481ad8766b",
1077
+ "hash_cont_tokens": "aaf3d8681b8db9a4"
1078
+ },
1079
+ "truncated": 250,
1080
+ "non_truncated": 0,
1081
+ "padded": 0,
1082
+ "non_padded": 250,
1083
+ "effective_few_shots": 3.0,
1084
+ "num_truncated_few_shots": 0
1085
+ },
1086
+ "harness|bbh:snarks|3": {
1087
+ "hashes": {
1088
+ "hash_examples": "5f6db7bff7f6f22e",
1089
+ "hash_full_prompts": "31cafd95ab850a44",
1090
+ "hash_input_tokens": "616900bacd0ba7ca",
1091
+ "hash_cont_tokens": "58d647acc1f43db7"
1092
+ },
1093
+ "truncated": 178,
1094
+ "non_truncated": 0,
1095
+ "padded": 0,
1096
+ "non_padded": 178,
1097
+ "effective_few_shots": 3.0,
1098
+ "num_truncated_few_shots": 0
1099
+ },
1100
+ "harness|bbh:sports_understanding|3": {
1101
+ "hashes": {
1102
+ "hash_examples": "042afbe5d9c1f02d",
1103
+ "hash_full_prompts": "3d46581e9bbec2d0",
1104
+ "hash_input_tokens": "8e9e99c22dd3a8d2",
1105
+ "hash_cont_tokens": "9fac6252ba9320d1"
1106
+ },
1107
+ "truncated": 250,
1108
+ "non_truncated": 0,
1109
+ "padded": 0,
1110
+ "non_padded": 250,
1111
+ "effective_few_shots": 3.0,
1112
+ "num_truncated_few_shots": 0
1113
+ },
1114
+ "harness|bbh:temporal_sequences|3": {
1115
+ "hashes": {
1116
+ "hash_examples": "803a05f352eb6afc",
1117
+ "hash_full_prompts": "4a54db144a5dd222",
1118
+ "hash_input_tokens": "24789970b2290dd3",
1119
+ "hash_cont_tokens": "0d2a5d01c536444e"
1120
+ },
1121
+ "truncated": 250,
1122
+ "non_truncated": 0,
1123
+ "padded": 0,
1124
+ "non_padded": 250,
1125
+ "effective_few_shots": 3.0,
1126
+ "num_truncated_few_shots": 0
1127
+ },
1128
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
1129
+ "hashes": {
1130
+ "hash_examples": "2bbac6db7ab0d527",
1131
+ "hash_full_prompts": "e3079106787cc311",
1132
+ "hash_input_tokens": "9036045cff895b08",
1133
+ "hash_cont_tokens": "332c47ab046359af"
1134
+ },
1135
+ "truncated": 250,
1136
+ "non_truncated": 0,
1137
+ "padded": 0,
1138
+ "non_padded": 250,
1139
+ "effective_few_shots": 3.0,
1140
+ "num_truncated_few_shots": 0
1141
+ },
1142
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
1143
+ "hashes": {
1144
+ "hash_examples": "845caf093ac2b58c",
1145
+ "hash_full_prompts": "6364e5b860590ec8",
1146
+ "hash_input_tokens": "7100c488aa0764ff",
1147
+ "hash_cont_tokens": "678424605b032e0c"
1148
+ },
1149
+ "truncated": 250,
1150
+ "non_truncated": 0,
1151
+ "padded": 0,
1152
+ "non_padded": 250,
1153
+ "effective_few_shots": 3.0,
1154
+ "num_truncated_few_shots": 0
1155
+ },
1156
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
1157
+ "hashes": {
1158
+ "hash_examples": "9004f14d5a32b9a8",
1159
+ "hash_full_prompts": "01aef56c4d1fe9fe",
1160
+ "hash_input_tokens": "b9690a5d32a586fc",
1161
+ "hash_cont_tokens": "bd35bdd85747fa0c"
1162
+ },
1163
+ "truncated": 250,
1164
+ "non_truncated": 0,
1165
+ "padded": 0,
1166
+ "non_padded": 250,
1167
+ "effective_few_shots": 3.0,
1168
+ "num_truncated_few_shots": 0
1169
+ }
1170
+ },
1171
+ "summary_general": {
1172
+ "hashes": {
1173
+ "hash_examples": "4ff1e3dc5703575d",
1174
+ "hash_full_prompts": "1cbeab0a00117cb8",
1175
+ "hash_input_tokens": "3608679dab4ce40e",
1176
+ "hash_cont_tokens": "ade7cabce940fa72"
1177
+ },
1178
+ "truncated": 4362,
1179
+ "non_truncated": 0,
1180
+ "padded": 0,
1181
+ "non_padded": 4362,
1182
+ "num_truncated_few_shots": 0
1183
+ }
1184
+ }