lewtun HF staff commited on
Commit
87a6174
·
verified ·
1 Parent(s): 05025ca

Upload eval_results/HuggingFaceH4/mistral-7b-kto/v0.0/bbh/results_2024-03-22T16-15-08.088486.json with huggingface_hub

Browse files
eval_results/HuggingFaceH4/mistral-7b-kto/v0.0/bbh/results_2024-03-22T16-15-08.088486.json ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2723.723505233,
9
+ "end_time": 2949.089739925,
10
+ "total_evaluation_time_secondes": "225.36623469199958",
11
+ "model_name": "HuggingFaceH4/mistral-7b-kto",
12
+ "model_sha": "23199c67ea7e0f8a1a635b7f918c25425289aeeb",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "13.99 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "harness|bbh:causal_judgment|3": {
19
+ "em": 0.5614973262032086,
20
+ "em_stderr": 0.03638341809400996,
21
+ "qem": 0.5614973262032086,
22
+ "qem_stderr": 0.03638341809400996,
23
+ "pem": 0.5775401069518716,
24
+ "pem_stderr": 0.03621824020753356,
25
+ "pqem": 0.5775401069518716,
26
+ "pqem_stderr": 0.03621824020753356,
27
+ "perfect_em": 0.5614973262032086,
28
+ "perfect_em_stderr": 0.03638341809400996
29
+ },
30
+ "harness|bbh:date_understanding|3": {
31
+ "em": 0.464,
32
+ "em_stderr": 0.031603975145223735,
33
+ "qem": 0.464,
34
+ "qem_stderr": 0.031603975145223735,
35
+ "pem": 0.464,
36
+ "pem_stderr": 0.031603975145223735,
37
+ "pqem": 0.552,
38
+ "pqem_stderr": 0.031514387611153515,
39
+ "perfect_em": 0.464,
40
+ "perfect_em_stderr": 0.031603975145223735
41
+ },
42
+ "harness|bbh:disambiguation_qa|3": {
43
+ "em": 0.604,
44
+ "em_stderr": 0.03099319785457785,
45
+ "qem": 0.604,
46
+ "qem_stderr": 0.03099319785457785,
47
+ "pem": 0.604,
48
+ "pem_stderr": 0.03099319785457785,
49
+ "pqem": 0.704,
50
+ "pqem_stderr": 0.02892893938837963,
51
+ "perfect_em": 0.604,
52
+ "perfect_em_stderr": 0.03099319785457785
53
+ },
54
+ "harness|bbh:geometric_shapes|3": {
55
+ "em": 0.22,
56
+ "em_stderr": 0.02625179282460584,
57
+ "qem": 0.22,
58
+ "qem_stderr": 0.02625179282460584,
59
+ "pem": 0.22,
60
+ "pem_stderr": 0.02625179282460584,
61
+ "pqem": 0.22,
62
+ "pqem_stderr": 0.02625179282460584,
63
+ "perfect_em": 0.22,
64
+ "perfect_em_stderr": 0.02625179282460584
65
+ },
66
+ "harness|bbh:logical_deduction_five_objects|3": {
67
+ "em": 0.396,
68
+ "em_stderr": 0.030993197854577853,
69
+ "qem": 0.396,
70
+ "qem_stderr": 0.030993197854577853,
71
+ "pem": 0.396,
72
+ "pem_stderr": 0.030993197854577853,
73
+ "pqem": 0.516,
74
+ "pqem_stderr": 0.03166998503010742,
75
+ "perfect_em": 0.396,
76
+ "perfect_em_stderr": 0.030993197854577853
77
+ },
78
+ "harness|bbh:logical_deduction_seven_objects|3": {
79
+ "em": 0.336,
80
+ "em_stderr": 0.029933259094191516,
81
+ "qem": 0.336,
82
+ "qem_stderr": 0.029933259094191516,
83
+ "pem": 0.336,
84
+ "pem_stderr": 0.029933259094191516,
85
+ "pqem": 0.448,
86
+ "pqem_stderr": 0.03151438761115355,
87
+ "perfect_em": 0.336,
88
+ "perfect_em_stderr": 0.029933259094191516
89
+ },
90
+ "harness|bbh:logical_deduction_three_objects|3": {
91
+ "em": 0.512,
92
+ "em_stderr": 0.03167708558254709,
93
+ "qem": 0.512,
94
+ "qem_stderr": 0.03167708558254709,
95
+ "pem": 0.512,
96
+ "pem_stderr": 0.03167708558254709,
97
+ "pqem": 0.736,
98
+ "pqem_stderr": 0.027934518957690908,
99
+ "perfect_em": 0.512,
100
+ "perfect_em_stderr": 0.03167708558254709
101
+ },
102
+ "harness|bbh:movie_recommendation|3": {
103
+ "em": 0.42168674698795183,
104
+ "em_stderr": 0.031358169846784814,
105
+ "qem": 0.42168674698795183,
106
+ "qem_stderr": 0.031358169846784814,
107
+ "pem": 0.42168674698795183,
108
+ "pem_stderr": 0.031358169846784814,
109
+ "pqem": 0.5622489959839357,
110
+ "pqem_stderr": 0.03150301204870893,
111
+ "perfect_em": 0.42168674698795183,
112
+ "perfect_em_stderr": 0.031358169846784814
113
+ },
114
+ "harness|bbh:navigate|3": {
115
+ "em": 0.624,
116
+ "em_stderr": 0.03069633626739458,
117
+ "qem": 0.624,
118
+ "qem_stderr": 0.03069633626739458,
119
+ "pem": 0.624,
120
+ "pem_stderr": 0.03069633626739458,
121
+ "pqem": 0.624,
122
+ "pqem_stderr": 0.03069633626739458,
123
+ "perfect_em": 0.624,
124
+ "perfect_em_stderr": 0.03069633626739458
125
+ },
126
+ "harness|bbh:reasoning_about_colored_objects|3": {
127
+ "em": 0.352,
128
+ "em_stderr": 0.03026628805735994,
129
+ "qem": 0.352,
130
+ "qem_stderr": 0.03026628805735994,
131
+ "pem": 0.368,
132
+ "pem_stderr": 0.030562070620993167,
133
+ "pqem": 0.496,
134
+ "pqem_stderr": 0.031685198551199154,
135
+ "perfect_em": 0.352,
136
+ "perfect_em_stderr": 0.03026628805735994
137
+ },
138
+ "harness|bbh:ruin_names|3": {
139
+ "em": 0.375,
140
+ "em_stderr": 0.03080400363063401,
141
+ "qem": 0.375,
142
+ "qem_stderr": 0.03080400363063401,
143
+ "pem": 0.375,
144
+ "pem_stderr": 0.03080400363063401,
145
+ "pqem": 0.5645161290322581,
146
+ "pqem_stderr": 0.031548283738756754,
147
+ "perfect_em": 0.375,
148
+ "perfect_em_stderr": 0.03080400363063401
149
+ },
150
+ "harness|bbh:salient_translation_error_detection|3": {
151
+ "em": 0.384,
152
+ "em_stderr": 0.03082167911737538,
153
+ "qem": 0.384,
154
+ "qem_stderr": 0.03082167911737538,
155
+ "pem": 0.384,
156
+ "pem_stderr": 0.03082167911737538,
157
+ "pqem": 0.516,
158
+ "pqem_stderr": 0.03166998503010742,
159
+ "perfect_em": 0.384,
160
+ "perfect_em_stderr": 0.03082167911737538
161
+ },
162
+ "harness|bbh:snarks|3": {
163
+ "em": 0.7303370786516854,
164
+ "em_stderr": 0.03335689818443928,
165
+ "qem": 0.7303370786516854,
166
+ "qem_stderr": 0.03335689818443928,
167
+ "pem": 0.7359550561797753,
168
+ "pem_stderr": 0.0331343107765884,
169
+ "pqem": 0.7752808988764045,
170
+ "pqem_stderr": 0.031373495121251,
171
+ "perfect_em": 0.7303370786516854,
172
+ "perfect_em_stderr": 0.03335689818443928
173
+ },
174
+ "harness|bbh:sports_understanding|3": {
175
+ "em": 0.536,
176
+ "em_stderr": 0.03160397514522374,
177
+ "qem": 0.536,
178
+ "qem_stderr": 0.03160397514522374,
179
+ "pem": 0.772,
180
+ "pem_stderr": 0.026587432487268473,
181
+ "pqem": 0.772,
182
+ "pqem_stderr": 0.026587432487268473,
183
+ "perfect_em": 0.536,
184
+ "perfect_em_stderr": 0.03160397514522374
185
+ },
186
+ "harness|bbh:temporal_sequences|3": {
187
+ "em": 0.264,
188
+ "em_stderr": 0.027934518957690908,
189
+ "qem": 0.264,
190
+ "qem_stderr": 0.027934518957690908,
191
+ "pem": 0.264,
192
+ "pem_stderr": 0.027934518957690908,
193
+ "pqem": 0.496,
194
+ "pqem_stderr": 0.03168519855119917,
195
+ "perfect_em": 0.264,
196
+ "perfect_em_stderr": 0.027934518957690908
197
+ },
198
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
199
+ "em": 0.152,
200
+ "em_stderr": 0.022752024491765464,
201
+ "qem": 0.152,
202
+ "qem_stderr": 0.022752024491765464,
203
+ "pem": 0.152,
204
+ "pem_stderr": 0.022752024491765464,
205
+ "pqem": 0.352,
206
+ "pqem_stderr": 0.030266288057359945,
207
+ "perfect_em": 0.152,
208
+ "perfect_em_stderr": 0.022752024491765464
209
+ },
210
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
211
+ "em": 0.144,
212
+ "em_stderr": 0.022249407735450213,
213
+ "qem": 0.144,
214
+ "qem_stderr": 0.022249407735450213,
215
+ "pem": 0.144,
216
+ "pem_stderr": 0.022249407735450213,
217
+ "pqem": 0.288,
218
+ "pqem_stderr": 0.028697004587398215,
219
+ "perfect_em": 0.144,
220
+ "perfect_em_stderr": 0.022249407735450213
221
+ },
222
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
223
+ "em": 0.336,
224
+ "em_stderr": 0.029933259094191512,
225
+ "qem": 0.336,
226
+ "qem_stderr": 0.029933259094191512,
227
+ "pem": 0.336,
228
+ "pem_stderr": 0.029933259094191512,
229
+ "pqem": 0.648,
230
+ "pqem_stderr": 0.030266288057359928,
231
+ "perfect_em": 0.336,
232
+ "perfect_em_stderr": 0.029933259094191512
233
+ },
234
+ "harness|bbh:_average|3": {
235
+ "em": 0.411806730657936,
236
+ "em_stderr": 0.029978471498780204,
237
+ "qem": 0.411806730657936,
238
+ "qem_stderr": 0.029978471498780204,
239
+ "pem": 0.4270101061177556,
240
+ "pem_stderr": 0.029694664532744133,
241
+ "pqem": 0.5470881183802484,
242
+ "pqem_stderr": 0.030556154118257114,
243
+ "perfect_em": 0.411806730657936,
244
+ "perfect_em_stderr": 0.029978471498780204
245
+ }
246
+ },
247
+ "versions": {
248
+ "harness|bbh:causal_judgment|3": 0,
249
+ "harness|bbh:date_understanding|3": 0,
250
+ "harness|bbh:disambiguation_qa|3": 0,
251
+ "harness|bbh:geometric_shapes|3": 0,
252
+ "harness|bbh:logical_deduction_five_objects|3": 0,
253
+ "harness|bbh:logical_deduction_seven_objects|3": 0,
254
+ "harness|bbh:logical_deduction_three_objects|3": 0,
255
+ "harness|bbh:movie_recommendation|3": 0,
256
+ "harness|bbh:navigate|3": 0,
257
+ "harness|bbh:reasoning_about_colored_objects|3": 0,
258
+ "harness|bbh:ruin_names|3": 0,
259
+ "harness|bbh:salient_translation_error_detection|3": 0,
260
+ "harness|bbh:snarks|3": 0,
261
+ "harness|bbh:sports_understanding|3": 0,
262
+ "harness|bbh:temporal_sequences|3": 0,
263
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
264
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
265
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": 0
266
+ },
267
+ "config_tasks": {
268
+ "harness|bbh:causal_judgment": {
269
+ "name": "bbh:causal_judgment",
270
+ "prompt_function": "bbh_causal_judgment",
271
+ "hf_repo": "lukaemon/bbh",
272
+ "hf_subset": "causal_judgement",
273
+ "metric": [
274
+ "exact_match",
275
+ "quasi_exact_match",
276
+ "prefix_exact_match",
277
+ "prefix_quasi_exact_match",
278
+ "perfect_exact_match"
279
+ ],
280
+ "hf_avail_splits": [
281
+ "test"
282
+ ],
283
+ "evaluation_splits": [
284
+ "test"
285
+ ],
286
+ "few_shots_split": null,
287
+ "few_shots_select": null,
288
+ "generation_size": 20,
289
+ "stop_sequence": [
290
+ "</s>",
291
+ "Q:",
292
+ "\n\n"
293
+ ],
294
+ "output_regex": null,
295
+ "frozen": false,
296
+ "suite": [
297
+ "harness"
298
+ ],
299
+ "original_num_docs": 187,
300
+ "effective_num_docs": 187,
301
+ "trust_dataset": true,
302
+ "must_remove_duplicate_docs": null
303
+ },
304
+ "harness|bbh:date_understanding": {
305
+ "name": "bbh:date_understanding",
306
+ "prompt_function": "bbh_date_understanding",
307
+ "hf_repo": "lukaemon/bbh",
308
+ "hf_subset": "date_understanding",
309
+ "metric": [
310
+ "exact_match",
311
+ "quasi_exact_match",
312
+ "prefix_exact_match",
313
+ "prefix_quasi_exact_match",
314
+ "perfect_exact_match"
315
+ ],
316
+ "hf_avail_splits": [
317
+ "test"
318
+ ],
319
+ "evaluation_splits": [
320
+ "test"
321
+ ],
322
+ "few_shots_split": null,
323
+ "few_shots_select": null,
324
+ "generation_size": 20,
325
+ "stop_sequence": [
326
+ "</s>",
327
+ "Q:",
328
+ "\n\n"
329
+ ],
330
+ "output_regex": null,
331
+ "frozen": false,
332
+ "suite": [
333
+ "harness"
334
+ ],
335
+ "original_num_docs": 250,
336
+ "effective_num_docs": 250,
337
+ "trust_dataset": true,
338
+ "must_remove_duplicate_docs": null
339
+ },
340
+ "harness|bbh:disambiguation_qa": {
341
+ "name": "bbh:disambiguation_qa",
342
+ "prompt_function": "bbh_disambiguation_qa",
343
+ "hf_repo": "lukaemon/bbh",
344
+ "hf_subset": "disambiguation_qa",
345
+ "metric": [
346
+ "exact_match",
347
+ "quasi_exact_match",
348
+ "prefix_exact_match",
349
+ "prefix_quasi_exact_match",
350
+ "perfect_exact_match"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "test"
354
+ ],
355
+ "evaluation_splits": [
356
+ "test"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": 20,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "harness"
370
+ ],
371
+ "original_num_docs": 250,
372
+ "effective_num_docs": 250,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "harness|bbh:geometric_shapes": {
377
+ "name": "bbh:geometric_shapes",
378
+ "prompt_function": "bbh_geometric_shapes",
379
+ "hf_repo": "lukaemon/bbh",
380
+ "hf_subset": "geometric_shapes",
381
+ "metric": [
382
+ "exact_match",
383
+ "quasi_exact_match",
384
+ "prefix_exact_match",
385
+ "prefix_quasi_exact_match",
386
+ "perfect_exact_match"
387
+ ],
388
+ "hf_avail_splits": [
389
+ "test"
390
+ ],
391
+ "evaluation_splits": [
392
+ "test"
393
+ ],
394
+ "few_shots_split": null,
395
+ "few_shots_select": null,
396
+ "generation_size": 20,
397
+ "stop_sequence": [
398
+ "</s>",
399
+ "Q:",
400
+ "\n\n"
401
+ ],
402
+ "output_regex": null,
403
+ "frozen": false,
404
+ "suite": [
405
+ "harness"
406
+ ],
407
+ "original_num_docs": 250,
408
+ "effective_num_docs": 250,
409
+ "trust_dataset": true,
410
+ "must_remove_duplicate_docs": null
411
+ },
412
+ "harness|bbh:logical_deduction_five_objects": {
413
+ "name": "bbh:logical_deduction_five_objects",
414
+ "prompt_function": "bbh_logical_deduction_five_objects",
415
+ "hf_repo": "lukaemon/bbh",
416
+ "hf_subset": "logical_deduction_five_objects",
417
+ "metric": [
418
+ "exact_match",
419
+ "quasi_exact_match",
420
+ "prefix_exact_match",
421
+ "prefix_quasi_exact_match",
422
+ "perfect_exact_match"
423
+ ],
424
+ "hf_avail_splits": [
425
+ "test"
426
+ ],
427
+ "evaluation_splits": [
428
+ "test"
429
+ ],
430
+ "few_shots_split": null,
431
+ "few_shots_select": null,
432
+ "generation_size": 20,
433
+ "stop_sequence": [
434
+ "</s>",
435
+ "Q:",
436
+ "\n\n"
437
+ ],
438
+ "output_regex": null,
439
+ "frozen": false,
440
+ "suite": [
441
+ "harness"
442
+ ],
443
+ "original_num_docs": 250,
444
+ "effective_num_docs": 250,
445
+ "trust_dataset": true,
446
+ "must_remove_duplicate_docs": null
447
+ },
448
+ "harness|bbh:logical_deduction_seven_objects": {
449
+ "name": "bbh:logical_deduction_seven_objects",
450
+ "prompt_function": "bbh_logical_deduction_seven_objects",
451
+ "hf_repo": "lukaemon/bbh",
452
+ "hf_subset": "logical_deduction_seven_objects",
453
+ "metric": [
454
+ "exact_match",
455
+ "quasi_exact_match",
456
+ "prefix_exact_match",
457
+ "prefix_quasi_exact_match",
458
+ "perfect_exact_match"
459
+ ],
460
+ "hf_avail_splits": [
461
+ "test"
462
+ ],
463
+ "evaluation_splits": [
464
+ "test"
465
+ ],
466
+ "few_shots_split": null,
467
+ "few_shots_select": null,
468
+ "generation_size": 20,
469
+ "stop_sequence": [
470
+ "</s>",
471
+ "Q:",
472
+ "\n\n"
473
+ ],
474
+ "output_regex": null,
475
+ "frozen": false,
476
+ "suite": [
477
+ "harness"
478
+ ],
479
+ "original_num_docs": 250,
480
+ "effective_num_docs": 250,
481
+ "trust_dataset": true,
482
+ "must_remove_duplicate_docs": null
483
+ },
484
+ "harness|bbh:logical_deduction_three_objects": {
485
+ "name": "bbh:logical_deduction_three_objects",
486
+ "prompt_function": "bbh_logical_deduction_three_objects",
487
+ "hf_repo": "lukaemon/bbh",
488
+ "hf_subset": "logical_deduction_three_objects",
489
+ "metric": [
490
+ "exact_match",
491
+ "quasi_exact_match",
492
+ "prefix_exact_match",
493
+ "prefix_quasi_exact_match",
494
+ "perfect_exact_match"
495
+ ],
496
+ "hf_avail_splits": [
497
+ "test"
498
+ ],
499
+ "evaluation_splits": [
500
+ "test"
501
+ ],
502
+ "few_shots_split": null,
503
+ "few_shots_select": null,
504
+ "generation_size": 20,
505
+ "stop_sequence": [
506
+ "</s>",
507
+ "Q:",
508
+ "\n\n"
509
+ ],
510
+ "output_regex": null,
511
+ "frozen": false,
512
+ "suite": [
513
+ "harness"
514
+ ],
515
+ "original_num_docs": 250,
516
+ "effective_num_docs": 250,
517
+ "trust_dataset": true,
518
+ "must_remove_duplicate_docs": null
519
+ },
520
+ "harness|bbh:movie_recommendation": {
521
+ "name": "bbh:movie_recommendation",
522
+ "prompt_function": "bbh_movie_recommendation",
523
+ "hf_repo": "lukaemon/bbh",
524
+ "hf_subset": "movie_recommendation",
525
+ "metric": [
526
+ "exact_match",
527
+ "quasi_exact_match",
528
+ "prefix_exact_match",
529
+ "prefix_quasi_exact_match",
530
+ "perfect_exact_match"
531
+ ],
532
+ "hf_avail_splits": [
533
+ "test"
534
+ ],
535
+ "evaluation_splits": [
536
+ "test"
537
+ ],
538
+ "few_shots_split": null,
539
+ "few_shots_select": null,
540
+ "generation_size": 20,
541
+ "stop_sequence": [
542
+ "</s>",
543
+ "Q:",
544
+ "\n\n"
545
+ ],
546
+ "output_regex": null,
547
+ "frozen": false,
548
+ "suite": [
549
+ "harness"
550
+ ],
551
+ "original_num_docs": 249,
552
+ "effective_num_docs": 249,
553
+ "trust_dataset": true,
554
+ "must_remove_duplicate_docs": null
555
+ },
556
+ "harness|bbh:navigate": {
557
+ "name": "bbh:navigate",
558
+ "prompt_function": "bbh_navigate",
559
+ "hf_repo": "lukaemon/bbh",
560
+ "hf_subset": "navigate",
561
+ "metric": [
562
+ "exact_match",
563
+ "quasi_exact_match",
564
+ "prefix_exact_match",
565
+ "prefix_quasi_exact_match",
566
+ "perfect_exact_match"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": null,
576
+ "generation_size": 20,
577
+ "stop_sequence": [
578
+ "</s>",
579
+ "Q:",
580
+ "\n\n"
581
+ ],
582
+ "output_regex": null,
583
+ "frozen": false,
584
+ "suite": [
585
+ "harness"
586
+ ],
587
+ "original_num_docs": 250,
588
+ "effective_num_docs": 250,
589
+ "trust_dataset": true,
590
+ "must_remove_duplicate_docs": null
591
+ },
592
+ "harness|bbh:reasoning_about_colored_objects": {
593
+ "name": "bbh:reasoning_about_colored_objects",
594
+ "prompt_function": "bbh_reasoning_about_colored_objects",
595
+ "hf_repo": "lukaemon/bbh",
596
+ "hf_subset": "reasoning_about_colored_objects",
597
+ "metric": [
598
+ "exact_match",
599
+ "quasi_exact_match",
600
+ "prefix_exact_match",
601
+ "prefix_quasi_exact_match",
602
+ "perfect_exact_match"
603
+ ],
604
+ "hf_avail_splits": [
605
+ "test"
606
+ ],
607
+ "evaluation_splits": [
608
+ "test"
609
+ ],
610
+ "few_shots_split": null,
611
+ "few_shots_select": null,
612
+ "generation_size": 20,
613
+ "stop_sequence": [
614
+ "</s>",
615
+ "Q:",
616
+ "\n\n"
617
+ ],
618
+ "output_regex": null,
619
+ "frozen": false,
620
+ "suite": [
621
+ "harness"
622
+ ],
623
+ "original_num_docs": 250,
624
+ "effective_num_docs": 250,
625
+ "trust_dataset": true,
626
+ "must_remove_duplicate_docs": null
627
+ },
628
+ "harness|bbh:ruin_names": {
629
+ "name": "bbh:ruin_names",
630
+ "prompt_function": "bbh_ruin_names",
631
+ "hf_repo": "lukaemon/bbh",
632
+ "hf_subset": "ruin_names",
633
+ "metric": [
634
+ "exact_match",
635
+ "quasi_exact_match",
636
+ "prefix_exact_match",
637
+ "prefix_quasi_exact_match",
638
+ "perfect_exact_match"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "test"
642
+ ],
643
+ "evaluation_splits": [
644
+ "test"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": 20,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "harness"
658
+ ],
659
+ "original_num_docs": 248,
660
+ "effective_num_docs": 248,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "harness|bbh:salient_translation_error_detection": {
665
+ "name": "bbh:salient_translation_error_detection",
666
+ "prompt_function": "bbh_salient_translation_error_detection",
667
+ "hf_repo": "lukaemon/bbh",
668
+ "hf_subset": "salient_translation_error_detection",
669
+ "metric": [
670
+ "exact_match",
671
+ "quasi_exact_match",
672
+ "prefix_exact_match",
673
+ "prefix_quasi_exact_match",
674
+ "perfect_exact_match"
675
+ ],
676
+ "hf_avail_splits": [
677
+ "test"
678
+ ],
679
+ "evaluation_splits": [
680
+ "test"
681
+ ],
682
+ "few_shots_split": null,
683
+ "few_shots_select": null,
684
+ "generation_size": 20,
685
+ "stop_sequence": [
686
+ "</s>",
687
+ "Q:",
688
+ "\n\n"
689
+ ],
690
+ "output_regex": null,
691
+ "frozen": false,
692
+ "suite": [
693
+ "harness"
694
+ ],
695
+ "original_num_docs": 250,
696
+ "effective_num_docs": 250,
697
+ "trust_dataset": true,
698
+ "must_remove_duplicate_docs": null
699
+ },
700
+ "harness|bbh:snarks": {
701
+ "name": "bbh:snarks",
702
+ "prompt_function": "bbh_snarks",
703
+ "hf_repo": "lukaemon/bbh",
704
+ "hf_subset": "snarks",
705
+ "metric": [
706
+ "exact_match",
707
+ "quasi_exact_match",
708
+ "prefix_exact_match",
709
+ "prefix_quasi_exact_match",
710
+ "perfect_exact_match"
711
+ ],
712
+ "hf_avail_splits": [
713
+ "test"
714
+ ],
715
+ "evaluation_splits": [
716
+ "test"
717
+ ],
718
+ "few_shots_split": null,
719
+ "few_shots_select": null,
720
+ "generation_size": 20,
721
+ "stop_sequence": [
722
+ "</s>",
723
+ "Q:",
724
+ "\n\n"
725
+ ],
726
+ "output_regex": null,
727
+ "frozen": false,
728
+ "suite": [
729
+ "harness"
730
+ ],
731
+ "original_num_docs": 178,
732
+ "effective_num_docs": 178,
733
+ "trust_dataset": true,
734
+ "must_remove_duplicate_docs": null
735
+ },
736
+ "harness|bbh:sports_understanding": {
737
+ "name": "bbh:sports_understanding",
738
+ "prompt_function": "bbh_sports_understanding",
739
+ "hf_repo": "lukaemon/bbh",
740
+ "hf_subset": "sports_understanding",
741
+ "metric": [
742
+ "exact_match",
743
+ "quasi_exact_match",
744
+ "prefix_exact_match",
745
+ "prefix_quasi_exact_match",
746
+ "perfect_exact_match"
747
+ ],
748
+ "hf_avail_splits": [
749
+ "test"
750
+ ],
751
+ "evaluation_splits": [
752
+ "test"
753
+ ],
754
+ "few_shots_split": null,
755
+ "few_shots_select": null,
756
+ "generation_size": 20,
757
+ "stop_sequence": [
758
+ "</s>",
759
+ "Q:",
760
+ "\n\n"
761
+ ],
762
+ "output_regex": null,
763
+ "frozen": false,
764
+ "suite": [
765
+ "harness"
766
+ ],
767
+ "original_num_docs": 250,
768
+ "effective_num_docs": 250,
769
+ "trust_dataset": true,
770
+ "must_remove_duplicate_docs": null
771
+ },
772
+ "harness|bbh:temporal_sequences": {
773
+ "name": "bbh:temporal_sequences",
774
+ "prompt_function": "bbh_temporal_sequences",
775
+ "hf_repo": "lukaemon/bbh",
776
+ "hf_subset": "temporal_sequences",
777
+ "metric": [
778
+ "exact_match",
779
+ "quasi_exact_match",
780
+ "prefix_exact_match",
781
+ "prefix_quasi_exact_match",
782
+ "perfect_exact_match"
783
+ ],
784
+ "hf_avail_splits": [
785
+ "test"
786
+ ],
787
+ "evaluation_splits": [
788
+ "test"
789
+ ],
790
+ "few_shots_split": null,
791
+ "few_shots_select": null,
792
+ "generation_size": 20,
793
+ "stop_sequence": [
794
+ "</s>",
795
+ "Q:",
796
+ "\n\n"
797
+ ],
798
+ "output_regex": null,
799
+ "frozen": false,
800
+ "suite": [
801
+ "harness"
802
+ ],
803
+ "original_num_docs": 250,
804
+ "effective_num_docs": 250,
805
+ "trust_dataset": true,
806
+ "must_remove_duplicate_docs": null
807
+ },
808
+ "harness|bbh:tracking_shuffled_objects_five_objects": {
809
+ "name": "bbh:tracking_shuffled_objects_five_objects",
810
+ "prompt_function": "bbh_tracking_shuffled_objects_five_objects",
811
+ "hf_repo": "lukaemon/bbh",
812
+ "hf_subset": "tracking_shuffled_objects_five_objects",
813
+ "metric": [
814
+ "exact_match",
815
+ "quasi_exact_match",
816
+ "prefix_exact_match",
817
+ "prefix_quasi_exact_match",
818
+ "perfect_exact_match"
819
+ ],
820
+ "hf_avail_splits": [
821
+ "test"
822
+ ],
823
+ "evaluation_splits": [
824
+ "test"
825
+ ],
826
+ "few_shots_split": null,
827
+ "few_shots_select": null,
828
+ "generation_size": 20,
829
+ "stop_sequence": [
830
+ "</s>",
831
+ "Q:",
832
+ "\n\n"
833
+ ],
834
+ "output_regex": null,
835
+ "frozen": false,
836
+ "suite": [
837
+ "harness"
838
+ ],
839
+ "original_num_docs": 250,
840
+ "effective_num_docs": 250,
841
+ "trust_dataset": true,
842
+ "must_remove_duplicate_docs": null
843
+ },
844
+ "harness|bbh:tracking_shuffled_objects_seven_objects": {
845
+ "name": "bbh:tracking_shuffled_objects_seven_objects",
846
+ "prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
847
+ "hf_repo": "lukaemon/bbh",
848
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
849
+ "metric": [
850
+ "exact_match",
851
+ "quasi_exact_match",
852
+ "prefix_exact_match",
853
+ "prefix_quasi_exact_match",
854
+ "perfect_exact_match"
855
+ ],
856
+ "hf_avail_splits": [
857
+ "test"
858
+ ],
859
+ "evaluation_splits": [
860
+ "test"
861
+ ],
862
+ "few_shots_split": null,
863
+ "few_shots_select": null,
864
+ "generation_size": 20,
865
+ "stop_sequence": [
866
+ "</s>",
867
+ "Q:",
868
+ "\n\n"
869
+ ],
870
+ "output_regex": null,
871
+ "frozen": false,
872
+ "suite": [
873
+ "harness"
874
+ ],
875
+ "original_num_docs": 250,
876
+ "effective_num_docs": 250,
877
+ "trust_dataset": true,
878
+ "must_remove_duplicate_docs": null
879
+ },
880
+ "harness|bbh:tracking_shuffled_objects_three_objects": {
881
+ "name": "bbh:tracking_shuffled_objects_three_objects",
882
+ "prompt_function": "bbh_tracking_shuffled_objects_three_objects",
883
+ "hf_repo": "lukaemon/bbh",
884
+ "hf_subset": "tracking_shuffled_objects_three_objects",
885
+ "metric": [
886
+ "exact_match",
887
+ "quasi_exact_match",
888
+ "prefix_exact_match",
889
+ "prefix_quasi_exact_match",
890
+ "perfect_exact_match"
891
+ ],
892
+ "hf_avail_splits": [
893
+ "test"
894
+ ],
895
+ "evaluation_splits": [
896
+ "test"
897
+ ],
898
+ "few_shots_split": null,
899
+ "few_shots_select": null,
900
+ "generation_size": 20,
901
+ "stop_sequence": [
902
+ "</s>",
903
+ "Q:",
904
+ "\n\n"
905
+ ],
906
+ "output_regex": null,
907
+ "frozen": false,
908
+ "suite": [
909
+ "harness"
910
+ ],
911
+ "original_num_docs": 250,
912
+ "effective_num_docs": 250,
913
+ "trust_dataset": true,
914
+ "must_remove_duplicate_docs": null
915
+ }
916
+ },
917
+ "summary_tasks": {
918
+ "harness|bbh:causal_judgment|3": {
919
+ "hashes": {
920
+ "hash_examples": "63218f5ae055ab2b",
921
+ "hash_full_prompts": "fa8168f39a475fb0",
922
+ "hash_input_tokens": "787f75e06fd43c0d",
923
+ "hash_cont_tokens": "284e857febd4be85"
924
+ },
925
+ "truncated": 187,
926
+ "non_truncated": 0,
927
+ "padded": 0,
928
+ "non_padded": 187,
929
+ "effective_few_shots": 3.0,
930
+ "num_truncated_few_shots": 0
931
+ },
932
+ "harness|bbh:date_understanding|3": {
933
+ "hashes": {
934
+ "hash_examples": "f145c7a06def3c8e",
935
+ "hash_full_prompts": "2cceeea606638d49",
936
+ "hash_input_tokens": "10c13d6fb8af7c22",
937
+ "hash_cont_tokens": "d3c601f01854a89e"
938
+ },
939
+ "truncated": 250,
940
+ "non_truncated": 0,
941
+ "padded": 0,
942
+ "non_padded": 250,
943
+ "effective_few_shots": 3.0,
944
+ "num_truncated_few_shots": 0
945
+ },
946
+ "harness|bbh:disambiguation_qa|3": {
947
+ "hashes": {
948
+ "hash_examples": "19677fd1773f7eb9",
949
+ "hash_full_prompts": "d8f1ba70c22ae578",
950
+ "hash_input_tokens": "c21a88707f480cab",
951
+ "hash_cont_tokens": "8705f09efb1872ed"
952
+ },
953
+ "truncated": 250,
954
+ "non_truncated": 0,
955
+ "padded": 0,
956
+ "non_padded": 250,
957
+ "effective_few_shots": 3.0,
958
+ "num_truncated_few_shots": 0
959
+ },
960
+ "harness|bbh:geometric_shapes|3": {
961
+ "hashes": {
962
+ "hash_examples": "76c7b11a13cc72a9",
963
+ "hash_full_prompts": "52a60ed1d0113b8b",
964
+ "hash_input_tokens": "10e113b2cf3fa584",
965
+ "hash_cont_tokens": "aa68d879c1c72d3c"
966
+ },
967
+ "truncated": 250,
968
+ "non_truncated": 0,
969
+ "padded": 0,
970
+ "non_padded": 250,
971
+ "effective_few_shots": 3.0,
972
+ "num_truncated_few_shots": 0
973
+ },
974
+ "harness|bbh:logical_deduction_five_objects|3": {
975
+ "hashes": {
976
+ "hash_examples": "0e958c856332a745",
977
+ "hash_full_prompts": "253aa9791c941909",
978
+ "hash_input_tokens": "0bc166cab0aed76a",
979
+ "hash_cont_tokens": "a33748b3d10c2699"
980
+ },
981
+ "truncated": 250,
982
+ "non_truncated": 0,
983
+ "padded": 0,
984
+ "non_padded": 250,
985
+ "effective_few_shots": 3.0,
986
+ "num_truncated_few_shots": 0
987
+ },
988
+ "harness|bbh:logical_deduction_seven_objects|3": {
989
+ "hashes": {
990
+ "hash_examples": "ab9de25a5eb40d09",
991
+ "hash_full_prompts": "aa6117f601cd268e",
992
+ "hash_input_tokens": "ab99c78b48e3a0bb",
993
+ "hash_cont_tokens": "3edfc27f0e21c94d"
994
+ },
995
+ "truncated": 250,
996
+ "non_truncated": 0,
997
+ "padded": 0,
998
+ "non_padded": 250,
999
+ "effective_few_shots": 3.0,
1000
+ "num_truncated_few_shots": 0
1001
+ },
1002
+ "harness|bbh:logical_deduction_three_objects|3": {
1003
+ "hashes": {
1004
+ "hash_examples": "3c6bf52517714218",
1005
+ "hash_full_prompts": "1892b050bc7848a4",
1006
+ "hash_input_tokens": "a720b56aa7c52551",
1007
+ "hash_cont_tokens": "abd8a52498d4f728"
1008
+ },
1009
+ "truncated": 250,
1010
+ "non_truncated": 0,
1011
+ "padded": 0,
1012
+ "non_padded": 250,
1013
+ "effective_few_shots": 3.0,
1014
+ "num_truncated_few_shots": 0
1015
+ },
1016
+ "harness|bbh:movie_recommendation|3": {
1017
+ "hashes": {
1018
+ "hash_examples": "2d9dc4975935d31a",
1019
+ "hash_full_prompts": "8e00606ed3407167",
1020
+ "hash_input_tokens": "c825ab1c99245a17",
1021
+ "hash_cont_tokens": "5ec900e40cbc845e"
1022
+ },
1023
+ "truncated": 249,
1024
+ "non_truncated": 0,
1025
+ "padded": 0,
1026
+ "non_padded": 249,
1027
+ "effective_few_shots": 3.0,
1028
+ "num_truncated_few_shots": 0
1029
+ },
1030
+ "harness|bbh:navigate|3": {
1031
+ "hashes": {
1032
+ "hash_examples": "ba91dcdb9a064255",
1033
+ "hash_full_prompts": "8d50c5baf1df7aef",
1034
+ "hash_input_tokens": "f234e6b28ea1fa49",
1035
+ "hash_cont_tokens": "ee12c9f32365d922"
1036
+ },
1037
+ "truncated": 250,
1038
+ "non_truncated": 0,
1039
+ "padded": 0,
1040
+ "non_padded": 250,
1041
+ "effective_few_shots": 3.0,
1042
+ "num_truncated_few_shots": 0
1043
+ },
1044
+ "harness|bbh:reasoning_about_colored_objects|3": {
1045
+ "hashes": {
1046
+ "hash_examples": "a6ba328c4c3385d2",
1047
+ "hash_full_prompts": "3d2441a21c12a960",
1048
+ "hash_input_tokens": "f3b577892955aa84",
1049
+ "hash_cont_tokens": "e87bd4c4e98a3778"
1050
+ },
1051
+ "truncated": 250,
1052
+ "non_truncated": 0,
1053
+ "padded": 0,
1054
+ "non_padded": 250,
1055
+ "effective_few_shots": 3.0,
1056
+ "num_truncated_few_shots": 0
1057
+ },
1058
+ "harness|bbh:ruin_names|3": {
1059
+ "hashes": {
1060
+ "hash_examples": "2ef28d5f2d4fdd25",
1061
+ "hash_full_prompts": "ba95caa786f313b1",
1062
+ "hash_input_tokens": "9954b30d4205604a",
1063
+ "hash_cont_tokens": "1d43f73a5b320bd5"
1064
+ },
1065
+ "truncated": 248,
1066
+ "non_truncated": 0,
1067
+ "padded": 0,
1068
+ "non_padded": 248,
1069
+ "effective_few_shots": 3.0,
1070
+ "num_truncated_few_shots": 0
1071
+ },
1072
+ "harness|bbh:salient_translation_error_detection|3": {
1073
+ "hashes": {
1074
+ "hash_examples": "c13f25ec8ffed496",
1075
+ "hash_full_prompts": "a8512d174e1cab8f",
1076
+ "hash_input_tokens": "3e738df24b7eddf8",
1077
+ "hash_cont_tokens": "c134887984e0a3e3"
1078
+ },
1079
+ "truncated": 250,
1080
+ "non_truncated": 0,
1081
+ "padded": 0,
1082
+ "non_padded": 250,
1083
+ "effective_few_shots": 3.0,
1084
+ "num_truncated_few_shots": 0
1085
+ },
1086
+ "harness|bbh:snarks|3": {
1087
+ "hashes": {
1088
+ "hash_examples": "5f6db7bff7f6f22e",
1089
+ "hash_full_prompts": "ff91d81466b9041f",
1090
+ "hash_input_tokens": "21388b09e13d0208",
1091
+ "hash_cont_tokens": "cb0d472786a5109c"
1092
+ },
1093
+ "truncated": 178,
1094
+ "non_truncated": 0,
1095
+ "padded": 0,
1096
+ "non_padded": 178,
1097
+ "effective_few_shots": 3.0,
1098
+ "num_truncated_few_shots": 0
1099
+ },
1100
+ "harness|bbh:sports_understanding|3": {
1101
+ "hashes": {
1102
+ "hash_examples": "042afbe5d9c1f02d",
1103
+ "hash_full_prompts": "a59324d9eb37e0f5",
1104
+ "hash_input_tokens": "0ad41bb8d2290a5b",
1105
+ "hash_cont_tokens": "d597f572759be385"
1106
+ },
1107
+ "truncated": 250,
1108
+ "non_truncated": 0,
1109
+ "padded": 0,
1110
+ "non_padded": 250,
1111
+ "effective_few_shots": 3.0,
1112
+ "num_truncated_few_shots": 0
1113
+ },
1114
+ "harness|bbh:temporal_sequences|3": {
1115
+ "hashes": {
1116
+ "hash_examples": "803a05f352eb6afc",
1117
+ "hash_full_prompts": "1b3971192bf481e7",
1118
+ "hash_input_tokens": "3051b60940ccceab",
1119
+ "hash_cont_tokens": "7f65864d92d59a9a"
1120
+ },
1121
+ "truncated": 250,
1122
+ "non_truncated": 0,
1123
+ "padded": 0,
1124
+ "non_padded": 250,
1125
+ "effective_few_shots": 3.0,
1126
+ "num_truncated_few_shots": 0
1127
+ },
1128
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
1129
+ "hashes": {
1130
+ "hash_examples": "2bbac6db7ab0d527",
1131
+ "hash_full_prompts": "7ef4567d2fcf5094",
1132
+ "hash_input_tokens": "b841310ee5531238",
1133
+ "hash_cont_tokens": "76b08e70815da010"
1134
+ },
1135
+ "truncated": 250,
1136
+ "non_truncated": 0,
1137
+ "padded": 0,
1138
+ "non_padded": 250,
1139
+ "effective_few_shots": 3.0,
1140
+ "num_truncated_few_shots": 0
1141
+ },
1142
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
1143
+ "hashes": {
1144
+ "hash_examples": "845caf093ac2b58c",
1145
+ "hash_full_prompts": "196a0f8712857624",
1146
+ "hash_input_tokens": "3e738df24b7eddf8",
1147
+ "hash_cont_tokens": "ffbf705ca9254165"
1148
+ },
1149
+ "truncated": 250,
1150
+ "non_truncated": 0,
1151
+ "padded": 0,
1152
+ "non_padded": 250,
1153
+ "effective_few_shots": 3.0,
1154
+ "num_truncated_few_shots": 0
1155
+ },
1156
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
1157
+ "hashes": {
1158
+ "hash_examples": "9004f14d5a32b9a8",
1159
+ "hash_full_prompts": "592a03f0518f17b6",
1160
+ "hash_input_tokens": "19e0ef1dd5ae9d33",
1161
+ "hash_cont_tokens": "feef4dd2052babcc"
1162
+ },
1163
+ "truncated": 250,
1164
+ "non_truncated": 0,
1165
+ "padded": 0,
1166
+ "non_padded": 250,
1167
+ "effective_few_shots": 3.0,
1168
+ "num_truncated_few_shots": 0
1169
+ }
1170
+ },
1171
+ "summary_general": {
1172
+ "hashes": {
1173
+ "hash_examples": "4ff1e3dc5703575d",
1174
+ "hash_full_prompts": "0d80ce968d89d4ef",
1175
+ "hash_input_tokens": "72bda1e7aeb34786",
1176
+ "hash_cont_tokens": "67741ca54caa3be8"
1177
+ },
1178
+ "truncated": 4362,
1179
+ "non_truncated": 0,
1180
+ "padded": 0,
1181
+ "non_padded": 4362,
1182
+ "num_truncated_few_shots": 0
1183
+ }
1184
+ }