lewtun HF staff commited on
Commit
1956f9c
·
verified ·
1 Parent(s): 406cb35

Upload eval_results/mistralai/Mixtral-8x7B-Instruct-v0.1/main/bbh/results_2024-03-18T20-58-12.014656.json with huggingface_hub

Browse files
eval_results/mistralai/Mixtral-8x7B-Instruct-v0.1/main/bbh/results_2024-03-18T20-58-12.014656.json ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2022781.083108455,
9
+ "end_time": 2025078.696342136,
10
+ "total_evaluation_time_secondes": "2297.6132336810697",
11
+ "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
12
+ "model_sha": "1e637f2d7cb0a9d6fb1922f305cb784995190a83",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "87.49 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "harness|bbh:causal_judgment|3": {
19
+ "em": 0.26737967914438504,
20
+ "em_stderr": 0.032452428900352305,
21
+ "qem": 0.2994652406417112,
22
+ "qem_stderr": 0.033583935154759645,
23
+ "pem": 0.6470588235294118,
24
+ "pem_stderr": 0.03504019983419237,
25
+ "pqem": 0.6470588235294118,
26
+ "pqem_stderr": 0.03504019983419237,
27
+ "perfect_em": 0.26737967914438504,
28
+ "perfect_em_stderr": 0.032452428900352305
29
+ },
30
+ "harness|bbh:date_understanding|3": {
31
+ "em": 0.0,
32
+ "em_stderr": 0.0,
33
+ "qem": 0.2,
34
+ "qem_stderr": 0.02534897002097908,
35
+ "pem": 0.216,
36
+ "pem_stderr": 0.02607865766373272,
37
+ "pqem": 0.64,
38
+ "pqem_stderr": 0.03041876402517499,
39
+ "perfect_em": 0.0,
40
+ "perfect_em_stderr": 0.0
41
+ },
42
+ "harness|bbh:disambiguation_qa|3": {
43
+ "em": 0.016,
44
+ "em_stderr": 0.00795166118887434,
45
+ "qem": 0.156,
46
+ "qem_stderr": 0.022995023034068755,
47
+ "pem": 0.22,
48
+ "pem_stderr": 0.026251792824605834,
49
+ "pqem": 0.74,
50
+ "pqem_stderr": 0.027797315752644308,
51
+ "perfect_em": 0.016,
52
+ "perfect_em_stderr": 0.00795166118887434
53
+ },
54
+ "harness|bbh:geometric_shapes|3": {
55
+ "em": 0.0,
56
+ "em_stderr": 0.0,
57
+ "qem": 0.308,
58
+ "qem_stderr": 0.029256928606501864,
59
+ "pem": 0.008,
60
+ "pem_stderr": 0.005645483676690174,
61
+ "pqem": 0.368,
62
+ "pqem_stderr": 0.03056207062099316,
63
+ "perfect_em": 0.0,
64
+ "perfect_em_stderr": 0.0
65
+ },
66
+ "harness|bbh:logical_deduction_five_objects|3": {
67
+ "em": 0.02,
68
+ "em_stderr": 0.008872139507342683,
69
+ "qem": 0.284,
70
+ "qem_stderr": 0.02857695873043741,
71
+ "pem": 0.056,
72
+ "pem_stderr": 0.014570697336899599,
73
+ "pqem": 0.448,
74
+ "pqem_stderr": 0.03151438761115355,
75
+ "perfect_em": 0.02,
76
+ "perfect_em_stderr": 0.008872139507342683
77
+ },
78
+ "harness|bbh:logical_deduction_seven_objects|3": {
79
+ "em": 0.0,
80
+ "em_stderr": 0.0,
81
+ "qem": 0.304,
82
+ "qem_stderr": 0.029150213374159677,
83
+ "pem": 0.0,
84
+ "pem_stderr": 0.0,
85
+ "pqem": 0.452,
86
+ "pqem_stderr": 0.03153986449255662,
87
+ "perfect_em": 0.0,
88
+ "perfect_em_stderr": 0.0
89
+ },
90
+ "harness|bbh:logical_deduction_three_objects|3": {
91
+ "em": 0.036,
92
+ "em_stderr": 0.011805655169278133,
93
+ "qem": 0.428,
94
+ "qem_stderr": 0.031355968923772626,
95
+ "pem": 0.072,
96
+ "pem_stderr": 0.016381005750490108,
97
+ "pqem": 0.632,
98
+ "pqem_stderr": 0.030562070620993163,
99
+ "perfect_em": 0.036,
100
+ "perfect_em_stderr": 0.011805655169278133
101
+ },
102
+ "harness|bbh:movie_recommendation|3": {
103
+ "em": 0.0,
104
+ "em_stderr": 0.0,
105
+ "qem": 0.4779116465863454,
106
+ "qem_stderr": 0.03171903523348456,
107
+ "pem": 0.14457831325301204,
108
+ "pem_stderr": 0.022331395571821913,
109
+ "pqem": 0.7751004016064257,
110
+ "pqem_stderr": 0.02651230458673727,
111
+ "perfect_em": 0.0,
112
+ "perfect_em_stderr": 0.0
113
+ },
114
+ "harness|bbh:navigate|3": {
115
+ "em": 0.224,
116
+ "em_stderr": 0.02642136168734791,
117
+ "qem": 0.224,
118
+ "qem_stderr": 0.02642136168734791,
119
+ "pem": 0.648,
120
+ "pem_stderr": 0.030266288057359925,
121
+ "pqem": 0.648,
122
+ "pqem_stderr": 0.030266288057359925,
123
+ "perfect_em": 0.224,
124
+ "perfect_em_stderr": 0.02642136168734791
125
+ },
126
+ "harness|bbh:reasoning_about_colored_objects|3": {
127
+ "em": 0.18,
128
+ "em_stderr": 0.024346890650293548,
129
+ "qem": 0.5,
130
+ "qem_stderr": 0.031686212526223896,
131
+ "pem": 0.232,
132
+ "pem_stderr": 0.02675007037486516,
133
+ "pqem": 0.62,
134
+ "pqem_stderr": 0.030760116042626046,
135
+ "perfect_em": 0.18,
136
+ "perfect_em_stderr": 0.024346890650293548
137
+ },
138
+ "harness|bbh:ruin_names|3": {
139
+ "em": 0.0,
140
+ "em_stderr": 0.0,
141
+ "qem": 0.4475806451612903,
142
+ "qem_stderr": 0.03163891746142309,
143
+ "pem": 0.016129032258064516,
144
+ "pem_stderr": 0.008015391715832133,
145
+ "pqem": 0.6048387096774194,
146
+ "pqem_stderr": 0.03110702726972493,
147
+ "perfect_em": 0.0,
148
+ "perfect_em_stderr": 0.0
149
+ },
150
+ "harness|bbh:salient_translation_error_detection|3": {
151
+ "em": 0.0,
152
+ "em_stderr": 0.0,
153
+ "qem": 0.328,
154
+ "qem_stderr": 0.02975239182447538,
155
+ "pem": 0.012,
156
+ "pem_stderr": 0.0069003230236943,
157
+ "pqem": 0.564,
158
+ "pqem_stderr": 0.03142556706028129,
159
+ "perfect_em": 0.0,
160
+ "perfect_em_stderr": 0.0
161
+ },
162
+ "harness|bbh:snarks|3": {
163
+ "em": 0.0,
164
+ "em_stderr": 0.0,
165
+ "qem": 0.6348314606741573,
166
+ "qem_stderr": 0.03619005678691266,
167
+ "pem": 0.0,
168
+ "pem_stderr": 0.0,
169
+ "pqem": 0.7584269662921348,
170
+ "pqem_stderr": 0.0321732161383325,
171
+ "perfect_em": 0.0,
172
+ "perfect_em_stderr": 0.0
173
+ },
174
+ "harness|bbh:sports_understanding|3": {
175
+ "em": 0.216,
176
+ "em_stderr": 0.026078657663732727,
177
+ "qem": 0.22,
178
+ "qem_stderr": 0.02625179282460584,
179
+ "pem": 0.72,
180
+ "pem_stderr": 0.02845414827783232,
181
+ "pqem": 0.72,
182
+ "pqem_stderr": 0.02845414827783232,
183
+ "perfect_em": 0.216,
184
+ "perfect_em_stderr": 0.026078657663732727
185
+ },
186
+ "harness|bbh:temporal_sequences|3": {
187
+ "em": 0.016,
188
+ "em_stderr": 0.007951661188874328,
189
+ "qem": 0.36,
190
+ "qem_stderr": 0.030418764025174974,
191
+ "pem": 0.116,
192
+ "pem_stderr": 0.02029342980308387,
193
+ "pqem": 0.716,
194
+ "pqem_stderr": 0.028576958730437398,
195
+ "perfect_em": 0.016,
196
+ "perfect_em_stderr": 0.007951661188874328
197
+ },
198
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
199
+ "em": 0.044,
200
+ "em_stderr": 0.012997373846574964,
201
+ "qem": 0.148,
202
+ "qem_stderr": 0.022503547243806144,
203
+ "pem": 0.08,
204
+ "pem_stderr": 0.017192507941462983,
205
+ "pqem": 0.356,
206
+ "pqem_stderr": 0.03034368065715322,
207
+ "perfect_em": 0.044,
208
+ "perfect_em_stderr": 0.012997373846574964
209
+ },
210
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
211
+ "em": 0.08,
212
+ "em_stderr": 0.01719250794146297,
213
+ "qem": 0.144,
214
+ "qem_stderr": 0.022249407735450203,
215
+ "pem": 0.144,
216
+ "pem_stderr": 0.022249407735450207,
217
+ "pqem": 0.34,
218
+ "pqem_stderr": 0.030020073605457907,
219
+ "perfect_em": 0.08,
220
+ "perfect_em_stderr": 0.01719250794146297
221
+ },
222
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
223
+ "em": 0.088,
224
+ "em_stderr": 0.017953084777052892,
225
+ "qem": 0.208,
226
+ "qem_stderr": 0.02572139890141639,
227
+ "pem": 0.184,
228
+ "pem_stderr": 0.02455581299422256,
229
+ "pqem": 0.556,
230
+ "pqem_stderr": 0.031486849425545735,
231
+ "perfect_em": 0.088,
232
+ "perfect_em_stderr": 0.017953084777052892
233
+ },
234
+ "harness|bbh:_average|3": {
235
+ "em": 0.06596553773024362,
236
+ "em_stderr": 0.010779079028954822,
237
+ "qem": 0.31509938850352803,
238
+ "qem_stderr": 0.028601160227500002,
239
+ "pem": 0.1953203427244716,
240
+ "pem_stderr": 0.018387589587902008,
241
+ "pqem": 0.5880791611725217,
242
+ "pqem_stderr": 0.03047560571162204,
243
+ "perfect_em": 0.06596553773024362,
244
+ "perfect_em_stderr": 0.010779079028954822
245
+ }
246
+ },
247
+ "versions": {
248
+ "harness|bbh:causal_judgment|3": 0,
249
+ "harness|bbh:date_understanding|3": 0,
250
+ "harness|bbh:disambiguation_qa|3": 0,
251
+ "harness|bbh:geometric_shapes|3": 0,
252
+ "harness|bbh:logical_deduction_five_objects|3": 0,
253
+ "harness|bbh:logical_deduction_seven_objects|3": 0,
254
+ "harness|bbh:logical_deduction_three_objects|3": 0,
255
+ "harness|bbh:movie_recommendation|3": 0,
256
+ "harness|bbh:navigate|3": 0,
257
+ "harness|bbh:reasoning_about_colored_objects|3": 0,
258
+ "harness|bbh:ruin_names|3": 0,
259
+ "harness|bbh:salient_translation_error_detection|3": 0,
260
+ "harness|bbh:snarks|3": 0,
261
+ "harness|bbh:sports_understanding|3": 0,
262
+ "harness|bbh:temporal_sequences|3": 0,
263
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
264
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
265
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": 0
266
+ },
267
+ "config_tasks": {
268
+ "harness|bbh:causal_judgment": {
269
+ "name": "bbh:causal_judgment",
270
+ "prompt_function": "bbh_causal_judgment",
271
+ "hf_repo": "lukaemon/bbh",
272
+ "hf_subset": "causal_judgement",
273
+ "metric": [
274
+ "exact_match",
275
+ "quasi_exact_match",
276
+ "prefix_exact_match",
277
+ "prefix_quasi_exact_match",
278
+ "perfect_exact_match"
279
+ ],
280
+ "hf_avail_splits": [
281
+ "test"
282
+ ],
283
+ "evaluation_splits": [
284
+ "test"
285
+ ],
286
+ "few_shots_split": null,
287
+ "few_shots_select": null,
288
+ "generation_size": 20,
289
+ "stop_sequence": [
290
+ "</s>",
291
+ "Q:",
292
+ "\n\n"
293
+ ],
294
+ "output_regex": null,
295
+ "frozen": false,
296
+ "suite": [
297
+ "harness"
298
+ ],
299
+ "original_num_docs": 187,
300
+ "effective_num_docs": 187,
301
+ "trust_dataset": true,
302
+ "must_remove_duplicate_docs": null
303
+ },
304
+ "harness|bbh:date_understanding": {
305
+ "name": "bbh:date_understanding",
306
+ "prompt_function": "bbh_date_understanding",
307
+ "hf_repo": "lukaemon/bbh",
308
+ "hf_subset": "date_understanding",
309
+ "metric": [
310
+ "exact_match",
311
+ "quasi_exact_match",
312
+ "prefix_exact_match",
313
+ "prefix_quasi_exact_match",
314
+ "perfect_exact_match"
315
+ ],
316
+ "hf_avail_splits": [
317
+ "test"
318
+ ],
319
+ "evaluation_splits": [
320
+ "test"
321
+ ],
322
+ "few_shots_split": null,
323
+ "few_shots_select": null,
324
+ "generation_size": 20,
325
+ "stop_sequence": [
326
+ "</s>",
327
+ "Q:",
328
+ "\n\n"
329
+ ],
330
+ "output_regex": null,
331
+ "frozen": false,
332
+ "suite": [
333
+ "harness"
334
+ ],
335
+ "original_num_docs": 250,
336
+ "effective_num_docs": 250,
337
+ "trust_dataset": true,
338
+ "must_remove_duplicate_docs": null
339
+ },
340
+ "harness|bbh:disambiguation_qa": {
341
+ "name": "bbh:disambiguation_qa",
342
+ "prompt_function": "bbh_disambiguation_qa",
343
+ "hf_repo": "lukaemon/bbh",
344
+ "hf_subset": "disambiguation_qa",
345
+ "metric": [
346
+ "exact_match",
347
+ "quasi_exact_match",
348
+ "prefix_exact_match",
349
+ "prefix_quasi_exact_match",
350
+ "perfect_exact_match"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "test"
354
+ ],
355
+ "evaluation_splits": [
356
+ "test"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": 20,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "harness"
370
+ ],
371
+ "original_num_docs": 250,
372
+ "effective_num_docs": 250,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "harness|bbh:geometric_shapes": {
377
+ "name": "bbh:geometric_shapes",
378
+ "prompt_function": "bbh_geometric_shapes",
379
+ "hf_repo": "lukaemon/bbh",
380
+ "hf_subset": "geometric_shapes",
381
+ "metric": [
382
+ "exact_match",
383
+ "quasi_exact_match",
384
+ "prefix_exact_match",
385
+ "prefix_quasi_exact_match",
386
+ "perfect_exact_match"
387
+ ],
388
+ "hf_avail_splits": [
389
+ "test"
390
+ ],
391
+ "evaluation_splits": [
392
+ "test"
393
+ ],
394
+ "few_shots_split": null,
395
+ "few_shots_select": null,
396
+ "generation_size": 20,
397
+ "stop_sequence": [
398
+ "</s>",
399
+ "Q:",
400
+ "\n\n"
401
+ ],
402
+ "output_regex": null,
403
+ "frozen": false,
404
+ "suite": [
405
+ "harness"
406
+ ],
407
+ "original_num_docs": 250,
408
+ "effective_num_docs": 250,
409
+ "trust_dataset": true,
410
+ "must_remove_duplicate_docs": null
411
+ },
412
+ "harness|bbh:logical_deduction_five_objects": {
413
+ "name": "bbh:logical_deduction_five_objects",
414
+ "prompt_function": "bbh_logical_deduction_five_objects",
415
+ "hf_repo": "lukaemon/bbh",
416
+ "hf_subset": "logical_deduction_five_objects",
417
+ "metric": [
418
+ "exact_match",
419
+ "quasi_exact_match",
420
+ "prefix_exact_match",
421
+ "prefix_quasi_exact_match",
422
+ "perfect_exact_match"
423
+ ],
424
+ "hf_avail_splits": [
425
+ "test"
426
+ ],
427
+ "evaluation_splits": [
428
+ "test"
429
+ ],
430
+ "few_shots_split": null,
431
+ "few_shots_select": null,
432
+ "generation_size": 20,
433
+ "stop_sequence": [
434
+ "</s>",
435
+ "Q:",
436
+ "\n\n"
437
+ ],
438
+ "output_regex": null,
439
+ "frozen": false,
440
+ "suite": [
441
+ "harness"
442
+ ],
443
+ "original_num_docs": 250,
444
+ "effective_num_docs": 250,
445
+ "trust_dataset": true,
446
+ "must_remove_duplicate_docs": null
447
+ },
448
+ "harness|bbh:logical_deduction_seven_objects": {
449
+ "name": "bbh:logical_deduction_seven_objects",
450
+ "prompt_function": "bbh_logical_deduction_seven_objects",
451
+ "hf_repo": "lukaemon/bbh",
452
+ "hf_subset": "logical_deduction_seven_objects",
453
+ "metric": [
454
+ "exact_match",
455
+ "quasi_exact_match",
456
+ "prefix_exact_match",
457
+ "prefix_quasi_exact_match",
458
+ "perfect_exact_match"
459
+ ],
460
+ "hf_avail_splits": [
461
+ "test"
462
+ ],
463
+ "evaluation_splits": [
464
+ "test"
465
+ ],
466
+ "few_shots_split": null,
467
+ "few_shots_select": null,
468
+ "generation_size": 20,
469
+ "stop_sequence": [
470
+ "</s>",
471
+ "Q:",
472
+ "\n\n"
473
+ ],
474
+ "output_regex": null,
475
+ "frozen": false,
476
+ "suite": [
477
+ "harness"
478
+ ],
479
+ "original_num_docs": 250,
480
+ "effective_num_docs": 250,
481
+ "trust_dataset": true,
482
+ "must_remove_duplicate_docs": null
483
+ },
484
+ "harness|bbh:logical_deduction_three_objects": {
485
+ "name": "bbh:logical_deduction_three_objects",
486
+ "prompt_function": "bbh_logical_deduction_three_objects",
487
+ "hf_repo": "lukaemon/bbh",
488
+ "hf_subset": "logical_deduction_three_objects",
489
+ "metric": [
490
+ "exact_match",
491
+ "quasi_exact_match",
492
+ "prefix_exact_match",
493
+ "prefix_quasi_exact_match",
494
+ "perfect_exact_match"
495
+ ],
496
+ "hf_avail_splits": [
497
+ "test"
498
+ ],
499
+ "evaluation_splits": [
500
+ "test"
501
+ ],
502
+ "few_shots_split": null,
503
+ "few_shots_select": null,
504
+ "generation_size": 20,
505
+ "stop_sequence": [
506
+ "</s>",
507
+ "Q:",
508
+ "\n\n"
509
+ ],
510
+ "output_regex": null,
511
+ "frozen": false,
512
+ "suite": [
513
+ "harness"
514
+ ],
515
+ "original_num_docs": 250,
516
+ "effective_num_docs": 250,
517
+ "trust_dataset": true,
518
+ "must_remove_duplicate_docs": null
519
+ },
520
+ "harness|bbh:movie_recommendation": {
521
+ "name": "bbh:movie_recommendation",
522
+ "prompt_function": "bbh_movie_recommendation",
523
+ "hf_repo": "lukaemon/bbh",
524
+ "hf_subset": "movie_recommendation",
525
+ "metric": [
526
+ "exact_match",
527
+ "quasi_exact_match",
528
+ "prefix_exact_match",
529
+ "prefix_quasi_exact_match",
530
+ "perfect_exact_match"
531
+ ],
532
+ "hf_avail_splits": [
533
+ "test"
534
+ ],
535
+ "evaluation_splits": [
536
+ "test"
537
+ ],
538
+ "few_shots_split": null,
539
+ "few_shots_select": null,
540
+ "generation_size": 20,
541
+ "stop_sequence": [
542
+ "</s>",
543
+ "Q:",
544
+ "\n\n"
545
+ ],
546
+ "output_regex": null,
547
+ "frozen": false,
548
+ "suite": [
549
+ "harness"
550
+ ],
551
+ "original_num_docs": 249,
552
+ "effective_num_docs": 249,
553
+ "trust_dataset": true,
554
+ "must_remove_duplicate_docs": null
555
+ },
556
+ "harness|bbh:navigate": {
557
+ "name": "bbh:navigate",
558
+ "prompt_function": "bbh_navigate",
559
+ "hf_repo": "lukaemon/bbh",
560
+ "hf_subset": "navigate",
561
+ "metric": [
562
+ "exact_match",
563
+ "quasi_exact_match",
564
+ "prefix_exact_match",
565
+ "prefix_quasi_exact_match",
566
+ "perfect_exact_match"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": null,
576
+ "generation_size": 20,
577
+ "stop_sequence": [
578
+ "</s>",
579
+ "Q:",
580
+ "\n\n"
581
+ ],
582
+ "output_regex": null,
583
+ "frozen": false,
584
+ "suite": [
585
+ "harness"
586
+ ],
587
+ "original_num_docs": 250,
588
+ "effective_num_docs": 250,
589
+ "trust_dataset": true,
590
+ "must_remove_duplicate_docs": null
591
+ },
592
+ "harness|bbh:reasoning_about_colored_objects": {
593
+ "name": "bbh:reasoning_about_colored_objects",
594
+ "prompt_function": "bbh_reasoning_about_colored_objects",
595
+ "hf_repo": "lukaemon/bbh",
596
+ "hf_subset": "reasoning_about_colored_objects",
597
+ "metric": [
598
+ "exact_match",
599
+ "quasi_exact_match",
600
+ "prefix_exact_match",
601
+ "prefix_quasi_exact_match",
602
+ "perfect_exact_match"
603
+ ],
604
+ "hf_avail_splits": [
605
+ "test"
606
+ ],
607
+ "evaluation_splits": [
608
+ "test"
609
+ ],
610
+ "few_shots_split": null,
611
+ "few_shots_select": null,
612
+ "generation_size": 20,
613
+ "stop_sequence": [
614
+ "</s>",
615
+ "Q:",
616
+ "\n\n"
617
+ ],
618
+ "output_regex": null,
619
+ "frozen": false,
620
+ "suite": [
621
+ "harness"
622
+ ],
623
+ "original_num_docs": 250,
624
+ "effective_num_docs": 250,
625
+ "trust_dataset": true,
626
+ "must_remove_duplicate_docs": null
627
+ },
628
+ "harness|bbh:ruin_names": {
629
+ "name": "bbh:ruin_names",
630
+ "prompt_function": "bbh_ruin_names",
631
+ "hf_repo": "lukaemon/bbh",
632
+ "hf_subset": "ruin_names",
633
+ "metric": [
634
+ "exact_match",
635
+ "quasi_exact_match",
636
+ "prefix_exact_match",
637
+ "prefix_quasi_exact_match",
638
+ "perfect_exact_match"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "test"
642
+ ],
643
+ "evaluation_splits": [
644
+ "test"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": 20,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "harness"
658
+ ],
659
+ "original_num_docs": 248,
660
+ "effective_num_docs": 248,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "harness|bbh:salient_translation_error_detection": {
665
+ "name": "bbh:salient_translation_error_detection",
666
+ "prompt_function": "bbh_salient_translation_error_detection",
667
+ "hf_repo": "lukaemon/bbh",
668
+ "hf_subset": "salient_translation_error_detection",
669
+ "metric": [
670
+ "exact_match",
671
+ "quasi_exact_match",
672
+ "prefix_exact_match",
673
+ "prefix_quasi_exact_match",
674
+ "perfect_exact_match"
675
+ ],
676
+ "hf_avail_splits": [
677
+ "test"
678
+ ],
679
+ "evaluation_splits": [
680
+ "test"
681
+ ],
682
+ "few_shots_split": null,
683
+ "few_shots_select": null,
684
+ "generation_size": 20,
685
+ "stop_sequence": [
686
+ "</s>",
687
+ "Q:",
688
+ "\n\n"
689
+ ],
690
+ "output_regex": null,
691
+ "frozen": false,
692
+ "suite": [
693
+ "harness"
694
+ ],
695
+ "original_num_docs": 250,
696
+ "effective_num_docs": 250,
697
+ "trust_dataset": true,
698
+ "must_remove_duplicate_docs": null
699
+ },
700
+ "harness|bbh:snarks": {
701
+ "name": "bbh:snarks",
702
+ "prompt_function": "bbh_snarks",
703
+ "hf_repo": "lukaemon/bbh",
704
+ "hf_subset": "snarks",
705
+ "metric": [
706
+ "exact_match",
707
+ "quasi_exact_match",
708
+ "prefix_exact_match",
709
+ "prefix_quasi_exact_match",
710
+ "perfect_exact_match"
711
+ ],
712
+ "hf_avail_splits": [
713
+ "test"
714
+ ],
715
+ "evaluation_splits": [
716
+ "test"
717
+ ],
718
+ "few_shots_split": null,
719
+ "few_shots_select": null,
720
+ "generation_size": 20,
721
+ "stop_sequence": [
722
+ "</s>",
723
+ "Q:",
724
+ "\n\n"
725
+ ],
726
+ "output_regex": null,
727
+ "frozen": false,
728
+ "suite": [
729
+ "harness"
730
+ ],
731
+ "original_num_docs": 178,
732
+ "effective_num_docs": 178,
733
+ "trust_dataset": true,
734
+ "must_remove_duplicate_docs": null
735
+ },
736
+ "harness|bbh:sports_understanding": {
737
+ "name": "bbh:sports_understanding",
738
+ "prompt_function": "bbh_sports_understanding",
739
+ "hf_repo": "lukaemon/bbh",
740
+ "hf_subset": "sports_understanding",
741
+ "metric": [
742
+ "exact_match",
743
+ "quasi_exact_match",
744
+ "prefix_exact_match",
745
+ "prefix_quasi_exact_match",
746
+ "perfect_exact_match"
747
+ ],
748
+ "hf_avail_splits": [
749
+ "test"
750
+ ],
751
+ "evaluation_splits": [
752
+ "test"
753
+ ],
754
+ "few_shots_split": null,
755
+ "few_shots_select": null,
756
+ "generation_size": 20,
757
+ "stop_sequence": [
758
+ "</s>",
759
+ "Q:",
760
+ "\n\n"
761
+ ],
762
+ "output_regex": null,
763
+ "frozen": false,
764
+ "suite": [
765
+ "harness"
766
+ ],
767
+ "original_num_docs": 250,
768
+ "effective_num_docs": 250,
769
+ "trust_dataset": true,
770
+ "must_remove_duplicate_docs": null
771
+ },
772
+ "harness|bbh:temporal_sequences": {
773
+ "name": "bbh:temporal_sequences",
774
+ "prompt_function": "bbh_temporal_sequences",
775
+ "hf_repo": "lukaemon/bbh",
776
+ "hf_subset": "temporal_sequences",
777
+ "metric": [
778
+ "exact_match",
779
+ "quasi_exact_match",
780
+ "prefix_exact_match",
781
+ "prefix_quasi_exact_match",
782
+ "perfect_exact_match"
783
+ ],
784
+ "hf_avail_splits": [
785
+ "test"
786
+ ],
787
+ "evaluation_splits": [
788
+ "test"
789
+ ],
790
+ "few_shots_split": null,
791
+ "few_shots_select": null,
792
+ "generation_size": 20,
793
+ "stop_sequence": [
794
+ "</s>",
795
+ "Q:",
796
+ "\n\n"
797
+ ],
798
+ "output_regex": null,
799
+ "frozen": false,
800
+ "suite": [
801
+ "harness"
802
+ ],
803
+ "original_num_docs": 250,
804
+ "effective_num_docs": 250,
805
+ "trust_dataset": true,
806
+ "must_remove_duplicate_docs": null
807
+ },
808
+ "harness|bbh:tracking_shuffled_objects_five_objects": {
809
+ "name": "bbh:tracking_shuffled_objects_five_objects",
810
+ "prompt_function": "bbh_tracking_shuffled_objects_five_objects",
811
+ "hf_repo": "lukaemon/bbh",
812
+ "hf_subset": "tracking_shuffled_objects_five_objects",
813
+ "metric": [
814
+ "exact_match",
815
+ "quasi_exact_match",
816
+ "prefix_exact_match",
817
+ "prefix_quasi_exact_match",
818
+ "perfect_exact_match"
819
+ ],
820
+ "hf_avail_splits": [
821
+ "test"
822
+ ],
823
+ "evaluation_splits": [
824
+ "test"
825
+ ],
826
+ "few_shots_split": null,
827
+ "few_shots_select": null,
828
+ "generation_size": 20,
829
+ "stop_sequence": [
830
+ "</s>",
831
+ "Q:",
832
+ "\n\n"
833
+ ],
834
+ "output_regex": null,
835
+ "frozen": false,
836
+ "suite": [
837
+ "harness"
838
+ ],
839
+ "original_num_docs": 250,
840
+ "effective_num_docs": 250,
841
+ "trust_dataset": true,
842
+ "must_remove_duplicate_docs": null
843
+ },
844
+ "harness|bbh:tracking_shuffled_objects_seven_objects": {
845
+ "name": "bbh:tracking_shuffled_objects_seven_objects",
846
+ "prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
847
+ "hf_repo": "lukaemon/bbh",
848
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
849
+ "metric": [
850
+ "exact_match",
851
+ "quasi_exact_match",
852
+ "prefix_exact_match",
853
+ "prefix_quasi_exact_match",
854
+ "perfect_exact_match"
855
+ ],
856
+ "hf_avail_splits": [
857
+ "test"
858
+ ],
859
+ "evaluation_splits": [
860
+ "test"
861
+ ],
862
+ "few_shots_split": null,
863
+ "few_shots_select": null,
864
+ "generation_size": 20,
865
+ "stop_sequence": [
866
+ "</s>",
867
+ "Q:",
868
+ "\n\n"
869
+ ],
870
+ "output_regex": null,
871
+ "frozen": false,
872
+ "suite": [
873
+ "harness"
874
+ ],
875
+ "original_num_docs": 250,
876
+ "effective_num_docs": 250,
877
+ "trust_dataset": true,
878
+ "must_remove_duplicate_docs": null
879
+ },
880
+ "harness|bbh:tracking_shuffled_objects_three_objects": {
881
+ "name": "bbh:tracking_shuffled_objects_three_objects",
882
+ "prompt_function": "bbh_tracking_shuffled_objects_three_objects",
883
+ "hf_repo": "lukaemon/bbh",
884
+ "hf_subset": "tracking_shuffled_objects_three_objects",
885
+ "metric": [
886
+ "exact_match",
887
+ "quasi_exact_match",
888
+ "prefix_exact_match",
889
+ "prefix_quasi_exact_match",
890
+ "perfect_exact_match"
891
+ ],
892
+ "hf_avail_splits": [
893
+ "test"
894
+ ],
895
+ "evaluation_splits": [
896
+ "test"
897
+ ],
898
+ "few_shots_split": null,
899
+ "few_shots_select": null,
900
+ "generation_size": 20,
901
+ "stop_sequence": [
902
+ "</s>",
903
+ "Q:",
904
+ "\n\n"
905
+ ],
906
+ "output_regex": null,
907
+ "frozen": false,
908
+ "suite": [
909
+ "harness"
910
+ ],
911
+ "original_num_docs": 250,
912
+ "effective_num_docs": 250,
913
+ "trust_dataset": true,
914
+ "must_remove_duplicate_docs": null
915
+ }
916
+ },
917
+ "summary_tasks": {
918
+ "harness|bbh:causal_judgment|3": {
919
+ "hashes": {
920
+ "hash_examples": "63218f5ae055ab2b",
921
+ "hash_full_prompts": "c3f54a56ef8b0c84",
922
+ "hash_input_tokens": "7e5e423a80ecc1ce",
923
+ "hash_cont_tokens": "ea7329191884f89a"
924
+ },
925
+ "truncated": 187,
926
+ "non_truncated": 0,
927
+ "padded": 0,
928
+ "non_padded": 187,
929
+ "effective_few_shots": 3.0,
930
+ "num_truncated_few_shots": 0
931
+ },
932
+ "harness|bbh:date_understanding|3": {
933
+ "hashes": {
934
+ "hash_examples": "f145c7a06def3c8e",
935
+ "hash_full_prompts": "c70878aae99812bd",
936
+ "hash_input_tokens": "6e0f2d719a606df2",
937
+ "hash_cont_tokens": "168747abf2ed1172"
938
+ },
939
+ "truncated": 250,
940
+ "non_truncated": 0,
941
+ "padded": 0,
942
+ "non_padded": 250,
943
+ "effective_few_shots": 3.0,
944
+ "num_truncated_few_shots": 0
945
+ },
946
+ "harness|bbh:disambiguation_qa|3": {
947
+ "hashes": {
948
+ "hash_examples": "19677fd1773f7eb9",
949
+ "hash_full_prompts": "27580dbe8f42357e",
950
+ "hash_input_tokens": "28c4ba12a22d1cd2",
951
+ "hash_cont_tokens": "ea97fd2b4e8339e1"
952
+ },
953
+ "truncated": 250,
954
+ "non_truncated": 0,
955
+ "padded": 0,
956
+ "non_padded": 250,
957
+ "effective_few_shots": 3.0,
958
+ "num_truncated_few_shots": 0
959
+ },
960
+ "harness|bbh:geometric_shapes|3": {
961
+ "hashes": {
962
+ "hash_examples": "76c7b11a13cc72a9",
963
+ "hash_full_prompts": "c1efb1382b3d5f26",
964
+ "hash_input_tokens": "f6262983a77b3c15",
965
+ "hash_cont_tokens": "27b384bb46462abe"
966
+ },
967
+ "truncated": 250,
968
+ "non_truncated": 0,
969
+ "padded": 0,
970
+ "non_padded": 250,
971
+ "effective_few_shots": 3.0,
972
+ "num_truncated_few_shots": 0
973
+ },
974
+ "harness|bbh:logical_deduction_five_objects|3": {
975
+ "hashes": {
976
+ "hash_examples": "0e958c856332a745",
977
+ "hash_full_prompts": "841180f101f6ed2f",
978
+ "hash_input_tokens": "4ae5c84ac47f9c00",
979
+ "hash_cont_tokens": "7ee9208c42f0f391"
980
+ },
981
+ "truncated": 250,
982
+ "non_truncated": 0,
983
+ "padded": 0,
984
+ "non_padded": 250,
985
+ "effective_few_shots": 3.0,
986
+ "num_truncated_few_shots": 0
987
+ },
988
+ "harness|bbh:logical_deduction_seven_objects|3": {
989
+ "hashes": {
990
+ "hash_examples": "ab9de25a5eb40d09",
991
+ "hash_full_prompts": "d4ea65028e06515a",
992
+ "hash_input_tokens": "d12c34ec49852722",
993
+ "hash_cont_tokens": "3ea6682edd9ab88c"
994
+ },
995
+ "truncated": 250,
996
+ "non_truncated": 0,
997
+ "padded": 0,
998
+ "non_padded": 250,
999
+ "effective_few_shots": 3.0,
1000
+ "num_truncated_few_shots": 0
1001
+ },
1002
+ "harness|bbh:logical_deduction_three_objects|3": {
1003
+ "hashes": {
1004
+ "hash_examples": "3c6bf52517714218",
1005
+ "hash_full_prompts": "e45ba419a02569cb",
1006
+ "hash_input_tokens": "539d01f4b2e6a62e",
1007
+ "hash_cont_tokens": "20fdf72b7bdf66c8"
1008
+ },
1009
+ "truncated": 250,
1010
+ "non_truncated": 0,
1011
+ "padded": 0,
1012
+ "non_padded": 250,
1013
+ "effective_few_shots": 3.0,
1014
+ "num_truncated_few_shots": 0
1015
+ },
1016
+ "harness|bbh:movie_recommendation|3": {
1017
+ "hashes": {
1018
+ "hash_examples": "2d9dc4975935d31a",
1019
+ "hash_full_prompts": "e7f9f6227ef6d091",
1020
+ "hash_input_tokens": "ccf15e0353520262",
1021
+ "hash_cont_tokens": "2c887402efd4ba5c"
1022
+ },
1023
+ "truncated": 249,
1024
+ "non_truncated": 0,
1025
+ "padded": 0,
1026
+ "non_padded": 249,
1027
+ "effective_few_shots": 3.0,
1028
+ "num_truncated_few_shots": 0
1029
+ },
1030
+ "harness|bbh:navigate|3": {
1031
+ "hashes": {
1032
+ "hash_examples": "ba91dcdb9a064255",
1033
+ "hash_full_prompts": "363cbc26d2694856",
1034
+ "hash_input_tokens": "6f26d91169f0f9e7",
1035
+ "hash_cont_tokens": "f534d356569141cc"
1036
+ },
1037
+ "truncated": 250,
1038
+ "non_truncated": 0,
1039
+ "padded": 0,
1040
+ "non_padded": 250,
1041
+ "effective_few_shots": 3.0,
1042
+ "num_truncated_few_shots": 0
1043
+ },
1044
+ "harness|bbh:reasoning_about_colored_objects|3": {
1045
+ "hashes": {
1046
+ "hash_examples": "a6ba328c4c3385d2",
1047
+ "hash_full_prompts": "e2c5ea75faa663be",
1048
+ "hash_input_tokens": "9bc9fca7f0afa719",
1049
+ "hash_cont_tokens": "6e10e99577f43f65"
1050
+ },
1051
+ "truncated": 250,
1052
+ "non_truncated": 0,
1053
+ "padded": 0,
1054
+ "non_padded": 250,
1055
+ "effective_few_shots": 3.0,
1056
+ "num_truncated_few_shots": 0
1057
+ },
1058
+ "harness|bbh:ruin_names|3": {
1059
+ "hashes": {
1060
+ "hash_examples": "2ef28d5f2d4fdd25",
1061
+ "hash_full_prompts": "39c98a33af277e05",
1062
+ "hash_input_tokens": "8fc43170201dbcbe",
1063
+ "hash_cont_tokens": "13fad13af65299fa"
1064
+ },
1065
+ "truncated": 248,
1066
+ "non_truncated": 0,
1067
+ "padded": 0,
1068
+ "non_padded": 248,
1069
+ "effective_few_shots": 3.0,
1070
+ "num_truncated_few_shots": 0
1071
+ },
1072
+ "harness|bbh:salient_translation_error_detection|3": {
1073
+ "hashes": {
1074
+ "hash_examples": "c13f25ec8ffed496",
1075
+ "hash_full_prompts": "61c199869236d1d0",
1076
+ "hash_input_tokens": "ebfefff744f0c7a3",
1077
+ "hash_cont_tokens": "cc08b8755b81929c"
1078
+ },
1079
+ "truncated": 250,
1080
+ "non_truncated": 0,
1081
+ "padded": 0,
1082
+ "non_padded": 250,
1083
+ "effective_few_shots": 3.0,
1084
+ "num_truncated_few_shots": 0
1085
+ },
1086
+ "harness|bbh:snarks|3": {
1087
+ "hashes": {
1088
+ "hash_examples": "5f6db7bff7f6f22e",
1089
+ "hash_full_prompts": "70b02bc8db0a7a32",
1090
+ "hash_input_tokens": "bae40895da8c14d5",
1091
+ "hash_cont_tokens": "0320bef36bbf541f"
1092
+ },
1093
+ "truncated": 178,
1094
+ "non_truncated": 0,
1095
+ "padded": 0,
1096
+ "non_padded": 178,
1097
+ "effective_few_shots": 3.0,
1098
+ "num_truncated_few_shots": 0
1099
+ },
1100
+ "harness|bbh:sports_understanding|3": {
1101
+ "hashes": {
1102
+ "hash_examples": "042afbe5d9c1f02d",
1103
+ "hash_full_prompts": "6a6bb045bbf84268",
1104
+ "hash_input_tokens": "f8b065aed222d461",
1105
+ "hash_cont_tokens": "5d577240f3fac015"
1106
+ },
1107
+ "truncated": 250,
1108
+ "non_truncated": 0,
1109
+ "padded": 0,
1110
+ "non_padded": 250,
1111
+ "effective_few_shots": 3.0,
1112
+ "num_truncated_few_shots": 0
1113
+ },
1114
+ "harness|bbh:temporal_sequences|3": {
1115
+ "hashes": {
1116
+ "hash_examples": "803a05f352eb6afc",
1117
+ "hash_full_prompts": "95ddd530c1a01713",
1118
+ "hash_input_tokens": "fe324a3e155b8686",
1119
+ "hash_cont_tokens": "e993b67463ec16da"
1120
+ },
1121
+ "truncated": 250,
1122
+ "non_truncated": 0,
1123
+ "padded": 0,
1124
+ "non_padded": 250,
1125
+ "effective_few_shots": 3.0,
1126
+ "num_truncated_few_shots": 0
1127
+ },
1128
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
1129
+ "hashes": {
1130
+ "hash_examples": "2bbac6db7ab0d527",
1131
+ "hash_full_prompts": "343105b81cd88d67",
1132
+ "hash_input_tokens": "3aa53f00d4db548e",
1133
+ "hash_cont_tokens": "9a61bd1425845d55"
1134
+ },
1135
+ "truncated": 250,
1136
+ "non_truncated": 0,
1137
+ "padded": 0,
1138
+ "non_padded": 250,
1139
+ "effective_few_shots": 3.0,
1140
+ "num_truncated_few_shots": 0
1141
+ },
1142
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
1143
+ "hashes": {
1144
+ "hash_examples": "845caf093ac2b58c",
1145
+ "hash_full_prompts": "4b07759d13b7ab32",
1146
+ "hash_input_tokens": "5a5a60741cae444d",
1147
+ "hash_cont_tokens": "f8acae0c3d9c3a40"
1148
+ },
1149
+ "truncated": 250,
1150
+ "non_truncated": 0,
1151
+ "padded": 0,
1152
+ "non_padded": 250,
1153
+ "effective_few_shots": 3.0,
1154
+ "num_truncated_few_shots": 0
1155
+ },
1156
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
1157
+ "hashes": {
1158
+ "hash_examples": "9004f14d5a32b9a8",
1159
+ "hash_full_prompts": "95a2d1f682b8e98c",
1160
+ "hash_input_tokens": "667b9bc0a831893c",
1161
+ "hash_cont_tokens": "13cc9c9f9aa887ea"
1162
+ },
1163
+ "truncated": 250,
1164
+ "non_truncated": 0,
1165
+ "padded": 0,
1166
+ "non_padded": 250,
1167
+ "effective_few_shots": 3.0,
1168
+ "num_truncated_few_shots": 0
1169
+ }
1170
+ },
1171
+ "summary_general": {
1172
+ "hashes": {
1173
+ "hash_examples": "4ff1e3dc5703575d",
1174
+ "hash_full_prompts": "cc928e928ada6335",
1175
+ "hash_input_tokens": "fa03e97b7a0db38d",
1176
+ "hash_cont_tokens": "4e1a33fb6e15f9bc"
1177
+ },
1178
+ "truncated": 4362,
1179
+ "non_truncated": 0,
1180
+ "padded": 0,
1181
+ "non_padded": 4362,
1182
+ "num_truncated_few_shots": 0
1183
+ }
1184
+ }