lewtun (HF staff) committed
Commit 3b64438 · verified · 1 Parent(s): c131383

Upload eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json with huggingface_hub
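The commit message indicates the file was pushed with the huggingface_hub client. A minimal sketch of such an upload, assuming the same local path is used as the path in the repo; the repo_id and repo_type placeholders are assumptions, not part of this commit:

from huggingface_hub import HfApi

api = HfApi()
# Push the local eval results file to the hosting repo (placeholders are hypothetical).
api.upload_file(
    path_or_fileobj="eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json",
    path_in_repo="eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json",
    repo_id="<owner>/<repo>",
    repo_type="dataset",
    commit_message="Upload eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json with huggingface_hub",
)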

eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json ADDED
@@ -0,0 +1,1170 @@
+ {
+   "results": {
+     "bbh_zeroshot": {
+       "exact_match,none": 0.03639993856550453,
+       "exact_match_stderr,none": 0.001672390259526908,
+       "alias": "bbh_zeroshot"
+     },
+     "bbh_zeroshot_boolean_expressions": {
+       "exact_match,none": 0.612,
+       "exact_match_stderr,none": 0.030881038748993922,
+       "alias": " - bbh_zeroshot_boolean_expressions"
+     },
+     "bbh_zeroshot_causal_judgement": {
+       "exact_match,none": 0.36363636363636365,
+       "exact_match_stderr,none": 0.03527198153014411,
+       "alias": " - bbh_zeroshot_causal_judgement"
+     },
+     "bbh_zeroshot_date_understanding": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_date_understanding"
+     },
+     "bbh_zeroshot_disambiguation_qa": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_disambiguation_qa"
+     },
+     "bbh_zeroshot_dyck_languages": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_dyck_languages"
+     },
+     "bbh_zeroshot_formal_fallacies": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_formal_fallacies"
+     },
+     "bbh_zeroshot_geometric_shapes": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_geometric_shapes"
+     },
+     "bbh_zeroshot_hyperbaton": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_hyperbaton"
+     },
+     "bbh_zeroshot_logical_deduction_five_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_logical_deduction_five_objects"
+     },
+     "bbh_zeroshot_logical_deduction_seven_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_logical_deduction_seven_objects"
+     },
+     "bbh_zeroshot_logical_deduction_three_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_logical_deduction_three_objects"
+     },
+     "bbh_zeroshot_movie_recommendation": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_movie_recommendation"
+     },
+     "bbh_zeroshot_multistep_arithmetic_two": {
+       "exact_match,none": 0.004,
+       "exact_match_stderr,none": 0.004000000000000004,
+       "alias": " - bbh_zeroshot_multistep_arithmetic_two"
+     },
+     "bbh_zeroshot_navigate": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_navigate"
+     },
+     "bbh_zeroshot_object_counting": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_object_counting"
+     },
+     "bbh_zeroshot_penguins_in_a_table": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_penguins_in_a_table"
+     },
+     "bbh_zeroshot_reasoning_about_colored_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_reasoning_about_colored_objects"
+     },
+     "bbh_zeroshot_ruin_names": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_ruin_names"
+     },
+     "bbh_zeroshot_salient_translation_error_detection": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_salient_translation_error_detection"
+     },
+     "bbh_zeroshot_snarks": {
+       "exact_match,none": 0.0449438202247191,
+       "exact_match_stderr,none": 0.015572660609707176,
+       "alias": " - bbh_zeroshot_snarks"
+     },
+     "bbh_zeroshot_sports_understanding": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_sports_understanding"
+     },
+     "bbh_zeroshot_temporal_sequences": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_temporal_sequences"
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_tracking_shuffled_objects_five_objects"
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_tracking_shuffled_objects_seven_objects"
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_tracking_shuffled_objects_three_objects"
+     },
+     "bbh_zeroshot_web_of_lies": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_web_of_lies"
+     },
+     "bbh_zeroshot_word_sorting": {
+       "exact_match,none": 0.028,
+       "exact_match_stderr,none": 0.010454721651927287,
+       "alias": " - bbh_zeroshot_word_sorting"
+     }
+   },
+   "groups": {
+     "bbh_zeroshot": {
+       "exact_match,none": 0.03639993856550453,
+       "exact_match_stderr,none": 0.001672390259526908,
+       "alias": "bbh_zeroshot"
+     }
+   },
+   "configs": {
+     "bbh_zeroshot_boolean_expressions": {
+       "task": "bbh_zeroshot_boolean_expressions",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "boolean_expressions",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Evaluate the result of a random Boolean expression.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_causal_judgement": {
+       "task": "bbh_zeroshot_causal_judgement",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "causal_judgement",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Answer questions about causal attribution.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_date_understanding": {
+       "task": "bbh_zeroshot_date_understanding",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "date_understanding",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Infer the date from context.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_disambiguation_qa": {
+       "task": "bbh_zeroshot_disambiguation_qa",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "disambiguation_qa",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_dyck_languages": {
+       "task": "bbh_zeroshot_dyck_languages",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "dyck_languages",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Correctly close a Dyck-n word.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_formal_fallacies": {
+       "task": "bbh_zeroshot_formal_fallacies",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "formal_fallacies",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Distinguish deductively valid arguments from formal fallacies.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_geometric_shapes": {
+       "task": "bbh_zeroshot_geometric_shapes",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "geometric_shapes",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Name geometric shapes from their SVG paths.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_hyperbaton": {
+       "task": "bbh_zeroshot_hyperbaton",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "hyperbaton",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Order adjectives correctly in English sentences.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_logical_deduction_five_objects": {
+       "task": "bbh_zeroshot_logical_deduction_five_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "logical_deduction_five_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_logical_deduction_seven_objects": {
+       "task": "bbh_zeroshot_logical_deduction_seven_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "logical_deduction_seven_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_logical_deduction_three_objects": {
+       "task": "bbh_zeroshot_logical_deduction_three_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "logical_deduction_three_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_movie_recommendation": {
+       "task": "bbh_zeroshot_movie_recommendation",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "movie_recommendation",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Recommend movies similar to the given list of movies.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_multistep_arithmetic_two": {
+       "task": "bbh_zeroshot_multistep_arithmetic_two",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "multistep_arithmetic_two",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Solve multi-step arithmetic problems.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_navigate": {
+       "task": "bbh_zeroshot_navigate",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "navigate",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_object_counting": {
+       "task": "bbh_zeroshot_object_counting",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "object_counting",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Questions that involve enumerating objects and asking the model to count them.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_penguins_in_a_table": {
+       "task": "bbh_zeroshot_penguins_in_a_table",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "penguins_in_a_table",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Answer questions about a table of penguins and their attributes.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_reasoning_about_colored_objects": {
+       "task": "bbh_zeroshot_reasoning_about_colored_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "reasoning_about_colored_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_ruin_names": {
+       "task": "bbh_zeroshot_ruin_names",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "ruin_names",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_salient_translation_error_detection": {
+       "task": "bbh_zeroshot_salient_translation_error_detection",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "salient_translation_error_detection",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Detect the type of error in an English translation of a German source sentence.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_snarks": {
+       "task": "bbh_zeroshot_snarks",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "snarks",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_sports_understanding": {
+       "task": "bbh_zeroshot_sports_understanding",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "sports_understanding",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_temporal_sequences": {
+       "task": "bbh_zeroshot_temporal_sequences",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "temporal_sequences",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Task description: Answer questions about which times certain events could have occurred.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": {
+       "task": "bbh_zeroshot_tracking_shuffled_objects_five_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "tracking_shuffled_objects_five_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": {
+       "task": "bbh_zeroshot_tracking_shuffled_objects_seven_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "tracking_shuffled_objects_seven_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": {
+       "task": "bbh_zeroshot_tracking_shuffled_objects_three_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "tracking_shuffled_objects_three_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_web_of_lies": {
+       "task": "bbh_zeroshot_web_of_lies",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "web_of_lies",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Evaluate a random boolean function expressed as a word problem.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_word_sorting": {
+       "task": "bbh_zeroshot_word_sorting",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "word_sorting",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Sort a list of words.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     }
+   },
+   "versions": {
+     "bbh_zeroshot": "N/A",
+     "bbh_zeroshot_boolean_expressions": 1.0,
+     "bbh_zeroshot_causal_judgement": 1.0,
+     "bbh_zeroshot_date_understanding": 1.0,
+     "bbh_zeroshot_disambiguation_qa": 1.0,
+     "bbh_zeroshot_dyck_languages": 1.0,
+     "bbh_zeroshot_formal_fallacies": 1.0,
+     "bbh_zeroshot_geometric_shapes": 1.0,
+     "bbh_zeroshot_hyperbaton": 1.0,
+     "bbh_zeroshot_logical_deduction_five_objects": 1.0,
+     "bbh_zeroshot_logical_deduction_seven_objects": 1.0,
+     "bbh_zeroshot_logical_deduction_three_objects": 1.0,
+     "bbh_zeroshot_movie_recommendation": 1.0,
+     "bbh_zeroshot_multistep_arithmetic_two": 1.0,
+     "bbh_zeroshot_navigate": 1.0,
+     "bbh_zeroshot_object_counting": 1.0,
+     "bbh_zeroshot_penguins_in_a_table": 1.0,
+     "bbh_zeroshot_reasoning_about_colored_objects": 1.0,
+     "bbh_zeroshot_ruin_names": 1.0,
+     "bbh_zeroshot_salient_translation_error_detection": 1.0,
+     "bbh_zeroshot_snarks": 1.0,
+     "bbh_zeroshot_sports_understanding": 1.0,
+     "bbh_zeroshot_temporal_sequences": 1.0,
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": 1.0,
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": 1.0,
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": 1.0,
+     "bbh_zeroshot_web_of_lies": 1.0,
+     "bbh_zeroshot_word_sorting": 1.0
+   },
+   "n-shot": {
+     "bbh_zeroshot": 0,
+     "bbh_zeroshot_boolean_expressions": 0,
+     "bbh_zeroshot_causal_judgement": 0,
+     "bbh_zeroshot_date_understanding": 0,
+     "bbh_zeroshot_disambiguation_qa": 0,
+     "bbh_zeroshot_dyck_languages": 0,
+     "bbh_zeroshot_formal_fallacies": 0,
+     "bbh_zeroshot_geometric_shapes": 0,
+     "bbh_zeroshot_hyperbaton": 0,
+     "bbh_zeroshot_logical_deduction_five_objects": 0,
+     "bbh_zeroshot_logical_deduction_seven_objects": 0,
+     "bbh_zeroshot_logical_deduction_three_objects": 0,
+     "bbh_zeroshot_movie_recommendation": 0,
+     "bbh_zeroshot_multistep_arithmetic_two": 0,
+     "bbh_zeroshot_navigate": 0,
+     "bbh_zeroshot_object_counting": 0,
+     "bbh_zeroshot_penguins_in_a_table": 0,
+     "bbh_zeroshot_reasoning_about_colored_objects": 0,
+     "bbh_zeroshot_ruin_names": 0,
+     "bbh_zeroshot_salient_translation_error_detection": 0,
+     "bbh_zeroshot_snarks": 0,
+     "bbh_zeroshot_sports_understanding": 0,
+     "bbh_zeroshot_temporal_sequences": 0,
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": 0,
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": 0,
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": 0,
+     "bbh_zeroshot_web_of_lies": 0,
+     "bbh_zeroshot_word_sorting": 0
+   },
+   "config": {
+     "model": "hf",
+     "model_args": "pretrained=teknium/OpenHermes-2.5-Mistral-7B,revision=main,dtype=bfloat16",
+     "batch_size": "auto",
+     "batch_sizes": [],
+     "device": null,
+     "use_cache": null,
+     "limit": null,
+     "bootstrap_iters": 100000,
+     "gen_kwargs": null
+   },
+   "git_hash": "8237ac1"
+ }
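The trailing "config" block records how the run was launched (lm-evaluation-harness, model=hf, batch_size=auto, harness commit 8237ac1). A minimal sketch of reading the aggregate BBH zero-shot score back out of the uploaded file; the local path is assumed to mirror the repo path in this commit:

import json

# Load the eval results file added in this commit.
with open("eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json") as f:
    results = json.load(f)

# The group-level score is stored under both "results" and "groups".
bbh = results["results"]["bbh_zeroshot"]
print(f'BBH zero-shot exact_match: {bbh["exact_match,none"]:.4f} ± {bbh["exact_match_stderr,none"]:.4f}')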