lewtun (HF staff) committed
Commit 3b64438 · verified · 1 Parent(s): c131383

Upload eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json with huggingface_hub
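The commit message indicates the file was pushed with the huggingface_hub client. A minimal sketch of such an upload, assuming the same local path is used as the path in the repo; the repo_id and repo_type placeholders are assumptions, not part of this commit:

from huggingface_hub import HfApi

api = HfApi()
# Push the local eval results file to the hosting repo (placeholders are hypothetical).
api.upload_file(
    path_or_fileobj="eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json",
    path_in_repo="eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json",
    repo_id="<owner>/<repo>",
    repo_type="dataset",
    commit_message="Upload eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json with huggingface_hub",
)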

eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json ADDED
@@ -0,0 +1,1170 @@
+ {
+   "results": {
+     "bbh_zeroshot": {
+       "exact_match,none": 0.03639993856550453,
+       "exact_match_stderr,none": 0.001672390259526908,
+       "alias": "bbh_zeroshot"
+     },
+     "bbh_zeroshot_boolean_expressions": {
+       "exact_match,none": 0.612,
+       "exact_match_stderr,none": 0.030881038748993922,
+       "alias": " - bbh_zeroshot_boolean_expressions"
+     },
+     "bbh_zeroshot_causal_judgement": {
+       "exact_match,none": 0.36363636363636365,
+       "exact_match_stderr,none": 0.03527198153014411,
+       "alias": " - bbh_zeroshot_causal_judgement"
+     },
+     "bbh_zeroshot_date_understanding": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_date_understanding"
+     },
+     "bbh_zeroshot_disambiguation_qa": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_disambiguation_qa"
+     },
+     "bbh_zeroshot_dyck_languages": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_dyck_languages"
+     },
+     "bbh_zeroshot_formal_fallacies": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_formal_fallacies"
+     },
+     "bbh_zeroshot_geometric_shapes": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_geometric_shapes"
+     },
+     "bbh_zeroshot_hyperbaton": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_hyperbaton"
+     },
+     "bbh_zeroshot_logical_deduction_five_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_logical_deduction_five_objects"
+     },
+     "bbh_zeroshot_logical_deduction_seven_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_logical_deduction_seven_objects"
+     },
+     "bbh_zeroshot_logical_deduction_three_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_logical_deduction_three_objects"
+     },
+     "bbh_zeroshot_movie_recommendation": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_movie_recommendation"
+     },
+     "bbh_zeroshot_multistep_arithmetic_two": {
+       "exact_match,none": 0.004,
+       "exact_match_stderr,none": 0.004000000000000004,
+       "alias": " - bbh_zeroshot_multistep_arithmetic_two"
+     },
+     "bbh_zeroshot_navigate": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_navigate"
+     },
+     "bbh_zeroshot_object_counting": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_object_counting"
+     },
+     "bbh_zeroshot_penguins_in_a_table": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_penguins_in_a_table"
+     },
+     "bbh_zeroshot_reasoning_about_colored_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_reasoning_about_colored_objects"
+     },
+     "bbh_zeroshot_ruin_names": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_ruin_names"
+     },
+     "bbh_zeroshot_salient_translation_error_detection": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_salient_translation_error_detection"
+     },
+     "bbh_zeroshot_snarks": {
+       "exact_match,none": 0.0449438202247191,
+       "exact_match_stderr,none": 0.015572660609707176,
+       "alias": " - bbh_zeroshot_snarks"
+     },
+     "bbh_zeroshot_sports_understanding": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_sports_understanding"
+     },
+     "bbh_zeroshot_temporal_sequences": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_temporal_sequences"
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_tracking_shuffled_objects_five_objects"
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_tracking_shuffled_objects_seven_objects"
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_tracking_shuffled_objects_three_objects"
+     },
+     "bbh_zeroshot_web_of_lies": {
+       "exact_match,none": 0.0,
+       "exact_match_stderr,none": 0.0,
+       "alias": " - bbh_zeroshot_web_of_lies"
+     },
+     "bbh_zeroshot_word_sorting": {
+       "exact_match,none": 0.028,
+       "exact_match_stderr,none": 0.010454721651927287,
+       "alias": " - bbh_zeroshot_word_sorting"
+     }
+   },
+   "groups": {
+     "bbh_zeroshot": {
+       "exact_match,none": 0.03639993856550453,
+       "exact_match_stderr,none": 0.001672390259526908,
+       "alias": "bbh_zeroshot"
+     }
+   },
+   "configs": {
+     "bbh_zeroshot_boolean_expressions": {
+       "task": "bbh_zeroshot_boolean_expressions",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "boolean_expressions",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Evaluate the result of a random Boolean expression.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_causal_judgement": {
+       "task": "bbh_zeroshot_causal_judgement",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "causal_judgement",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Answer questions about causal attribution.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_date_understanding": {
+       "task": "bbh_zeroshot_date_understanding",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "date_understanding",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Infer the date from context.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_disambiguation_qa": {
+       "task": "bbh_zeroshot_disambiguation_qa",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "disambiguation_qa",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_dyck_languages": {
+       "task": "bbh_zeroshot_dyck_languages",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "dyck_languages",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Correctly close a Dyck-n word.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_formal_fallacies": {
+       "task": "bbh_zeroshot_formal_fallacies",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "formal_fallacies",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Distinguish deductively valid arguments from formal fallacies.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_geometric_shapes": {
+       "task": "bbh_zeroshot_geometric_shapes",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "geometric_shapes",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Name geometric shapes from their SVG paths.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_hyperbaton": {
+       "task": "bbh_zeroshot_hyperbaton",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "hyperbaton",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Order adjectives correctly in English sentences.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_logical_deduction_five_objects": {
+       "task": "bbh_zeroshot_logical_deduction_five_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "logical_deduction_five_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_logical_deduction_seven_objects": {
+       "task": "bbh_zeroshot_logical_deduction_seven_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "logical_deduction_seven_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_logical_deduction_three_objects": {
+       "task": "bbh_zeroshot_logical_deduction_three_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "logical_deduction_three_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_movie_recommendation": {
+       "task": "bbh_zeroshot_movie_recommendation",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "movie_recommendation",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Recommend movies similar to the given list of movies.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_multistep_arithmetic_two": {
+       "task": "bbh_zeroshot_multistep_arithmetic_two",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "multistep_arithmetic_two",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Solve multi-step arithmetic problems.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_navigate": {
+       "task": "bbh_zeroshot_navigate",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "navigate",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_object_counting": {
+       "task": "bbh_zeroshot_object_counting",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "object_counting",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Questions that involve enumerating objects and asking the model to count them.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_penguins_in_a_table": {
+       "task": "bbh_zeroshot_penguins_in_a_table",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "penguins_in_a_table",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Answer questions about a table of penguins and their attributes.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_reasoning_about_colored_objects": {
+       "task": "bbh_zeroshot_reasoning_about_colored_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "reasoning_about_colored_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_ruin_names": {
+       "task": "bbh_zeroshot_ruin_names",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "ruin_names",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_salient_translation_error_detection": {
+       "task": "bbh_zeroshot_salient_translation_error_detection",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "salient_translation_error_detection",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Detect the type of error in an English translation of a German source sentence.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_snarks": {
+       "task": "bbh_zeroshot_snarks",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "snarks",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Determine which of two sentences is sarcastic.\n\nAccording to Cambridge University Dictionary, sarcasm is \"the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way.\" Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_sports_understanding": {
+       "task": "bbh_zeroshot_sports_understanding",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "sports_understanding",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Determine whether an artificially constructed sentence relating to sports is plausible or not.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_temporal_sequences": {
+       "task": "bbh_zeroshot_temporal_sequences",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "temporal_sequences",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Task description: Answer questions about which times certain events could have occurred.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": {
+       "task": "bbh_zeroshot_tracking_shuffled_objects_five_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "tracking_shuffled_objects_five_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": {
+       "task": "bbh_zeroshot_tracking_shuffled_objects_seven_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "tracking_shuffled_objects_seven_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": {
+       "task": "bbh_zeroshot_tracking_shuffled_objects_three_objects",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "tracking_shuffled_objects_three_objects",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_web_of_lies": {
+       "task": "bbh_zeroshot_web_of_lies",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "web_of_lies",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Evaluate a random boolean function expressed as a word problem.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     },
+     "bbh_zeroshot_word_sorting": {
+       "task": "bbh_zeroshot_word_sorting",
+       "group": "bbh_zeroshot",
+       "dataset_path": "lukaemon/bbh",
+       "dataset_name": "word_sorting",
+       "test_split": "test",
+       "doc_to_text": "Q: {{input}}\nA:",
+       "doc_to_target": "{{target}}",
+       "description": "Sort a list of words.\n\n",
+       "target_delimiter": " ",
+       "fewshot_delimiter": "\n\n",
+       "num_fewshot": 0,
+       "metric_list": [
+         {
+           "metric": "exact_match",
+           "aggregation": "mean",
+           "higher_is_better": true
+         }
+       ],
+       "output_type": "generate_until",
+       "generation_kwargs": {
+         "until": [
+           "</s>",
+           "Q:",
+           "\n\n"
+         ],
+         "do_sample": false,
+         "temperature": 0.0
+       },
+       "repeats": 1,
+       "should_decontaminate": false,
+       "metadata": {
+         "version": 1.0
+       }
+     }
+   },
+   "versions": {
+     "bbh_zeroshot": "N/A",
+     "bbh_zeroshot_boolean_expressions": 1.0,
+     "bbh_zeroshot_causal_judgement": 1.0,
+     "bbh_zeroshot_date_understanding": 1.0,
+     "bbh_zeroshot_disambiguation_qa": 1.0,
+     "bbh_zeroshot_dyck_languages": 1.0,
+     "bbh_zeroshot_formal_fallacies": 1.0,
+     "bbh_zeroshot_geometric_shapes": 1.0,
+     "bbh_zeroshot_hyperbaton": 1.0,
+     "bbh_zeroshot_logical_deduction_five_objects": 1.0,
+     "bbh_zeroshot_logical_deduction_seven_objects": 1.0,
+     "bbh_zeroshot_logical_deduction_three_objects": 1.0,
+     "bbh_zeroshot_movie_recommendation": 1.0,
+     "bbh_zeroshot_multistep_arithmetic_two": 1.0,
+     "bbh_zeroshot_navigate": 1.0,
+     "bbh_zeroshot_object_counting": 1.0,
+     "bbh_zeroshot_penguins_in_a_table": 1.0,
+     "bbh_zeroshot_reasoning_about_colored_objects": 1.0,
+     "bbh_zeroshot_ruin_names": 1.0,
+     "bbh_zeroshot_salient_translation_error_detection": 1.0,
+     "bbh_zeroshot_snarks": 1.0,
+     "bbh_zeroshot_sports_understanding": 1.0,
+     "bbh_zeroshot_temporal_sequences": 1.0,
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": 1.0,
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": 1.0,
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": 1.0,
+     "bbh_zeroshot_web_of_lies": 1.0,
+     "bbh_zeroshot_word_sorting": 1.0
+   },
+   "n-shot": {
+     "bbh_zeroshot": 0,
+     "bbh_zeroshot_boolean_expressions": 0,
+     "bbh_zeroshot_causal_judgement": 0,
+     "bbh_zeroshot_date_understanding": 0,
+     "bbh_zeroshot_disambiguation_qa": 0,
+     "bbh_zeroshot_dyck_languages": 0,
+     "bbh_zeroshot_formal_fallacies": 0,
+     "bbh_zeroshot_geometric_shapes": 0,
+     "bbh_zeroshot_hyperbaton": 0,
+     "bbh_zeroshot_logical_deduction_five_objects": 0,
+     "bbh_zeroshot_logical_deduction_seven_objects": 0,
+     "bbh_zeroshot_logical_deduction_three_objects": 0,
+     "bbh_zeroshot_movie_recommendation": 0,
+     "bbh_zeroshot_multistep_arithmetic_two": 0,
+     "bbh_zeroshot_navigate": 0,
+     "bbh_zeroshot_object_counting": 0,
+     "bbh_zeroshot_penguins_in_a_table": 0,
+     "bbh_zeroshot_reasoning_about_colored_objects": 0,
+     "bbh_zeroshot_ruin_names": 0,
+     "bbh_zeroshot_salient_translation_error_detection": 0,
+     "bbh_zeroshot_snarks": 0,
+     "bbh_zeroshot_sports_understanding": 0,
+     "bbh_zeroshot_temporal_sequences": 0,
+     "bbh_zeroshot_tracking_shuffled_objects_five_objects": 0,
+     "bbh_zeroshot_tracking_shuffled_objects_seven_objects": 0,
+     "bbh_zeroshot_tracking_shuffled_objects_three_objects": 0,
+     "bbh_zeroshot_web_of_lies": 0,
+     "bbh_zeroshot_word_sorting": 0
+   },
+   "config": {
+     "model": "hf",
+     "model_args": "pretrained=teknium/OpenHermes-2.5-Mistral-7B,revision=main,dtype=bfloat16",
+     "batch_size": "auto",
+     "batch_sizes": [],
+     "device": null,
+     "use_cache": null,
+     "limit": null,
+     "bootstrap_iters": 100000,
+     "gen_kwargs": null
+   },
+   "git_hash": "8237ac1"
+ }
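The trailing "config" block records how the run was launched (lm-evaluation-harness, model=hf, batch_size=auto, harness commit 8237ac1). A minimal sketch of reading the aggregate BBH zero-shot score back out of the uploaded file; the local path is assumed to mirror the repo path in this commit:

import json

# Load the eval results file added in this commit.
with open("eval_results/teknium/OpenHermes-2.5-Mistral-7B/main/eval_bbh.json") as f:
    results = json.load(f)

# The group-level score is stored under both "results" and "groups".
bbh = results["results"]["bbh_zeroshot"]
print(f'BBH zero-shot exact_match: {bbh["exact_match,none"]:.4f} ± {bbh["exact_match_stderr,none"]:.4f}')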