{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "", "start_time": 388378.470149001, "end_time": 389760.771702178, "total_evaluation_time_secondes": "1382.301553177007", "model_name": "NousResearch/Nous-Hermes-2-Yi-34B", "model_sha": "fcb0a8847e76aea14aba9aa44009d4418ad7c18f", "model_dtype": "torch.bfloat16", "model_size": "64.17 GB", "config": null }, "results": { "harness|bbh:causal_judgment|3": { "em": 0.6363636363636364, "em_stderr": 0.035271981530144124, "qem": 0.6363636363636364, "qem_stderr": 0.035271981530144124, "pem": 0.6363636363636364, "pem_stderr": 0.035271981530144124, "pqem": 0.6363636363636364, "pqem_stderr": 0.035271981530144124, "perfect_em": 0.6363636363636364, "perfect_em_stderr": 0.035271981530144124 }, "harness|bbh:date_understanding|3": { "em": 0.58, "em_stderr": 0.03127799950463661, "qem": 0.58, "qem_stderr": 0.03127799950463661, "pem": 0.58, "pem_stderr": 0.03127799950463661, "pqem": 0.672, "pqem_stderr": 0.029752391824475387, "perfect_em": 0.58, "perfect_em_stderr": 0.03127799950463661 }, "harness|bbh:disambiguation_qa|3": { "em": 0.74, "em_stderr": 0.027797315752644297, "qem": 0.74, "qem_stderr": 0.027797315752644297, "pem": 0.74, "pem_stderr": 0.027797315752644297, "pqem": 0.772, "pqem_stderr": 0.026587432487268508, "perfect_em": 0.74, "perfect_em_stderr": 0.027797315752644297 }, "harness|bbh:geometric_shapes|3": { "em": 0.356, "em_stderr": 0.03034368065715322, "qem": 0.356, "qem_stderr": 0.03034368065715322, "pem": 0.356, "pem_stderr": 0.03034368065715322, "pqem": 0.356, "pqem_stderr": 0.03034368065715322, "perfect_em": 0.356, "perfect_em_stderr": 0.03034368065715322 }, "harness|bbh:logical_deduction_five_objects|3": { "em": 0.48, "em_stderr": 0.031660853408495185, "qem": 0.48, "qem_stderr": 0.031660853408495185, "pem": 0.48, "pem_stderr": 0.031660853408495185, "pqem": 0.556, "pqem_stderr": 0.03148684942554574, "perfect_em": 0.48, "perfect_em_stderr": 0.031660853408495185 }, "harness|bbh:logical_deduction_seven_objects|3": { "em": 0.468, "em_stderr": 0.031621252575725504, "qem": 0.468, "qem_stderr": 0.031621252575725504, "pem": 0.468, "pem_stderr": 0.031621252575725504, "pqem": 0.552, "pqem_stderr": 0.031514387611153515, "perfect_em": 0.468, "perfect_em_stderr": 0.031621252575725504 }, "harness|bbh:logical_deduction_three_objects|3": { "em": 0.728, "em_stderr": 0.02820008829631001, "qem": 0.728, "qem_stderr": 0.02820008829631001, "pem": 0.728, "pem_stderr": 0.02820008829631001, "pqem": 0.828, "pqem_stderr": 0.023915513944486218, "perfect_em": 0.728, "perfect_em_stderr": 0.02820008829631001 }, "harness|bbh:movie_recommendation|3": { "em": 0.8634538152610441, "em_stderr": 0.021803865831784248, "qem": 0.8634538152610441, "qem_stderr": 0.021803865831784248, "pem": 0.8634538152610441, "pem_stderr": 0.021803865831784248, "pqem": 0.8835341365461847, "pqem_stderr": 0.02036972434210335, "perfect_em": 0.8634538152610441, "perfect_em_stderr": 0.021803865831784248 }, "harness|bbh:navigate|3": { "em": 0.656, "em_stderr": 0.030104503392316385, "qem": 0.656, "qem_stderr": 0.030104503392316385, "pem": 0.656, "pem_stderr": 0.030104503392316385, "pqem": 0.656, "pqem_stderr": 0.030104503392316385, "perfect_em": 0.656, "perfect_em_stderr": 0.030104503392316385 }, "harness|bbh:reasoning_about_colored_objects|3": { "em": 0.668, "em_stderr": 0.0298440390474659, "qem": 0.668, "qem_stderr": 0.0298440390474659, "pem": 0.672, "pem_stderr": 0.029752391824475376, "pqem": 0.724, "pqem_stderr": 0.028328537274211342, "perfect_em": 0.668, "perfect_em_stderr": 0.0298440390474659 }, "harness|bbh:ruin_names|3": { "em": 0.8225806451612904, "em_stderr": 0.024307554295303634, "qem": 0.8225806451612904, "qem_stderr": 0.024307554295303634, "pem": 0.8225806451612904, "pem_stderr": 0.024307554295303634, "pqem": 0.875, "pqem_stderr": 0.021043140573169826, "perfect_em": 0.8225806451612904, "perfect_em_stderr": 0.024307554295303634 }, "harness|bbh:salient_translation_error_detection|3": { "em": 0.0, "em_stderr": 0.0, "qem": 0.14, "qem_stderr": 0.021989409645240265, "pem": 0.0, "pem_stderr": 0.0, "pqem": 0.14, "pqem_stderr": 0.021989409645240265, "perfect_em": 0.0, "perfect_em_stderr": 0.0 }, "harness|bbh:snarks|3": { "em": 0.8426966292134831, "em_stderr": 0.027366421373452483, "qem": 0.8426966292134831, "qem_stderr": 0.027366421373452483, "pem": 0.8426966292134831, "pem_stderr": 0.027366421373452483, "pqem": 0.898876404494382, "pqem_stderr": 0.0226615498467241, "perfect_em": 0.8426966292134831, "perfect_em_stderr": 0.027366421373452483 }, "harness|bbh:sports_understanding|3": { "em": 0.86, "em_stderr": 0.021989409645240272, "qem": 0.86, "qem_stderr": 0.021989409645240272, "pem": 0.86, "pem_stderr": 0.021989409645240272, "pqem": 0.86, "pqem_stderr": 0.021989409645240272, "perfect_em": 0.86, "perfect_em_stderr": 0.021989409645240272 }, "harness|bbh:temporal_sequences|3": { "em": 0.592, "em_stderr": 0.031145209846548495, "qem": 0.592, "qem_stderr": 0.031145209846548495, "pem": 0.592, "pem_stderr": 0.031145209846548495, "pqem": 0.692, "pqem_stderr": 0.029256928606501864, "perfect_em": 0.592, "perfect_em_stderr": 0.031145209846548495 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "em": 0.228, "em_stderr": 0.02658743248726847, "qem": 0.228, "qem_stderr": 0.02658743248726847, "pem": 0.24, "pem_stderr": 0.027065293652239003, "pqem": 0.42, "pqem_stderr": 0.03127799950463661, "perfect_em": 0.228, "perfect_em_stderr": 0.02658743248726847 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "em": 0.22, "em_stderr": 0.02625179282460584, "qem": 0.22, "qem_stderr": 0.02625179282460584, "pem": 0.22, "pem_stderr": 0.02625179282460584, "pqem": 0.336, "pqem_stderr": 0.02993325909419152, "perfect_em": 0.22, "perfect_em_stderr": 0.02625179282460584 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "em": 0.388, "em_stderr": 0.030881038748993925, "qem": 0.388, "qem_stderr": 0.030881038748993925, "pem": 0.388, "pem_stderr": 0.030881038748993925, "pqem": 0.624, "pqem_stderr": 0.03069633626739459, "perfect_em": 0.388, "perfect_em_stderr": 0.030881038748993925 }, "harness|bbh:_average|3": { "em": 0.5627274847777475, "em_stderr": 0.027025246623227146, "qem": 0.5705052625555251, "qem_stderr": 0.02824688049240716, "pem": 0.5636163736666364, "pem_stderr": 0.027046702953337144, "pqem": 0.6378763431891223, "pqem_stderr": 0.02758461309288649, "perfect_em": 0.5627274847777475, "perfect_em_stderr": 0.027025246623227146 } }, "versions": { "harness|bbh:causal_judgment|3": 0, "harness|bbh:date_understanding|3": 0, "harness|bbh:disambiguation_qa|3": 0, "harness|bbh:geometric_shapes|3": 0, "harness|bbh:logical_deduction_five_objects|3": 0, "harness|bbh:logical_deduction_seven_objects|3": 0, "harness|bbh:logical_deduction_three_objects|3": 0, "harness|bbh:movie_recommendation|3": 0, "harness|bbh:navigate|3": 0, "harness|bbh:reasoning_about_colored_objects|3": 0, "harness|bbh:ruin_names|3": 0, "harness|bbh:salient_translation_error_detection|3": 0, "harness|bbh:snarks|3": 0, "harness|bbh:sports_understanding|3": 0, "harness|bbh:temporal_sequences|3": 0, "harness|bbh:tracking_shuffled_objects_five_objects|3": 0, "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0, "harness|bbh:tracking_shuffled_objects_three_objects|3": 0 }, "config_tasks": { "harness|bbh:causal_judgment": { "name": "bbh:causal_judgment", "prompt_function": "bbh_causal_judgment", "hf_repo": "lukaemon/bbh", "hf_subset": "causal_judgement", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 187, "effective_num_docs": 187, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:date_understanding": { "name": "bbh:date_understanding", "prompt_function": "bbh_date_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "date_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:disambiguation_qa": { "name": "bbh:disambiguation_qa", "prompt_function": "bbh_disambiguation_qa", "hf_repo": "lukaemon/bbh", "hf_subset": "disambiguation_qa", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:geometric_shapes": { "name": "bbh:geometric_shapes", "prompt_function": "bbh_geometric_shapes", "hf_repo": "lukaemon/bbh", "hf_subset": "geometric_shapes", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_five_objects": { "name": "bbh:logical_deduction_five_objects", "prompt_function": "bbh_logical_deduction_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_seven_objects": { "name": "bbh:logical_deduction_seven_objects", "prompt_function": "bbh_logical_deduction_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_three_objects": { "name": "bbh:logical_deduction_three_objects", "prompt_function": "bbh_logical_deduction_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:movie_recommendation": { "name": "bbh:movie_recommendation", "prompt_function": "bbh_movie_recommendation", "hf_repo": "lukaemon/bbh", "hf_subset": "movie_recommendation", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 249, "effective_num_docs": 249, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:navigate": { "name": "bbh:navigate", "prompt_function": "bbh_navigate", "hf_repo": "lukaemon/bbh", "hf_subset": "navigate", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:reasoning_about_colored_objects": { "name": "bbh:reasoning_about_colored_objects", "prompt_function": "bbh_reasoning_about_colored_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "reasoning_about_colored_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:ruin_names": { "name": "bbh:ruin_names", "prompt_function": "bbh_ruin_names", "hf_repo": "lukaemon/bbh", "hf_subset": "ruin_names", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 248, "effective_num_docs": 248, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:salient_translation_error_detection": { "name": "bbh:salient_translation_error_detection", "prompt_function": "bbh_salient_translation_error_detection", "hf_repo": "lukaemon/bbh", "hf_subset": "salient_translation_error_detection", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:snarks": { "name": "bbh:snarks", "prompt_function": "bbh_snarks", "hf_repo": "lukaemon/bbh", "hf_subset": "snarks", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 178, "effective_num_docs": 178, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:sports_understanding": { "name": "bbh:sports_understanding", "prompt_function": "bbh_sports_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "sports_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:temporal_sequences": { "name": "bbh:temporal_sequences", "prompt_function": "bbh_temporal_sequences", "hf_repo": "lukaemon/bbh", "hf_subset": "temporal_sequences", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_five_objects": { "name": "bbh:tracking_shuffled_objects_five_objects", "prompt_function": "bbh_tracking_shuffled_objects_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_seven_objects": { "name": "bbh:tracking_shuffled_objects_seven_objects", "prompt_function": "bbh_tracking_shuffled_objects_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_three_objects": { "name": "bbh:tracking_shuffled_objects_three_objects", "prompt_function": "bbh_tracking_shuffled_objects_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "harness|bbh:causal_judgment|3": { "hashes": { "hash_examples": "63218f5ae055ab2b", "hash_full_prompts": "7303fa1d0fe0b29a", "hash_input_tokens": "ff40286f542dde45", "hash_cont_tokens": "c8124940e02211ee" }, "truncated": 187, "non_truncated": 0, "padded": 0, "non_padded": 187, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:date_understanding|3": { "hashes": { "hash_examples": "f145c7a06def3c8e", "hash_full_prompts": "69e60d10afa5a6f1", "hash_input_tokens": "10fb5f39bf829fd8", "hash_cont_tokens": "b3ad94e159c27161" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:disambiguation_qa|3": { "hashes": { "hash_examples": "19677fd1773f7eb9", "hash_full_prompts": "ae0a8fd428f9aee3", "hash_input_tokens": "8bbd4a389ff5ba7b", "hash_cont_tokens": "728f6ad7ee171ae1" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:geometric_shapes|3": { "hashes": { "hash_examples": "76c7b11a13cc72a9", "hash_full_prompts": "76633257f67207f9", "hash_input_tokens": "443102f37ce9783e", "hash_cont_tokens": "a0941658b62cd4ca" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_five_objects|3": { "hashes": { "hash_examples": "0e958c856332a745", "hash_full_prompts": "3c96645848786efd", "hash_input_tokens": "2d58a04d6d043088", "hash_cont_tokens": "be8d17f49c1d485f" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_seven_objects|3": { "hashes": { "hash_examples": "ab9de25a5eb40d09", "hash_full_prompts": "185c5851c101ee66", "hash_input_tokens": "a2d56867ce719e0b", "hash_cont_tokens": "ab9f70a65634e6d8" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_three_objects|3": { "hashes": { "hash_examples": "3c6bf52517714218", "hash_full_prompts": "8ba2d94357e589d0", "hash_input_tokens": "84b0d03f28551125", "hash_cont_tokens": "cbd3a53ec0353283" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:movie_recommendation|3": { "hashes": { "hash_examples": "2d9dc4975935d31a", "hash_full_prompts": "a411e216d0f5f626", "hash_input_tokens": "530f18bf7986e06b", "hash_cont_tokens": "e1683548ac59cd9e" }, "truncated": 249, "non_truncated": 0, "padded": 0, "non_padded": 249, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:navigate|3": { "hashes": { "hash_examples": "ba91dcdb9a064255", "hash_full_prompts": "ebb3084ecc78a46a", "hash_input_tokens": "4a6455e714f48eef", "hash_cont_tokens": "592fd15620ac890d" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:reasoning_about_colored_objects|3": { "hashes": { "hash_examples": "a6ba328c4c3385d2", "hash_full_prompts": "38328d016a4ebef3", "hash_input_tokens": "db1eb3c373c1272c", "hash_cont_tokens": "62c672c83d724448" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:ruin_names|3": { "hashes": { "hash_examples": "2ef28d5f2d4fdd25", "hash_full_prompts": "9c7d0493c37182d6", "hash_input_tokens": "ab4cd3aae8a12a26", "hash_cont_tokens": "04ee1428b8feb220" }, "truncated": 248, "non_truncated": 0, "padded": 0, "non_padded": 248, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:salient_translation_error_detection|3": { "hashes": { "hash_examples": "c13f25ec8ffed496", "hash_full_prompts": "edccd4061b168b78", "hash_input_tokens": "a2b01faa59fd27d2", "hash_cont_tokens": "513bf57e3cf4a199" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:snarks|3": { "hashes": { "hash_examples": "5f6db7bff7f6f22e", "hash_full_prompts": "31cafd95ab850a44", "hash_input_tokens": "fd22744425e833a6", "hash_cont_tokens": "6cc6ab55eb3ceaff" }, "truncated": 178, "non_truncated": 0, "padded": 0, "non_padded": 178, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:sports_understanding|3": { "hashes": { "hash_examples": "042afbe5d9c1f02d", "hash_full_prompts": "3d46581e9bbec2d0", "hash_input_tokens": "b5755c7b3f0adf05", "hash_cont_tokens": "6ec699699c0aa09d" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:temporal_sequences|3": { "hashes": { "hash_examples": "803a05f352eb6afc", "hash_full_prompts": "4a54db144a5dd222", "hash_input_tokens": "7039f2a74fc297c2", "hash_cont_tokens": "9fda55c71b310122" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "hashes": { "hash_examples": "2bbac6db7ab0d527", "hash_full_prompts": "e3079106787cc311", "hash_input_tokens": "fe4290ba9ac12a0f", "hash_cont_tokens": "b2c34002187013fc" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "hashes": { "hash_examples": "845caf093ac2b58c", "hash_full_prompts": "6364e5b860590ec8", "hash_input_tokens": "9c23dc1d703fe193", "hash_cont_tokens": "489cba322c9c3f1f" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "hashes": { "hash_examples": "9004f14d5a32b9a8", "hash_full_prompts": "01aef56c4d1fe9fe", "hash_input_tokens": "d1698f73a6518669", "hash_cont_tokens": "24c53557640b2ed8" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "4ff1e3dc5703575d", "hash_full_prompts": "1cbeab0a00117cb8", "hash_input_tokens": "e2592a72ca049a67", "hash_cont_tokens": "848ccc86e5a7522d" }, "truncated": 4362, "non_truncated": 0, "padded": 0, "non_padded": 4362, "num_truncated_few_shots": 0 } }