{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "", "start_time": 2106314.171046321, "end_time": 2107521.942727167, "total_evaluation_time_secondes": "1207.7716808463447", "model_name": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", "model_sha": "707b6e3251d114cc3d326e6a2bcff1449110aedf", "model_dtype": "torch.bfloat16", "model_size": "87.49 GB", "config": null }, "results": { "harness|bbh:causal_judgment|3": { "em": 0.6684491978609626, "em_stderr": 0.034518568148016175, "qem": 0.6684491978609626, "qem_stderr": 0.034518568148016175, "pem": 0.6684491978609626, "pem_stderr": 0.034518568148016175, "pqem": 0.6684491978609626, "pqem_stderr": 0.034518568148016175, "perfect_em": 0.6684491978609626, "perfect_em_stderr": 0.034518568148016175 }, "harness|bbh:date_understanding|3": { "em": 0.536, "em_stderr": 0.03160397514522374, "qem": 0.536, "qem_stderr": 0.03160397514522374, "pem": 0.536, "pem_stderr": 0.03160397514522374, "pqem": 0.624, "pqem_stderr": 0.030696336267394594, "perfect_em": 0.536, "perfect_em_stderr": 0.03160397514522374 }, "harness|bbh:disambiguation_qa|3": { "em": 0.632, "em_stderr": 0.030562070620993167, "qem": 0.632, "qem_stderr": 0.030562070620993167, "pem": 0.684, "pem_stderr": 0.029462657598578666, "pqem": 0.728, "pqem_stderr": 0.028200088296309985, "perfect_em": 0.632, "perfect_em_stderr": 0.030562070620993167 }, "harness|bbh:geometric_shapes|3": { "em": 0.196, "em_stderr": 0.025156857313255898, "qem": 0.196, "qem_stderr": 0.025156857313255898, "pem": 0.38, "pem_stderr": 0.03076011604262604, "pqem": 0.38, "pqem_stderr": 0.03076011604262604, "perfect_em": 0.196, "perfect_em_stderr": 0.025156857313255898 }, "harness|bbh:logical_deduction_five_objects|3": { "em": 0.536, "em_stderr": 0.031603975145223735, "qem": 0.536, "qem_stderr": 0.031603975145223735, "pem": 0.536, "pem_stderr": 0.031603975145223735, "pqem": 0.592, "pqem_stderr": 0.0311452098465485, "perfect_em": 0.536, "perfect_em_stderr": 0.031603975145223735 }, "harness|bbh:logical_deduction_seven_objects|3": { "em": 0.524, "em_stderr": 0.031649688959687824, "qem": 0.524, "qem_stderr": 0.031649688959687824, "pem": 0.524, "pem_stderr": 0.031649688959687824, "pqem": 0.552, "pqem_stderr": 0.03151438761115351, "perfect_em": 0.524, "perfect_em_stderr": 0.031649688959687824 }, "harness|bbh:logical_deduction_three_objects|3": { "em": 0.708, "em_stderr": 0.028814320402205655, "qem": 0.708, "qem_stderr": 0.028814320402205655, "pem": 0.712, "pem_stderr": 0.02869700458739821, "pqem": 0.824, "pqem_stderr": 0.02413349752545711, "perfect_em": 0.708, "perfect_em_stderr": 0.028814320402205655 }, "harness|bbh:movie_recommendation|3": { "em": 0.7951807228915663, "em_stderr": 0.025626688837598215, "qem": 0.7951807228915663, "qem_stderr": 0.025626688837598215, "pem": 0.8554216867469879, "pem_stderr": 0.02233139557182192, "pqem": 0.8875502008032129, "pqem_stderr": 0.020060879304188376, "perfect_em": 0.7951807228915663, "perfect_em_stderr": 0.025626688837598215 }, "harness|bbh:navigate|3": { "em": 0.684, "em_stderr": 0.02946265759857867, "qem": 0.684, "qem_stderr": 0.02946265759857867, "pem": 0.684, "pem_stderr": 0.02946265759857867, "pqem": 0.684, "pqem_stderr": 0.02946265759857867, "perfect_em": 0.684, "perfect_em_stderr": 0.02946265759857867 }, "harness|bbh:reasoning_about_colored_objects|3": { "em": 0.604, "em_stderr": 0.03099319785457785, "qem": 0.604, "qem_stderr": 0.03099319785457785, "pem": 0.628, "pem_stderr": 0.030630325944558317, "pqem": 0.696, "pqem_stderr": 0.029150213374159673, "perfect_em": 0.604, "perfect_em_stderr": 0.03099319785457785 }, "harness|bbh:ruin_names|3": { "em": 0.657258064516129, "em_stderr": 0.030199733298058248, "qem": 0.657258064516129, "qem_stderr": 0.030199733298058248, "pem": 0.657258064516129, "pem_stderr": 0.030199733298058248, "pqem": 0.7903225806451613, "pqem_stderr": 0.025901776404740794, "perfect_em": 0.657258064516129, "perfect_em_stderr": 0.030199733298058248 }, "harness|bbh:salient_translation_error_detection|3": { "em": 0.492, "em_stderr": 0.031682156431413803, "qem": 0.492, "qem_stderr": 0.031682156431413803, "pem": 0.5, "pem_stderr": 0.031686212526223896, "pqem": 0.62, "pqem_stderr": 0.030760116042626046, "perfect_em": 0.492, "perfect_em_stderr": 0.031682156431413803 }, "harness|bbh:snarks|3": { "em": 0.7696629213483146, "em_stderr": 0.03164794946543343, "qem": 0.7696629213483146, "qem_stderr": 0.03164794946543343, "pem": 0.7752808988764045, "pem_stderr": 0.03137349512125098, "pqem": 0.8089887640449438, "pqem_stderr": 0.029547046053276616, "perfect_em": 0.7696629213483146, "perfect_em_stderr": 0.03164794946543343 }, "harness|bbh:sports_understanding|3": { "em": 0.656, "em_stderr": 0.03010450339231639, "qem": 0.656, "qem_stderr": 0.03010450339231639, "pem": 0.728, "pem_stderr": 0.02820008829631, "pqem": 0.728, "pqem_stderr": 0.02820008829631, "perfect_em": 0.656, "perfect_em_stderr": 0.03010450339231639 }, "harness|bbh:temporal_sequences|3": { "em": 0.712, "em_stderr": 0.02869700458739821, "qem": 0.712, "qem_stderr": 0.02869700458739821, "pem": 0.716, "pem_stderr": 0.0285769587304374, "pqem": 0.78, "pqem_stderr": 0.026251792824605824, "perfect_em": 0.712, "perfect_em_stderr": 0.02869700458739821 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "em": 0.216, "em_stderr": 0.02607865766373272, "qem": 0.216, "qem_stderr": 0.02607865766373272, "pem": 0.216, "pem_stderr": 0.02607865766373272, "pqem": 0.368, "pqem_stderr": 0.03056207062099316, "perfect_em": 0.216, "perfect_em_stderr": 0.02607865766373272 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "em": 0.24, "em_stderr": 0.027065293652239003, "qem": 0.24, "qem_stderr": 0.027065293652239003, "pem": 0.24, "pem_stderr": 0.027065293652239003, "pqem": 0.34, "pqem_stderr": 0.030020073605457904, "perfect_em": 0.24, "perfect_em_stderr": 0.027065293652239003 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "em": 0.292, "em_stderr": 0.028814320402205638, "qem": 0.292, "qem_stderr": 0.028814320402205638, "pem": 0.292, "pem_stderr": 0.028814320402205638, "pqem": 0.576, "pqem_stderr": 0.03131803437491614, "perfect_em": 0.292, "perfect_em_stderr": 0.028814320402205638 }, "harness|bbh:_average|3": { "em": 0.5510306059231651, "em_stderr": 0.02968231216211991, "qem": 0.5510306059231651, "qem_stderr": 0.02968231216211991, "pem": 0.5740227693333602, "pem_stderr": 0.029595284690676182, "pqem": 0.6470728190752378, "pqem_stderr": 0.029011275124297724, "perfect_em": 0.5510306059231651, "perfect_em_stderr": 0.02968231216211991 } }, "versions": { "harness|bbh:causal_judgment|3": 0, "harness|bbh:date_understanding|3": 0, "harness|bbh:disambiguation_qa|3": 0, "harness|bbh:geometric_shapes|3": 0, "harness|bbh:logical_deduction_five_objects|3": 0, "harness|bbh:logical_deduction_seven_objects|3": 0, "harness|bbh:logical_deduction_three_objects|3": 0, "harness|bbh:movie_recommendation|3": 0, "harness|bbh:navigate|3": 0, "harness|bbh:reasoning_about_colored_objects|3": 0, "harness|bbh:ruin_names|3": 0, "harness|bbh:salient_translation_error_detection|3": 0, "harness|bbh:snarks|3": 0, "harness|bbh:sports_understanding|3": 0, "harness|bbh:temporal_sequences|3": 0, "harness|bbh:tracking_shuffled_objects_five_objects|3": 0, "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0, "harness|bbh:tracking_shuffled_objects_three_objects|3": 0 }, "config_tasks": { "harness|bbh:causal_judgment": { "name": "bbh:causal_judgment", "prompt_function": "bbh_causal_judgment", "hf_repo": "lukaemon/bbh", "hf_subset": "causal_judgement", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 187, "effective_num_docs": 187, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:date_understanding": { "name": "bbh:date_understanding", "prompt_function": "bbh_date_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "date_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:disambiguation_qa": { "name": "bbh:disambiguation_qa", "prompt_function": "bbh_disambiguation_qa", "hf_repo": "lukaemon/bbh", "hf_subset": "disambiguation_qa", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:geometric_shapes": { "name": "bbh:geometric_shapes", "prompt_function": "bbh_geometric_shapes", "hf_repo": "lukaemon/bbh", "hf_subset": "geometric_shapes", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_five_objects": { "name": "bbh:logical_deduction_five_objects", "prompt_function": "bbh_logical_deduction_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_seven_objects": { "name": "bbh:logical_deduction_seven_objects", "prompt_function": "bbh_logical_deduction_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_three_objects": { "name": "bbh:logical_deduction_three_objects", "prompt_function": "bbh_logical_deduction_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:movie_recommendation": { "name": "bbh:movie_recommendation", "prompt_function": "bbh_movie_recommendation", "hf_repo": "lukaemon/bbh", "hf_subset": "movie_recommendation", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 249, "effective_num_docs": 249, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:navigate": { "name": "bbh:navigate", "prompt_function": "bbh_navigate", "hf_repo": "lukaemon/bbh", "hf_subset": "navigate", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:reasoning_about_colored_objects": { "name": "bbh:reasoning_about_colored_objects", "prompt_function": "bbh_reasoning_about_colored_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "reasoning_about_colored_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:ruin_names": { "name": "bbh:ruin_names", "prompt_function": "bbh_ruin_names", "hf_repo": "lukaemon/bbh", "hf_subset": "ruin_names", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 248, "effective_num_docs": 248, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:salient_translation_error_detection": { "name": "bbh:salient_translation_error_detection", "prompt_function": "bbh_salient_translation_error_detection", "hf_repo": "lukaemon/bbh", "hf_subset": "salient_translation_error_detection", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:snarks": { "name": "bbh:snarks", "prompt_function": "bbh_snarks", "hf_repo": "lukaemon/bbh", "hf_subset": "snarks", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 178, "effective_num_docs": 178, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:sports_understanding": { "name": "bbh:sports_understanding", "prompt_function": "bbh_sports_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "sports_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:temporal_sequences": { "name": "bbh:temporal_sequences", "prompt_function": "bbh_temporal_sequences", "hf_repo": "lukaemon/bbh", "hf_subset": "temporal_sequences", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_five_objects": { "name": "bbh:tracking_shuffled_objects_five_objects", "prompt_function": "bbh_tracking_shuffled_objects_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_seven_objects": { "name": "bbh:tracking_shuffled_objects_seven_objects", "prompt_function": "bbh_tracking_shuffled_objects_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_three_objects": { "name": "bbh:tracking_shuffled_objects_three_objects", "prompt_function": "bbh_tracking_shuffled_objects_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "harness|bbh:causal_judgment|3": { "hashes": { "hash_examples": "63218f5ae055ab2b", "hash_full_prompts": "7303fa1d0fe0b29a", "hash_input_tokens": "94e6ca97dc7a8d65", "hash_cont_tokens": "c1fd192768c0e57e" }, "truncated": 187, "non_truncated": 0, "padded": 0, "non_padded": 187, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:date_understanding|3": { "hashes": { "hash_examples": "f145c7a06def3c8e", "hash_full_prompts": "69e60d10afa5a6f1", "hash_input_tokens": "56c1b1dfb318cc75", "hash_cont_tokens": "07e69cb17d2c8831" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:disambiguation_qa|3": { "hashes": { "hash_examples": "19677fd1773f7eb9", "hash_full_prompts": "ae0a8fd428f9aee3", "hash_input_tokens": "bc3e442621b75177", "hash_cont_tokens": "b0bec51398a8ab33" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:geometric_shapes|3": { "hashes": { "hash_examples": "76c7b11a13cc72a9", "hash_full_prompts": "76633257f67207f9", "hash_input_tokens": "18d576df2960751d", "hash_cont_tokens": "e45ae98082e3f8e4" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_five_objects|3": { "hashes": { "hash_examples": "0e958c856332a745", "hash_full_prompts": "3c96645848786efd", "hash_input_tokens": "36a60b866a1bf813", "hash_cont_tokens": "d0fd54910c263084" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_seven_objects|3": { "hashes": { "hash_examples": "ab9de25a5eb40d09", "hash_full_prompts": "185c5851c101ee66", "hash_input_tokens": "c1e2e1d71455bb49", "hash_cont_tokens": "be34a7e0cad54ac6" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_three_objects|3": { "hashes": { "hash_examples": "3c6bf52517714218", "hash_full_prompts": "8ba2d94357e589d0", "hash_input_tokens": "70f1b3c78b924815", "hash_cont_tokens": "9809821397c70a07" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:movie_recommendation|3": { "hashes": { "hash_examples": "2d9dc4975935d31a", "hash_full_prompts": "a411e216d0f5f626", "hash_input_tokens": "d671ce3b88ee45cd", "hash_cont_tokens": "bb2b8b6208282129" }, "truncated": 249, "non_truncated": 0, "padded": 0, "non_padded": 249, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:navigate|3": { "hashes": { "hash_examples": "ba91dcdb9a064255", "hash_full_prompts": "ebb3084ecc78a46a", "hash_input_tokens": "51743c1fef4a5482", "hash_cont_tokens": "be308b10012c18c5" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:reasoning_about_colored_objects|3": { "hashes": { "hash_examples": "a6ba328c4c3385d2", "hash_full_prompts": "38328d016a4ebef3", "hash_input_tokens": "6897c18acd616cb9", "hash_cont_tokens": "41e24bff8937bc90" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:ruin_names|3": { "hashes": { "hash_examples": "2ef28d5f2d4fdd25", "hash_full_prompts": "9c7d0493c37182d6", "hash_input_tokens": "4d618e950c8d013d", "hash_cont_tokens": "af8b825ca94f816b" }, "truncated": 248, "non_truncated": 0, "padded": 0, "non_padded": 248, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:salient_translation_error_detection|3": { "hashes": { "hash_examples": "c13f25ec8ffed496", "hash_full_prompts": "edccd4061b168b78", "hash_input_tokens": "fcdd25281b1eba05", "hash_cont_tokens": "23b483e6c7f2007c" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:snarks|3": { "hashes": { "hash_examples": "5f6db7bff7f6f22e", "hash_full_prompts": "31cafd95ab850a44", "hash_input_tokens": "16886c991ce348c1", "hash_cont_tokens": "f03d63eceacec321" }, "truncated": 178, "non_truncated": 0, "padded": 0, "non_padded": 178, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:sports_understanding|3": { "hashes": { "hash_examples": "042afbe5d9c1f02d", "hash_full_prompts": "3d46581e9bbec2d0", "hash_input_tokens": "117a2d8c0e6cb894", "hash_cont_tokens": "ffeb0de5394f482b" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:temporal_sequences|3": { "hashes": { "hash_examples": "803a05f352eb6afc", "hash_full_prompts": "4a54db144a5dd222", "hash_input_tokens": "70740a88f84e4a13", "hash_cont_tokens": "e734dc7f2a09b368" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "hashes": { "hash_examples": "2bbac6db7ab0d527", "hash_full_prompts": "e3079106787cc311", "hash_input_tokens": "c8d7203b8c369cb8", "hash_cont_tokens": "4906a9453f52f313" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "hashes": { "hash_examples": "845caf093ac2b58c", "hash_full_prompts": "6364e5b860590ec8", "hash_input_tokens": "beb0b08cec3d048f", "hash_cont_tokens": "ea591550a02af892" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "hashes": { "hash_examples": "9004f14d5a32b9a8", "hash_full_prompts": "01aef56c4d1fe9fe", "hash_input_tokens": "9642e09abf045647", "hash_cont_tokens": "423b3e314913c518" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "4ff1e3dc5703575d", "hash_full_prompts": "1cbeab0a00117cb8", "hash_input_tokens": "2a35e6d8f7c2fc79", "hash_cont_tokens": "7c03f3b4f9d962b4" }, "truncated": 4362, "non_truncated": 0, "padded": 0, "non_padded": 4362, "num_truncated_few_shots": 0 } }