{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "", "start_time": 390765.741940286, "end_time": 391011.411535347, "total_evaluation_time_secondes": "245.66959506098647", "model_name": "HuggingFaceH4/zephyr-7b-beta-ift", "model_sha": "841a9223d05bc50686a493fbe441f0ef91b406b4", "model_dtype": "torch.bfloat16", "model_size": "13.99 GB", "config": null }, "results": { "harness|bbh:causal_judgment|3": { "em": 0.5026737967914439, "em_stderr": 0.03666125454759919, "qem": 0.5026737967914439, "qem_stderr": 0.03666125454759919, "pem": 0.5026737967914439, "pem_stderr": 0.03666125454759919, "pqem": 0.5026737967914439, "pqem_stderr": 0.03666125454759919, "perfect_em": 0.5026737967914439, "perfect_em_stderr": 0.03666125454759919 }, "harness|bbh:date_understanding|3": { "em": 0.456, "em_stderr": 0.03156328506121339, "qem": 0.456, "qem_stderr": 0.03156328506121339, "pem": 0.456, "pem_stderr": 0.03156328506121339, "pqem": 0.56, "pqem_stderr": 0.031457244522235646, "perfect_em": 0.456, "perfect_em_stderr": 0.03156328506121339 }, "harness|bbh:disambiguation_qa|3": { "em": 0.564, "em_stderr": 0.03142556706028128, "qem": 0.564, "qem_stderr": 0.03142556706028128, "pem": 0.572, "pem_stderr": 0.03135596892377261, "pqem": 0.696, "pqem_stderr": 0.029150213374159677, "perfect_em": 0.564, "perfect_em_stderr": 0.03142556706028128 }, "harness|bbh:geometric_shapes|3": { "em": 0.1, "em_stderr": 0.019011727515734357, "qem": 0.1, "qem_stderr": 0.019011727515734357, "pem": 0.2, "pem_stderr": 0.025348970020979078, "pqem": 0.2, "pqem_stderr": 0.025348970020979078, "perfect_em": 0.1, "perfect_em_stderr": 0.019011727515734357 }, "harness|bbh:logical_deduction_five_objects|3": { "em": 0.336, "em_stderr": 0.029933259094191516, "qem": 0.336, "qem_stderr": 0.029933259094191516, "pem": 0.336, "pem_stderr": 0.029933259094191516, "pqem": 0.456, "pqem_stderr": 0.03156328506121339, "perfect_em": 0.336, "perfect_em_stderr": 0.029933259094191516 }, "harness|bbh:logical_deduction_seven_objects|3": { "em": 0.256, "em_stderr": 0.027657108718204915, "qem": 0.256, "qem_stderr": 0.027657108718204915, "pem": 0.256, "pem_stderr": 0.027657108718204915, "pqem": 0.392, "pqem_stderr": 0.030938207620401195, "perfect_em": 0.256, "perfect_em_stderr": 0.027657108718204915 }, "harness|bbh:logical_deduction_three_objects|3": { "em": 0.436, "em_stderr": 0.03142556706028128, "qem": 0.436, "qem_stderr": 0.03142556706028128, "pem": 0.436, "pem_stderr": 0.03142556706028128, "pqem": 0.684, "pqem_stderr": 0.029462657598578676, "perfect_em": 0.436, "perfect_em_stderr": 0.03142556706028128 }, "harness|bbh:movie_recommendation|3": { "em": 0.642570281124498, "em_stderr": 0.030431951782206115, "qem": 0.642570281124498, "qem_stderr": 0.030431951782206115, "pem": 0.642570281124498, "pem_stderr": 0.030431951782206115, "pqem": 0.7469879518072289, "pqem_stderr": 0.027605877680456924, "perfect_em": 0.642570281124498, "perfect_em_stderr": 0.030431951782206115 }, "harness|bbh:navigate|3": { "em": 0.596, "em_stderr": 0.031096688184825298, "qem": 0.596, "qem_stderr": 0.031096688184825298, "pem": 0.596, "pem_stderr": 0.031096688184825298, "pqem": 0.596, "pqem_stderr": 0.031096688184825298, "perfect_em": 0.596, "perfect_em_stderr": 0.031096688184825298 }, "harness|bbh:reasoning_about_colored_objects|3": { "em": 0.092, "em_stderr": 0.018316275379429644, "qem": 0.092, "qem_stderr": 0.018316275379429644, "pem": 0.304, "pem_stderr": 0.029150213374159673, "pqem": 0.472, "pqem_stderr": 0.031636489531544396, "perfect_em": 0.092, "perfect_em_stderr": 0.018316275379429644 }, "harness|bbh:ruin_names|3": { "em": 0.3629032258064516, "em_stderr": 0.030594942459036583, "qem": 0.3629032258064516, "qem_stderr": 0.030594942459036583, "pem": 0.375, "pem_stderr": 0.03080400363063401, "pqem": 0.5483870967741935, "pqem_stderr": 0.03166491365125692, "perfect_em": 0.3629032258064516, "perfect_em_stderr": 0.030594942459036583 }, "harness|bbh:salient_translation_error_detection|3": { "em": 0.324, "em_stderr": 0.029658294924545567, "qem": 0.324, "qem_stderr": 0.029658294924545567, "pem": 0.324, "pem_stderr": 0.029658294924545567, "pqem": 0.448, "pqem_stderr": 0.03151438761115355, "perfect_em": 0.324, "perfect_em_stderr": 0.029658294924545567 }, "harness|bbh:snarks|3": { "em": 0.5674157303370787, "em_stderr": 0.037239120377075136, "qem": 0.5674157303370787, "qem_stderr": 0.037239120377075136, "pem": 0.5730337078651685, "pem_stderr": 0.03717921762559316, "pqem": 0.6348314606741573, "pqem_stderr": 0.03619005678691266, "perfect_em": 0.5674157303370787, "perfect_em_stderr": 0.037239120377075136 }, "harness|bbh:sports_understanding|3": { "em": 0.688, "em_stderr": 0.029361067575219817, "qem": 0.688, "qem_stderr": 0.029361067575219817, "pem": 0.768, "pem_stderr": 0.026750070374865164, "pqem": 0.768, "pqem_stderr": 0.026750070374865164, "perfect_em": 0.688, "perfect_em_stderr": 0.029361067575219817 }, "harness|bbh:temporal_sequences|3": { "em": 0.208, "em_stderr": 0.025721398901416392, "qem": 0.208, "qem_stderr": 0.025721398901416392, "pem": 0.208, "pem_stderr": 0.025721398901416392, "pqem": 0.468, "pqem_stderr": 0.03162125257572551, "perfect_em": 0.208, "perfect_em_stderr": 0.025721398901416392 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "em": 0.196, "em_stderr": 0.02515685731325595, "qem": 0.196, "qem_stderr": 0.02515685731325595, "pem": 0.196, "pem_stderr": 0.02515685731325595, "pqem": 0.396, "pqem_stderr": 0.030993197854577853, "perfect_em": 0.196, "perfect_em_stderr": 0.02515685731325595 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "em": 0.132, "em_stderr": 0.02145098082403812, "qem": 0.132, "qem_stderr": 0.02145098082403812, "pem": 0.132, "pem_stderr": 0.02145098082403812, "pqem": 0.272, "pqem_stderr": 0.02820008829631, "perfect_em": 0.132, "perfect_em_stderr": 0.02145098082403812 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "em": 0.308, "em_stderr": 0.029256928606501864, "qem": 0.308, "qem_stderr": 0.029256928606501864, "pem": 0.36, "pem_stderr": 0.030418764025174978, "pqem": 0.672, "pqem_stderr": 0.02975239182447539, "perfect_em": 0.308, "perfect_em_stderr": 0.029256928606501864 }, "harness|bbh:_average|3": { "em": 0.37597572411441504, "em_stderr": 0.028664570854725364, "qem": 0.37597572411441504, "qem_stderr": 0.028664570854725364, "pem": 0.4020709880989506, "pem_stderr": 0.02954243635483091, "pqem": 0.5284933503359459, "pqem_stderr": 0.030644847062070588, "perfect_em": 0.37597572411441504, "perfect_em_stderr": 0.028664570854725364 } }, "versions": { "harness|bbh:causal_judgment|3": 0, "harness|bbh:date_understanding|3": 0, "harness|bbh:disambiguation_qa|3": 0, "harness|bbh:geometric_shapes|3": 0, "harness|bbh:logical_deduction_five_objects|3": 0, "harness|bbh:logical_deduction_seven_objects|3": 0, "harness|bbh:logical_deduction_three_objects|3": 0, "harness|bbh:movie_recommendation|3": 0, "harness|bbh:navigate|3": 0, "harness|bbh:reasoning_about_colored_objects|3": 0, "harness|bbh:ruin_names|3": 0, "harness|bbh:salient_translation_error_detection|3": 0, "harness|bbh:snarks|3": 0, "harness|bbh:sports_understanding|3": 0, "harness|bbh:temporal_sequences|3": 0, "harness|bbh:tracking_shuffled_objects_five_objects|3": 0, "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0, "harness|bbh:tracking_shuffled_objects_three_objects|3": 0 }, "config_tasks": { "harness|bbh:causal_judgment": { "name": "bbh:causal_judgment", "prompt_function": "bbh_causal_judgment", "hf_repo": "lukaemon/bbh", "hf_subset": "causal_judgement", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 187, "effective_num_docs": 187, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:date_understanding": { "name": "bbh:date_understanding", "prompt_function": "bbh_date_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "date_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:disambiguation_qa": { "name": "bbh:disambiguation_qa", "prompt_function": "bbh_disambiguation_qa", "hf_repo": "lukaemon/bbh", "hf_subset": "disambiguation_qa", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:geometric_shapes": { "name": "bbh:geometric_shapes", "prompt_function": "bbh_geometric_shapes", "hf_repo": "lukaemon/bbh", "hf_subset": "geometric_shapes", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_five_objects": { "name": "bbh:logical_deduction_five_objects", "prompt_function": "bbh_logical_deduction_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_seven_objects": { "name": "bbh:logical_deduction_seven_objects", "prompt_function": "bbh_logical_deduction_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_three_objects": { "name": "bbh:logical_deduction_three_objects", "prompt_function": "bbh_logical_deduction_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:movie_recommendation": { "name": "bbh:movie_recommendation", "prompt_function": "bbh_movie_recommendation", "hf_repo": "lukaemon/bbh", "hf_subset": "movie_recommendation", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 249, "effective_num_docs": 249, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:navigate": { "name": "bbh:navigate", "prompt_function": "bbh_navigate", "hf_repo": "lukaemon/bbh", "hf_subset": "navigate", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:reasoning_about_colored_objects": { "name": "bbh:reasoning_about_colored_objects", "prompt_function": "bbh_reasoning_about_colored_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "reasoning_about_colored_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:ruin_names": { "name": "bbh:ruin_names", "prompt_function": "bbh_ruin_names", "hf_repo": "lukaemon/bbh", "hf_subset": "ruin_names", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 248, "effective_num_docs": 248, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:salient_translation_error_detection": { "name": "bbh:salient_translation_error_detection", "prompt_function": "bbh_salient_translation_error_detection", "hf_repo": "lukaemon/bbh", "hf_subset": "salient_translation_error_detection", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:snarks": { "name": "bbh:snarks", "prompt_function": "bbh_snarks", "hf_repo": "lukaemon/bbh", "hf_subset": "snarks", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 178, "effective_num_docs": 178, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:sports_understanding": { "name": "bbh:sports_understanding", "prompt_function": "bbh_sports_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "sports_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:temporal_sequences": { "name": "bbh:temporal_sequences", "prompt_function": "bbh_temporal_sequences", "hf_repo": "lukaemon/bbh", "hf_subset": "temporal_sequences", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_five_objects": { "name": "bbh:tracking_shuffled_objects_five_objects", "prompt_function": "bbh_tracking_shuffled_objects_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_seven_objects": { "name": "bbh:tracking_shuffled_objects_seven_objects", "prompt_function": "bbh_tracking_shuffled_objects_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_three_objects": { "name": "bbh:tracking_shuffled_objects_three_objects", "prompt_function": "bbh_tracking_shuffled_objects_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "harness|bbh:causal_judgment|3": { "hashes": { "hash_examples": "63218f5ae055ab2b", "hash_full_prompts": "fa8168f39a475fb0", "hash_input_tokens": "787f75e06fd43c0d", "hash_cont_tokens": "d38fce6b83cb6cf8" }, "truncated": 187, "non_truncated": 0, "padded": 0, "non_padded": 187, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:date_understanding|3": { "hashes": { "hash_examples": "f145c7a06def3c8e", "hash_full_prompts": "2cceeea606638d49", "hash_input_tokens": "10c13d6fb8af7c22", "hash_cont_tokens": "afe1a458d01d6b49" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:disambiguation_qa|3": { "hashes": { "hash_examples": "19677fd1773f7eb9", "hash_full_prompts": "d8f1ba70c22ae578", "hash_input_tokens": "c21a88707f480cab", "hash_cont_tokens": "975af0d0edb5e548" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:geometric_shapes|3": { "hashes": { "hash_examples": "76c7b11a13cc72a9", "hash_full_prompts": "52a60ed1d0113b8b", "hash_input_tokens": "10e113b2cf3fa584", "hash_cont_tokens": "d7bd15f16aa3a69e" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_five_objects|3": { "hashes": { "hash_examples": "0e958c856332a745", "hash_full_prompts": "253aa9791c941909", "hash_input_tokens": "0bc166cab0aed76a", "hash_cont_tokens": "33afca6b1b153349" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_seven_objects|3": { "hashes": { "hash_examples": "ab9de25a5eb40d09", "hash_full_prompts": "aa6117f601cd268e", "hash_input_tokens": "ab99c78b48e3a0bb", "hash_cont_tokens": "cb0a829037414bc4" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_three_objects|3": { "hashes": { "hash_examples": "3c6bf52517714218", "hash_full_prompts": "1892b050bc7848a4", "hash_input_tokens": "a720b56aa7c52551", "hash_cont_tokens": "df1c7a934ff4d4fd" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:movie_recommendation|3": { "hashes": { "hash_examples": "2d9dc4975935d31a", "hash_full_prompts": "8e00606ed3407167", "hash_input_tokens": "c825ab1c99245a17", "hash_cont_tokens": "8993eba556c19cb1" }, "truncated": 249, "non_truncated": 0, "padded": 0, "non_padded": 249, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:navigate|3": { "hashes": { "hash_examples": "ba91dcdb9a064255", "hash_full_prompts": "8d50c5baf1df7aef", "hash_input_tokens": "f234e6b28ea1fa49", "hash_cont_tokens": "525142c03c67fa52" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:reasoning_about_colored_objects|3": { "hashes": { "hash_examples": "a6ba328c4c3385d2", "hash_full_prompts": "3d2441a21c12a960", "hash_input_tokens": "f3b577892955aa84", "hash_cont_tokens": "70ffb1d11ef7cb70" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:ruin_names|3": { "hashes": { "hash_examples": "2ef28d5f2d4fdd25", "hash_full_prompts": "ba95caa786f313b1", "hash_input_tokens": "9954b30d4205604a", "hash_cont_tokens": "117e9fc172132861" }, "truncated": 248, "non_truncated": 0, "padded": 0, "non_padded": 248, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:salient_translation_error_detection|3": { "hashes": { "hash_examples": "c13f25ec8ffed496", "hash_full_prompts": "a8512d174e1cab8f", "hash_input_tokens": "3e738df24b7eddf8", "hash_cont_tokens": "78a8d7de4e56c3f6" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:snarks|3": { "hashes": { "hash_examples": "5f6db7bff7f6f22e", "hash_full_prompts": "ff91d81466b9041f", "hash_input_tokens": "21388b09e13d0208", "hash_cont_tokens": "25008d3d52e836a9" }, "truncated": 178, "non_truncated": 0, "padded": 0, "non_padded": 178, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:sports_understanding|3": { "hashes": { "hash_examples": "042afbe5d9c1f02d", "hash_full_prompts": "a59324d9eb37e0f5", "hash_input_tokens": "0ad41bb8d2290a5b", "hash_cont_tokens": "481132373d21794f" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:temporal_sequences|3": { "hashes": { "hash_examples": "803a05f352eb6afc", "hash_full_prompts": "1b3971192bf481e7", "hash_input_tokens": "3051b60940ccceab", "hash_cont_tokens": "bc6468999bd8da8a" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "hashes": { "hash_examples": "2bbac6db7ab0d527", "hash_full_prompts": "7ef4567d2fcf5094", "hash_input_tokens": "b841310ee5531238", "hash_cont_tokens": "614b8cc82e5424f3" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "hashes": { "hash_examples": "845caf093ac2b58c", "hash_full_prompts": "196a0f8712857624", "hash_input_tokens": "3e738df24b7eddf8", "hash_cont_tokens": "08ee1d03a1ca9cfa" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "hashes": { "hash_examples": "9004f14d5a32b9a8", "hash_full_prompts": "592a03f0518f17b6", "hash_input_tokens": "19e0ef1dd5ae9d33", "hash_cont_tokens": "ff481f64829c6e9d" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "4ff1e3dc5703575d", "hash_full_prompts": "0d80ce968d89d4ef", "hash_input_tokens": "72bda1e7aeb34786", "hash_cont_tokens": "5e3e2b1aa7251ed7" }, "truncated": 4362, "non_truncated": 0, "padded": 0, "non_padded": 4362, "num_truncated_few_shots": 0 } }