{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "", "start_time": 562.475003076, "end_time": 831.651632499, "total_evaluation_time_secondes": "269.176629423", "model_name": "HuggingFaceH4/zephyr-7b-beta-ift", "model_sha": "066d418a66341c2f15e3ffc90b17b6478f5773f5", "model_dtype": "torch.bfloat16", "model_size": "13.99 GB", "config": null }, "results": { "harness|bbh:causal_judgment|3": { "em": 0.5240641711229946, "em_stderr": 0.03661929361528703, "qem": 0.5240641711229946, "qem_stderr": 0.03661929361528703, "pem": 0.5828877005347594, "pem_stderr": 0.03615450931140827, "pqem": 0.5828877005347594, "pqem_stderr": 0.03615450931140827, "perfect_em": 0.5240641711229946, "perfect_em_stderr": 0.03661929361528703 }, "harness|bbh:date_understanding|3": { "em": 0.488, "em_stderr": 0.03167708558254709, "qem": 0.488, "qem_stderr": 0.03167708558254709, "pem": 0.488, "pem_stderr": 0.03167708558254709, "pqem": 0.572, "pqem_stderr": 0.0313559689237726, "perfect_em": 0.488, "perfect_em_stderr": 0.03167708558254709 }, "harness|bbh:disambiguation_qa|3": { "em": 0.636, "em_stderr": 0.030491555220405555, "qem": 0.636, "qem_stderr": 0.030491555220405555, "pem": 0.636, "pem_stderr": 0.030491555220405555, "pqem": 0.7, "pqem_stderr": 0.029040893477575845, "perfect_em": 0.636, "perfect_em_stderr": 0.030491555220405555 }, "harness|bbh:geometric_shapes|3": { "em": 0.26, "em_stderr": 0.0277973157526443, "qem": 0.26, "qem_stderr": 0.0277973157526443, "pem": 0.26, "pem_stderr": 0.0277973157526443, "pqem": 0.26, "pqem_stderr": 0.0277973157526443, "perfect_em": 0.26, "perfect_em_stderr": 0.0277973157526443 }, "harness|bbh:logical_deduction_five_objects|3": { "em": 0.368, "em_stderr": 0.030562070620993163, "qem": 0.368, "qem_stderr": 0.030562070620993163, "pem": 0.368, "pem_stderr": 0.030562070620993163, "pqem": 0.432, "pqem_stderr": 0.031391810765429407, "perfect_em": 0.368, "perfect_em_stderr": 0.030562070620993163 }, "harness|bbh:logical_deduction_seven_objects|3": { "em": 0.296, "em_stderr": 0.028928939388379624, "qem": 0.296, "qem_stderr": 0.028928939388379624, "pem": 0.296, "pem_stderr": 0.028928939388379624, "pqem": 0.4, "pqem_stderr": 0.03104602102825324, "perfect_em": 0.296, "perfect_em_stderr": 0.028928939388379624 }, "harness|bbh:logical_deduction_three_objects|3": { "em": 0.532, "em_stderr": 0.031621252575725504, "qem": 0.532, "qem_stderr": 0.031621252575725504, "pem": 0.532, "pem_stderr": 0.031621252575725504, "pqem": 0.704, "pqem_stderr": 0.02892893938837963, "perfect_em": 0.532, "perfect_em_stderr": 0.031621252575725504 }, "harness|bbh:movie_recommendation|3": { "em": 0.7389558232931727, "em_stderr": 0.027889479062834757, "qem": 0.7389558232931727, "qem_stderr": 0.027889479062834757, "pem": 0.7389558232931727, "pem_stderr": 0.027889479062834757, "pqem": 0.7951807228915663, "pqem_stderr": 0.02562668883759821, "perfect_em": 0.7389558232931727, "perfect_em_stderr": 0.027889479062834757 }, "harness|bbh:navigate|3": { "em": 0.6, "em_stderr": 0.031046021028253257, "qem": 0.6, "qem_stderr": 0.031046021028253257, "pem": 0.6, "pem_stderr": 0.031046021028253257, "pqem": 0.6, "pqem_stderr": 0.031046021028253257, "perfect_em": 0.6, "perfect_em_stderr": 0.031046021028253257 }, "harness|bbh:reasoning_about_colored_objects|3": { "em": 0.376, "em_stderr": 0.030696336267394583, "qem": 0.376, "qem_stderr": 0.030696336267394583, "pem": 0.376, "pem_stderr": 0.030696336267394583, "pqem": 0.512, "pqem_stderr": 0.03167708558254709, "perfect_em": 0.376, "perfect_em_stderr": 0.030696336267394583 }, "harness|bbh:ruin_names|3": { "em": 0.36693548387096775, "em_stderr": 0.03066693445085009, "qem": 0.36693548387096775, "qem_stderr": 0.03066693445085009, "pem": 0.36693548387096775, "pem_stderr": 0.03066693445085009, "pqem": 0.4475806451612903, "pqem_stderr": 0.03163891746142308, "perfect_em": 0.36693548387096775, "perfect_em_stderr": 0.03066693445085009 }, "harness|bbh:salient_translation_error_detection|3": { "em": 0.356, "em_stderr": 0.030343680657153215, "qem": 0.356, "qem_stderr": 0.030343680657153215, "pem": 0.356, "pem_stderr": 0.030343680657153215, "pqem": 0.48, "pqem_stderr": 0.031660853408495185, "perfect_em": 0.356, "perfect_em_stderr": 0.030343680657153215 }, "harness|bbh:snarks|3": { "em": 0.4887640449438202, "em_stderr": 0.03757281091983852, "qem": 0.4887640449438202, "qem_stderr": 0.03757281091983852, "pem": 0.4887640449438202, "pem_stderr": 0.03757281091983852, "pqem": 0.550561797752809, "pqem_stderr": 0.03738964966056965, "perfect_em": 0.4887640449438202, "perfect_em_stderr": 0.03757281091983852 }, "harness|bbh:sports_understanding|3": { "em": 0.324, "em_stderr": 0.029658294924545563, "qem": 0.324, "qem_stderr": 0.029658294924545563, "pem": 0.792, "pem_stderr": 0.025721398901416392, "pqem": 0.792, "pqem_stderr": 0.025721398901416392, "perfect_em": 0.324, "perfect_em_stderr": 0.029658294924545563 }, "harness|bbh:temporal_sequences|3": { "em": 0.088, "em_stderr": 0.017953084777052868, "qem": 0.088, "qem_stderr": 0.017953084777052868, "pem": 0.088, "pem_stderr": 0.017953084777052868, "pqem": 0.344, "pqem_stderr": 0.030104503392316385, "perfect_em": 0.088, "perfect_em_stderr": 0.017953084777052868 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "em": 0.2, "em_stderr": 0.02534897002097908, "qem": 0.2, "qem_stderr": 0.02534897002097908, "pem": 0.2, "pem_stderr": 0.02534897002097908, "pqem": 0.396, "pqem_stderr": 0.030993197854577857, "perfect_em": 0.2, "perfect_em_stderr": 0.02534897002097908 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "em": 0.096, "em_stderr": 0.018668961419477173, "qem": 0.096, "qem_stderr": 0.018668961419477173, "pem": 0.096, "pem_stderr": 0.018668961419477173, "pqem": 0.232, "pqem_stderr": 0.02675007037486516, "perfect_em": 0.096, "perfect_em_stderr": 0.018668961419477173 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "em": 0.388, "em_stderr": 0.030881038748993915, "qem": 0.388, "qem_stderr": 0.030881038748993915, "pem": 0.388, "pem_stderr": 0.030881038748993915, "pqem": 0.696, "pqem_stderr": 0.029150213374159673, "perfect_em": 0.388, "perfect_em_stderr": 0.030881038748993915 }, "harness|bbh:_average|3": { "em": 0.3959288624017197, "em_stderr": 0.029356840279630852, "qem": 0.3959288624017197, "qem_stderr": 0.029356840279630852, "pem": 0.4251968362579289, "pem_stderr": 0.029112302483685966, "pqem": 0.5275672703522458, "pqem_stderr": 0.03041522547353807, "perfect_em": 0.3959288624017197, "perfect_em_stderr": 0.029356840279630852 } }, "versions": { "harness|bbh:causal_judgment|3": 0, "harness|bbh:date_understanding|3": 0, "harness|bbh:disambiguation_qa|3": 0, "harness|bbh:geometric_shapes|3": 0, "harness|bbh:logical_deduction_five_objects|3": 0, "harness|bbh:logical_deduction_seven_objects|3": 0, "harness|bbh:logical_deduction_three_objects|3": 0, "harness|bbh:movie_recommendation|3": 0, "harness|bbh:navigate|3": 0, "harness|bbh:reasoning_about_colored_objects|3": 0, "harness|bbh:ruin_names|3": 0, "harness|bbh:salient_translation_error_detection|3": 0, "harness|bbh:snarks|3": 0, "harness|bbh:sports_understanding|3": 0, "harness|bbh:temporal_sequences|3": 0, "harness|bbh:tracking_shuffled_objects_five_objects|3": 0, "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0, "harness|bbh:tracking_shuffled_objects_three_objects|3": 0 }, "config_tasks": { "harness|bbh:causal_judgment": { "name": "bbh:causal_judgment", "prompt_function": "bbh_causal_judgment", "hf_repo": "lukaemon/bbh", "hf_subset": "causal_judgement", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 187, "effective_num_docs": 187, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:date_understanding": { "name": "bbh:date_understanding", "prompt_function": "bbh_date_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "date_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:disambiguation_qa": { "name": "bbh:disambiguation_qa", "prompt_function": "bbh_disambiguation_qa", "hf_repo": "lukaemon/bbh", "hf_subset": "disambiguation_qa", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:geometric_shapes": { "name": "bbh:geometric_shapes", "prompt_function": "bbh_geometric_shapes", "hf_repo": "lukaemon/bbh", "hf_subset": "geometric_shapes", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_five_objects": { "name": "bbh:logical_deduction_five_objects", "prompt_function": "bbh_logical_deduction_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_seven_objects": { "name": "bbh:logical_deduction_seven_objects", "prompt_function": "bbh_logical_deduction_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_three_objects": { "name": "bbh:logical_deduction_three_objects", "prompt_function": "bbh_logical_deduction_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:movie_recommendation": { "name": "bbh:movie_recommendation", "prompt_function": "bbh_movie_recommendation", "hf_repo": "lukaemon/bbh", "hf_subset": "movie_recommendation", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 249, "effective_num_docs": 249, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:navigate": { "name": "bbh:navigate", "prompt_function": "bbh_navigate", "hf_repo": "lukaemon/bbh", "hf_subset": "navigate", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:reasoning_about_colored_objects": { "name": "bbh:reasoning_about_colored_objects", "prompt_function": "bbh_reasoning_about_colored_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "reasoning_about_colored_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:ruin_names": { "name": "bbh:ruin_names", "prompt_function": "bbh_ruin_names", "hf_repo": "lukaemon/bbh", "hf_subset": "ruin_names", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 248, "effective_num_docs": 248, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:salient_translation_error_detection": { "name": "bbh:salient_translation_error_detection", "prompt_function": "bbh_salient_translation_error_detection", "hf_repo": "lukaemon/bbh", "hf_subset": "salient_translation_error_detection", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:snarks": { "name": "bbh:snarks", "prompt_function": "bbh_snarks", "hf_repo": "lukaemon/bbh", "hf_subset": "snarks", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 178, "effective_num_docs": 178, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:sports_understanding": { "name": "bbh:sports_understanding", "prompt_function": "bbh_sports_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "sports_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:temporal_sequences": { "name": "bbh:temporal_sequences", "prompt_function": "bbh_temporal_sequences", "hf_repo": "lukaemon/bbh", "hf_subset": "temporal_sequences", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_five_objects": { "name": "bbh:tracking_shuffled_objects_five_objects", "prompt_function": "bbh_tracking_shuffled_objects_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_seven_objects": { "name": "bbh:tracking_shuffled_objects_seven_objects", "prompt_function": "bbh_tracking_shuffled_objects_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_three_objects": { "name": "bbh:tracking_shuffled_objects_three_objects", "prompt_function": "bbh_tracking_shuffled_objects_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "harness|bbh:causal_judgment|3": { "hashes": { "hash_examples": "63218f5ae055ab2b", "hash_full_prompts": "7303fa1d0fe0b29a", "hash_input_tokens": "79663e73bb5ce6ac", "hash_cont_tokens": "9ac2a5d07673b3cc" }, "truncated": 187, "non_truncated": 0, "padded": 0, "non_padded": 187, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:date_understanding|3": { "hashes": { "hash_examples": "f145c7a06def3c8e", "hash_full_prompts": "69e60d10afa5a6f1", "hash_input_tokens": "e9bd5760c58a1104", "hash_cont_tokens": "a2d3c0bfaed524ab" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:disambiguation_qa|3": { "hashes": { "hash_examples": "19677fd1773f7eb9", "hash_full_prompts": "ae0a8fd428f9aee3", "hash_input_tokens": "b3625dcc25d708b2", "hash_cont_tokens": "ddaf838d3d035b59" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:geometric_shapes|3": { "hashes": { "hash_examples": "76c7b11a13cc72a9", "hash_full_prompts": "76633257f67207f9", "hash_input_tokens": "c16e8768d8c9056f", "hash_cont_tokens": "f22e578d2a631fbe" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_five_objects|3": { "hashes": { "hash_examples": "0e958c856332a745", "hash_full_prompts": "3c96645848786efd", "hash_input_tokens": "915443ee37f164dc", "hash_cont_tokens": "ed708a1ddb133844" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_seven_objects|3": { "hashes": { "hash_examples": "ab9de25a5eb40d09", "hash_full_prompts": "185c5851c101ee66", "hash_input_tokens": "66d532c31ef57236", "hash_cont_tokens": "59a054a54a61f53b" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_three_objects|3": { "hashes": { "hash_examples": "3c6bf52517714218", "hash_full_prompts": "8ba2d94357e589d0", "hash_input_tokens": "d51c6ad06efbf88b", "hash_cont_tokens": "ee91af80dae7a64e" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:movie_recommendation|3": { "hashes": { "hash_examples": "2d9dc4975935d31a", "hash_full_prompts": "a411e216d0f5f626", "hash_input_tokens": "e17a3080d43ae54f", "hash_cont_tokens": "42d07b8efeacd312" }, "truncated": 249, "non_truncated": 0, "padded": 0, "non_padded": 249, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:navigate|3": { "hashes": { "hash_examples": "ba91dcdb9a064255", "hash_full_prompts": "ebb3084ecc78a46a", "hash_input_tokens": "90854b0ca565c8f5", "hash_cont_tokens": "425b769b57c4208a" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:reasoning_about_colored_objects|3": { "hashes": { "hash_examples": "a6ba328c4c3385d2", "hash_full_prompts": "38328d016a4ebef3", "hash_input_tokens": "b45b5a8a531e8bf5", "hash_cont_tokens": "dc160e813848aa6d" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:ruin_names|3": { "hashes": { "hash_examples": "2ef28d5f2d4fdd25", "hash_full_prompts": "9c7d0493c37182d6", "hash_input_tokens": "627b6058879c9350", "hash_cont_tokens": "ff58bf3376badc2d" }, "truncated": 248, "non_truncated": 0, "padded": 0, "non_padded": 248, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:salient_translation_error_detection|3": { "hashes": { "hash_examples": "c13f25ec8ffed496", "hash_full_prompts": "edccd4061b168b78", "hash_input_tokens": "7d4d7e481ad8766b", "hash_cont_tokens": "0bee8a66c5c15931" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:snarks|3": { "hashes": { "hash_examples": "5f6db7bff7f6f22e", "hash_full_prompts": "31cafd95ab850a44", "hash_input_tokens": "616900bacd0ba7ca", "hash_cont_tokens": "83bb1be81de11c6c" }, "truncated": 178, "non_truncated": 0, "padded": 0, "non_padded": 178, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:sports_understanding|3": { "hashes": { "hash_examples": "042afbe5d9c1f02d", "hash_full_prompts": "3d46581e9bbec2d0", "hash_input_tokens": "8e9e99c22dd3a8d2", "hash_cont_tokens": "fae5a513c7d06269" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:temporal_sequences|3": { "hashes": { "hash_examples": "803a05f352eb6afc", "hash_full_prompts": "4a54db144a5dd222", "hash_input_tokens": "24789970b2290dd3", "hash_cont_tokens": "d9537f45453d55d2" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "hashes": { "hash_examples": "2bbac6db7ab0d527", "hash_full_prompts": "e3079106787cc311", "hash_input_tokens": "9036045cff895b08", "hash_cont_tokens": "0bdedbe4a1e50154" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "hashes": { "hash_examples": "845caf093ac2b58c", "hash_full_prompts": "6364e5b860590ec8", "hash_input_tokens": "7100c488aa0764ff", "hash_cont_tokens": "ffd50b95e2438212" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "hashes": { "hash_examples": "9004f14d5a32b9a8", "hash_full_prompts": "01aef56c4d1fe9fe", "hash_input_tokens": "b9690a5d32a586fc", "hash_cont_tokens": "9d481faa62b9a26a" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "4ff1e3dc5703575d", "hash_full_prompts": "1cbeab0a00117cb8", "hash_input_tokens": "3608679dab4ce40e", "hash_cont_tokens": "8116e774e04d9975" }, "truncated": 4362, "non_truncated": 0, "padded": 0, "non_padded": 4362, "num_truncated_few_shots": 0 } }