open-r1-eval-leaderboard
/
eval_results
/deepseek-ai
/deepseek-llm-67b-chat
/main
/bbh
/results_2024-03-18T20-51-01.093533.json
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 2102963.406405784, | |
"end_time": 2104805.02724996, | |
"total_evaluation_time_secondes": "1841.620844176039", | |
"model_name": "deepseek-ai/deepseek-llm-67b-chat", | |
"model_sha": "79648bef7658bb824e4630740f6e1484c1b0620b", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "125.78 GB", | |
"config": null | |
}, | |
"results": { | |
"harness|bbh:causal_judgment|3": { | |
"em": 0.679144385026738, | |
"em_stderr": 0.03422783320926155, | |
"qem": 0.679144385026738, | |
"qem_stderr": 0.03422783320926155, | |
"pem": 0.679144385026738, | |
"pem_stderr": 0.03422783320926155, | |
"pqem": 0.679144385026738, | |
"pqem_stderr": 0.03422783320926155, | |
"perfect_em": 0.679144385026738, | |
"perfect_em_stderr": 0.03422783320926155 | |
}, | |
"harness|bbh:date_understanding|3": { | |
"em": 0.584, | |
"em_stderr": 0.03123585623701455, | |
"qem": 0.584, | |
"qem_stderr": 0.03123585623701455, | |
"pem": 0.584, | |
"pem_stderr": 0.03123585623701455, | |
"pqem": 0.624, | |
"pqem_stderr": 0.03069633626739458, | |
"perfect_em": 0.584, | |
"perfect_em_stderr": 0.03123585623701455 | |
}, | |
"harness|bbh:disambiguation_qa|3": { | |
"em": 0.728, | |
"em_stderr": 0.02820008829630998, | |
"qem": 0.728, | |
"qem_stderr": 0.02820008829630998, | |
"pem": 0.728, | |
"pem_stderr": 0.02820008829630998, | |
"pqem": 0.768, | |
"pqem_stderr": 0.02675007037486516, | |
"perfect_em": 0.728, | |
"perfect_em_stderr": 0.02820008829630998 | |
}, | |
"harness|bbh:geometric_shapes|3": { | |
"em": 0.396, | |
"em_stderr": 0.03099319785457785, | |
"qem": 0.396, | |
"qem_stderr": 0.03099319785457785, | |
"pem": 0.396, | |
"pem_stderr": 0.03099319785457785, | |
"pqem": 0.396, | |
"pqem_stderr": 0.03099319785457785, | |
"perfect_em": 0.396, | |
"perfect_em_stderr": 0.03099319785457785 | |
}, | |
"harness|bbh:logical_deduction_five_objects|3": { | |
"em": 0.52, | |
"em_stderr": 0.031660853408495185, | |
"qem": 0.52, | |
"qem_stderr": 0.031660853408495185, | |
"pem": 0.544, | |
"pem_stderr": 0.03156328506121339, | |
"pqem": 0.608, | |
"pqem_stderr": 0.030938207620401195, | |
"perfect_em": 0.52, | |
"perfect_em_stderr": 0.031660853408495185 | |
}, | |
"harness|bbh:logical_deduction_seven_objects|3": { | |
"em": 0.548, | |
"em_stderr": 0.03153986449255662, | |
"qem": 0.548, | |
"qem_stderr": 0.03153986449255662, | |
"pem": 0.548, | |
"pem_stderr": 0.03153986449255662, | |
"pqem": 0.584, | |
"pqem_stderr": 0.031235856237014546, | |
"perfect_em": 0.516, | |
"perfect_em_stderr": 0.03166998503010742 | |
}, | |
"harness|bbh:logical_deduction_three_objects|3": { | |
"em": 0.756, | |
"em_stderr": 0.027217995464553182, | |
"qem": 0.756, | |
"qem_stderr": 0.027217995464553182, | |
"pem": 0.756, | |
"pem_stderr": 0.027217995464553182, | |
"pqem": 0.832, | |
"pqem_stderr": 0.02369281320549259, | |
"perfect_em": 0.736, | |
"perfect_em_stderr": 0.027934518957690908 | |
}, | |
"harness|bbh:movie_recommendation|3": { | |
"em": 0.7951807228915663, | |
"em_stderr": 0.025626688837598215, | |
"qem": 0.7951807228915663, | |
"qem_stderr": 0.025626688837598215, | |
"pem": 0.7951807228915663, | |
"pem_stderr": 0.025626688837598215, | |
"pqem": 0.8273092369477911, | |
"pqem_stderr": 0.02400173577465152, | |
"perfect_em": 0.19678714859437751, | |
"perfect_em_stderr": 0.02524572518448647 | |
}, | |
"harness|bbh:navigate|3": { | |
"em": 0.644, | |
"em_stderr": 0.030343680657153208, | |
"qem": 0.644, | |
"qem_stderr": 0.030343680657153208, | |
"pem": 0.644, | |
"pem_stderr": 0.030343680657153208, | |
"pqem": 0.644, | |
"pqem_stderr": 0.030343680657153208, | |
"perfect_em": 0.088, | |
"perfect_em_stderr": 0.017953084777052864 | |
}, | |
"harness|bbh:reasoning_about_colored_objects|3": { | |
"em": 0.396, | |
"em_stderr": 0.03099319785457785, | |
"qem": 0.396, | |
"qem_stderr": 0.03099319785457785, | |
"pem": 0.68, | |
"pem_stderr": 0.02956172495524104, | |
"pqem": 0.696, | |
"pqem_stderr": 0.029150213374159677, | |
"perfect_em": 0.38, | |
"perfect_em_stderr": 0.030760116042626042 | |
}, | |
"harness|bbh:ruin_names|3": { | |
"em": 0.7258064516129032, | |
"em_stderr": 0.028385108031064067, | |
"qem": 0.7258064516129032, | |
"qem_stderr": 0.028385108031064067, | |
"pem": 0.7258064516129032, | |
"pem_stderr": 0.028385108031064067, | |
"pqem": 0.7782258064516129, | |
"pqem_stderr": 0.026433814730987795, | |
"perfect_em": 0.012096774193548387, | |
"perfect_em_stderr": 0.006955742755304476 | |
}, | |
"harness|bbh:salient_translation_error_detection|3": { | |
"em": 0.0, | |
"em_stderr": 0.0, | |
"qem": 0.14, | |
"qem_stderr": 0.021989409645240265, | |
"pem": 0.0, | |
"pem_stderr": 0.0, | |
"pqem": 0.14, | |
"pqem_stderr": 0.021989409645240265, | |
"perfect_em": 0.0, | |
"perfect_em_stderr": 0.0 | |
}, | |
"harness|bbh:snarks|3": { | |
"em": 0.7303370786516854, | |
"em_stderr": 0.03335689818443928, | |
"qem": 0.7303370786516854, | |
"qem_stderr": 0.03335689818443928, | |
"pem": 0.7303370786516854, | |
"pem_stderr": 0.03335689818443928, | |
"pqem": 0.7359550561797753, | |
"pqem_stderr": 0.0331343107765884, | |
"perfect_em": 0.7303370786516854, | |
"perfect_em_stderr": 0.03335689818443928 | |
}, | |
"harness|bbh:sports_understanding|3": { | |
"em": 0.816, | |
"em_stderr": 0.02455581299422256, | |
"qem": 0.816, | |
"qem_stderr": 0.02455581299422256, | |
"pem": 0.816, | |
"pem_stderr": 0.02455581299422256, | |
"pqem": 0.816, | |
"pqem_stderr": 0.02455581299422256, | |
"perfect_em": 0.368, | |
"perfect_em_stderr": 0.030562070620993167 | |
}, | |
"harness|bbh:temporal_sequences|3": { | |
"em": 0.66, | |
"em_stderr": 0.03002007360545791, | |
"qem": 0.66, | |
"qem_stderr": 0.03002007360545791, | |
"pem": 0.66, | |
"pem_stderr": 0.03002007360545791, | |
"pqem": 0.736, | |
"pqem_stderr": 0.02793451895769091, | |
"perfect_em": 0.66, | |
"perfect_em_stderr": 0.03002007360545791 | |
}, | |
"harness|bbh:tracking_shuffled_objects_five_objects|3": { | |
"em": 0.176, | |
"em_stderr": 0.024133497525457126, | |
"qem": 0.176, | |
"qem_stderr": 0.024133497525457126, | |
"pem": 0.176, | |
"pem_stderr": 0.024133497525457126, | |
"pqem": 0.36, | |
"pqem_stderr": 0.030418764025175005, | |
"perfect_em": 0.164, | |
"perfect_em_stderr": 0.023465261002076764 | |
}, | |
"harness|bbh:tracking_shuffled_objects_seven_objects|3": { | |
"em": 0.144, | |
"em_stderr": 0.02224940773545021, | |
"qem": 0.144, | |
"qem_stderr": 0.02224940773545021, | |
"pem": 0.144, | |
"pem_stderr": 0.02224940773545021, | |
"pqem": 0.264, | |
"pqem_stderr": 0.02793451895769091, | |
"perfect_em": 0.144, | |
"perfect_em_stderr": 0.02224940773545021 | |
}, | |
"harness|bbh:tracking_shuffled_objects_three_objects|3": { | |
"em": 0.324, | |
"em_stderr": 0.029658294924545567, | |
"qem": 0.324, | |
"qem_stderr": 0.029658294924545567, | |
"pem": 0.324, | |
"pem_stderr": 0.029658294924545567, | |
"pqem": 0.604, | |
"pqem_stderr": 0.030993197854577853, | |
"perfect_em": 0.324, | |
"perfect_em_stderr": 0.029658294924545567 | |
}, | |
"harness|bbh:_average|3": { | |
"em": 0.5345815910101608, | |
"em_stderr": 0.027466574961818604, | |
"qem": 0.5423593687879386, | |
"qem_stderr": 0.028688208830998613, | |
"pem": 0.5516927021212719, | |
"pem_stderr": 0.02738162822589535, | |
"pqem": 0.6162574713669954, | |
"pqem_stderr": 0.028634682917619194, | |
"perfect_em": 0.40146474369257495, | |
"perfect_em_stderr": 0.02589716710143834 | |
} | |
}, | |
"versions": { | |
"harness|bbh:causal_judgment|3": 0, | |
"harness|bbh:date_understanding|3": 0, | |
"harness|bbh:disambiguation_qa|3": 0, | |
"harness|bbh:geometric_shapes|3": 0, | |
"harness|bbh:logical_deduction_five_objects|3": 0, | |
"harness|bbh:logical_deduction_seven_objects|3": 0, | |
"harness|bbh:logical_deduction_three_objects|3": 0, | |
"harness|bbh:movie_recommendation|3": 0, | |
"harness|bbh:navigate|3": 0, | |
"harness|bbh:reasoning_about_colored_objects|3": 0, | |
"harness|bbh:ruin_names|3": 0, | |
"harness|bbh:salient_translation_error_detection|3": 0, | |
"harness|bbh:snarks|3": 0, | |
"harness|bbh:sports_understanding|3": 0, | |
"harness|bbh:temporal_sequences|3": 0, | |
"harness|bbh:tracking_shuffled_objects_five_objects|3": 0, | |
"harness|bbh:tracking_shuffled_objects_seven_objects|3": 0, | |
"harness|bbh:tracking_shuffled_objects_three_objects|3": 0 | |
}, | |
"config_tasks": { | |
"harness|bbh:causal_judgment": { | |
"name": "bbh:causal_judgment", | |
"prompt_function": "bbh_causal_judgment", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "causal_judgement", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 187, | |
"effective_num_docs": 187, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:date_understanding": { | |
"name": "bbh:date_understanding", | |
"prompt_function": "bbh_date_understanding", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "date_understanding", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:disambiguation_qa": { | |
"name": "bbh:disambiguation_qa", | |
"prompt_function": "bbh_disambiguation_qa", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "disambiguation_qa", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:geometric_shapes": { | |
"name": "bbh:geometric_shapes", | |
"prompt_function": "bbh_geometric_shapes", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "geometric_shapes", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:logical_deduction_five_objects": { | |
"name": "bbh:logical_deduction_five_objects", | |
"prompt_function": "bbh_logical_deduction_five_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "logical_deduction_five_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:logical_deduction_seven_objects": { | |
"name": "bbh:logical_deduction_seven_objects", | |
"prompt_function": "bbh_logical_deduction_seven_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "logical_deduction_seven_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:logical_deduction_three_objects": { | |
"name": "bbh:logical_deduction_three_objects", | |
"prompt_function": "bbh_logical_deduction_three_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "logical_deduction_three_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:movie_recommendation": { | |
"name": "bbh:movie_recommendation", | |
"prompt_function": "bbh_movie_recommendation", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "movie_recommendation", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 249, | |
"effective_num_docs": 249, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:navigate": { | |
"name": "bbh:navigate", | |
"prompt_function": "bbh_navigate", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "navigate", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:reasoning_about_colored_objects": { | |
"name": "bbh:reasoning_about_colored_objects", | |
"prompt_function": "bbh_reasoning_about_colored_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "reasoning_about_colored_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:ruin_names": { | |
"name": "bbh:ruin_names", | |
"prompt_function": "bbh_ruin_names", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "ruin_names", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 248, | |
"effective_num_docs": 248, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:salient_translation_error_detection": { | |
"name": "bbh:salient_translation_error_detection", | |
"prompt_function": "bbh_salient_translation_error_detection", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "salient_translation_error_detection", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:snarks": { | |
"name": "bbh:snarks", | |
"prompt_function": "bbh_snarks", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "snarks", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 178, | |
"effective_num_docs": 178, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:sports_understanding": { | |
"name": "bbh:sports_understanding", | |
"prompt_function": "bbh_sports_understanding", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "sports_understanding", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:temporal_sequences": { | |
"name": "bbh:temporal_sequences", | |
"prompt_function": "bbh_temporal_sequences", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "temporal_sequences", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:tracking_shuffled_objects_five_objects": { | |
"name": "bbh:tracking_shuffled_objects_five_objects", | |
"prompt_function": "bbh_tracking_shuffled_objects_five_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "tracking_shuffled_objects_five_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:tracking_shuffled_objects_seven_objects": { | |
"name": "bbh:tracking_shuffled_objects_seven_objects", | |
"prompt_function": "bbh_tracking_shuffled_objects_seven_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "tracking_shuffled_objects_seven_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"harness|bbh:tracking_shuffled_objects_three_objects": { | |
"name": "bbh:tracking_shuffled_objects_three_objects", | |
"prompt_function": "bbh_tracking_shuffled_objects_three_objects", | |
"hf_repo": "lukaemon/bbh", | |
"hf_subset": "tracking_shuffled_objects_three_objects", | |
"metric": [ | |
"exact_match", | |
"quasi_exact_match", | |
"prefix_exact_match", | |
"prefix_quasi_exact_match", | |
"perfect_exact_match" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": 20, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"harness" | |
], | |
"original_num_docs": 250, | |
"effective_num_docs": 250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
} | |
}, | |
"summary_tasks": { | |
"harness|bbh:causal_judgment|3": { | |
"hashes": { | |
"hash_examples": "63218f5ae055ab2b", | |
"hash_full_prompts": "7b3130db56d6c0fd", | |
"hash_input_tokens": "f8efde17bea946d2", | |
"hash_cont_tokens": "d37448ab93db95b1" | |
}, | |
"truncated": 187, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 187, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:date_understanding|3": { | |
"hashes": { | |
"hash_examples": "f145c7a06def3c8e", | |
"hash_full_prompts": "4ef8884637b3809e", | |
"hash_input_tokens": "5778509e2dd239a6", | |
"hash_cont_tokens": "5ecfd8648d552a1a" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:disambiguation_qa|3": { | |
"hashes": { | |
"hash_examples": "19677fd1773f7eb9", | |
"hash_full_prompts": "e6b8b850e951cac5", | |
"hash_input_tokens": "edf9fb8be63599e1", | |
"hash_cont_tokens": "c54552eef1fa451d" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:geometric_shapes|3": { | |
"hashes": { | |
"hash_examples": "76c7b11a13cc72a9", | |
"hash_full_prompts": "752ce3137089f8c6", | |
"hash_input_tokens": "12582ad81492ef30", | |
"hash_cont_tokens": "d56c664d2c11cd19" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:logical_deduction_five_objects|3": { | |
"hashes": { | |
"hash_examples": "0e958c856332a745", | |
"hash_full_prompts": "04b42c78c37c2813", | |
"hash_input_tokens": "da3e0045d3f3fafe", | |
"hash_cont_tokens": "69cb7afedacf5f7b" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:logical_deduction_seven_objects|3": { | |
"hashes": { | |
"hash_examples": "ab9de25a5eb40d09", | |
"hash_full_prompts": "b1d113750d20009f", | |
"hash_input_tokens": "df1bb83566c636a6", | |
"hash_cont_tokens": "339406ab1ca6147a" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:logical_deduction_three_objects|3": { | |
"hashes": { | |
"hash_examples": "3c6bf52517714218", | |
"hash_full_prompts": "26b8c55342d9c4cf", | |
"hash_input_tokens": "3902df954d0bbbf6", | |
"hash_cont_tokens": "484cb12287c23c1b" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:movie_recommendation|3": { | |
"hashes": { | |
"hash_examples": "2d9dc4975935d31a", | |
"hash_full_prompts": "5a5bbe39920b2853", | |
"hash_input_tokens": "cfa615bbbe8fbe5b", | |
"hash_cont_tokens": "b8492ca44f4f9976" | |
}, | |
"truncated": 249, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 249, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:navigate|3": { | |
"hashes": { | |
"hash_examples": "ba91dcdb9a064255", | |
"hash_full_prompts": "6c38d2c35525307e", | |
"hash_input_tokens": "e3586c675eab91a3", | |
"hash_cont_tokens": "f08526a682cea3c5" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:reasoning_about_colored_objects|3": { | |
"hashes": { | |
"hash_examples": "a6ba328c4c3385d2", | |
"hash_full_prompts": "7724e6385b13466e", | |
"hash_input_tokens": "36264630f65ca9af", | |
"hash_cont_tokens": "cd21981956203b63" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:ruin_names|3": { | |
"hashes": { | |
"hash_examples": "2ef28d5f2d4fdd25", | |
"hash_full_prompts": "de2860d91060e8cf", | |
"hash_input_tokens": "d534c8cd1fa81993", | |
"hash_cont_tokens": "f492d9eaf26baf7d" | |
}, | |
"truncated": 248, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 248, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:salient_translation_error_detection|3": { | |
"hashes": { | |
"hash_examples": "c13f25ec8ffed496", | |
"hash_full_prompts": "bfeac8f44f2117f0", | |
"hash_input_tokens": "24d43ac989f713ce", | |
"hash_cont_tokens": "2b28b75e34c92c4f" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:snarks|3": { | |
"hashes": { | |
"hash_examples": "5f6db7bff7f6f22e", | |
"hash_full_prompts": "022143757f9d7f99", | |
"hash_input_tokens": "ed30c15894919e4e", | |
"hash_cont_tokens": "ce2c25baaa7e7cfc" | |
}, | |
"truncated": 178, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 178, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:sports_understanding|3": { | |
"hashes": { | |
"hash_examples": "042afbe5d9c1f02d", | |
"hash_full_prompts": "f3f183b8ef29393f", | |
"hash_input_tokens": "120b77b188037444", | |
"hash_cont_tokens": "67d3d3e547d2c39c" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:temporal_sequences|3": { | |
"hashes": { | |
"hash_examples": "803a05f352eb6afc", | |
"hash_full_prompts": "3b400ac37a7e7f79", | |
"hash_input_tokens": "7fe89bf91e79edeb", | |
"hash_cont_tokens": "3a259a21d881e2ac" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:tracking_shuffled_objects_five_objects|3": { | |
"hashes": { | |
"hash_examples": "2bbac6db7ab0d527", | |
"hash_full_prompts": "0be369e720528ba7", | |
"hash_input_tokens": "b64c57bce739e4cc", | |
"hash_cont_tokens": "d65c940a0f16349a" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:tracking_shuffled_objects_seven_objects|3": { | |
"hashes": { | |
"hash_examples": "845caf093ac2b58c", | |
"hash_full_prompts": "dbcfb3ab05af1ef6", | |
"hash_input_tokens": "b027927cdbbbb24b", | |
"hash_cont_tokens": "612e581930a8058b" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|bbh:tracking_shuffled_objects_three_objects|3": { | |
"hashes": { | |
"hash_examples": "9004f14d5a32b9a8", | |
"hash_full_prompts": "c1a14c9574076ae9", | |
"hash_input_tokens": "c0f7f8b2212f6262", | |
"hash_cont_tokens": "aecf7a548a9a0ad3" | |
}, | |
"truncated": 250, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 250, | |
"effective_few_shots": 3.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "4ff1e3dc5703575d", | |
"hash_full_prompts": "83884b9ce63636a6", | |
"hash_input_tokens": "e84655995ed10f64", | |
"hash_cont_tokens": "f8147ec7bed415e6" | |
}, | |
"truncated": 4362, | |
"non_truncated": 0, | |
"padded": 0, | |
"non_padded": 4362, | |
"num_truncated_few_shots": 0 | |
} | |
} |