open-r1-eval-leaderboard
/
eval_results
/deepseek-ai
/deepseek-llm-67b-chat
/main
/bbh
/results_2024-03-28T16-35-56.180836.json
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 2078650.217096255, | |
"end_time": 2079474.923861768, | |
"total_evaluation_time_secondes": "824.7067655131686", | |
"model_name": "deepseek-ai/deepseek-llm-67b-chat", | |
"model_sha": "79648bef7658bb824e4630740f6e1484c1b0620b", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "125.77 GB", | |
"config": null | |
}, | |
"results": { | |
"lighteval|bigbench:causal_judgment|0": { | |
"acc": 0.6578947368421053, | |
"acc_stderr": 0.03450858738901066 | |
}, | |
"lighteval|bigbench:date_understanding|0": { | |
"acc": 0.6829268292682927, | |
"acc_stderr": 0.02425732605238151 | |
}, | |
"lighteval|bigbench:disambiguation_qa|0": { | |
"acc": 0.4069767441860465, | |
"acc_stderr": 0.030644609905429086 | |
}, | |
"lighteval|bigbench:geometric_shapes|0": { | |
"acc": 0.1527777777777778, | |
"acc_stderr": 0.018988101273123323 | |
}, | |
"lighteval|bigbench:logical_deduction_five_objects|0": { | |
"acc": 0.484, | |
"acc_stderr": 0.0223716109825804 | |
}, | |
"lighteval|bigbench:logical_deduction_seven_objects|0": { | |
"acc": 0.4014285714285714, | |
"acc_stderr": 0.018540589716944327 | |
}, | |
"lighteval|bigbench:logical_deduction_three_objects|0": { | |
"acc": 0.6866666666666666, | |
"acc_stderr": 0.026825059139630406 | |
}, | |
"lighteval|bigbench:movie_recommendation|0": { | |
"acc": 0.742, | |
"acc_stderr": 0.01958671178521584 | |
}, | |
"lighteval|bigbench:navigate|0": { | |
"acc": 0.5, | |
"acc_stderr": 0.015819299929208316 | |
}, | |
"lighteval|bigbench:reasoning_about_colored_objects|0": { | |
"acc": 0.624, | |
"acc_stderr": 0.010833775211931938 | |
}, | |
"lighteval|bigbench:ruin_names|0": { | |
"acc": 0.6205357142857143, | |
"acc_stderr": 0.022951711861293245 | |
}, | |
"lighteval|bigbench:salient_translation_error_detection|0": { | |
"acc": 0.5260521042084169, | |
"acc_stderr": 0.015813649242772447 | |
}, | |
"lighteval|bigbench:snarks|0": { | |
"acc": 0.6022099447513812, | |
"acc_stderr": 0.036480826561810335 | |
}, | |
"lighteval|bigbench:sports_understanding|0": { | |
"acc": 0.69, | |
"acc_stderr": 0.014632638658632896 | |
}, | |
"lighteval|bigbench:temporal_sequences|0": { | |
"acc": 0.809, | |
"acc_stderr": 0.012436787112179512 | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_five_objects|0": { | |
"acc": 0.2, | |
"acc_stderr": 0.011318236699485788 | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": { | |
"acc": 0.14285714285714285, | |
"acc_stderr": 0.008367248752248818 | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_three_objects|0": { | |
"acc": 0.6866666666666666, | |
"acc_stderr": 0.026825059139630406 | |
}, | |
"lighteval|bigbench:_average|0": { | |
"acc": 0.5342218277188212, | |
"acc_stderr": 0.020622323856306066 | |
}, | |
"all": { | |
"acc": 0.5342218277188212, | |
"acc_stderr": 0.020622323856306066 | |
} | |
}, | |
"versions": { | |
"lighteval|bigbench:causal_judgment|0": 0, | |
"lighteval|bigbench:date_understanding|0": 0, | |
"lighteval|bigbench:disambiguation_qa|0": 0, | |
"lighteval|bigbench:geometric_shapes|0": 0, | |
"lighteval|bigbench:logical_deduction_five_objects|0": 0, | |
"lighteval|bigbench:logical_deduction_seven_objects|0": 0, | |
"lighteval|bigbench:logical_deduction_three_objects|0": 0, | |
"lighteval|bigbench:movie_recommendation|0": 0, | |
"lighteval|bigbench:navigate|0": 0, | |
"lighteval|bigbench:reasoning_about_colored_objects|0": 0, | |
"lighteval|bigbench:ruin_names|0": 0, | |
"lighteval|bigbench:salient_translation_error_detection|0": 0, | |
"lighteval|bigbench:snarks|0": 0, | |
"lighteval|bigbench:sports_understanding|0": 0, | |
"lighteval|bigbench:temporal_sequences|0": 0, | |
"lighteval|bigbench:tracking_shuffled_objects_five_objects|0": 0, | |
"lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": 0, | |
"lighteval|bigbench:tracking_shuffled_objects_three_objects|0": 0 | |
}, | |
"config_tasks": { | |
"lighteval|bigbench:causal_judgment": { | |
"name": "bigbench:causal_judgment", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "causal_judgement", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 190, | |
"effective_num_docs": 190, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:date_understanding": { | |
"name": "bigbench:date_understanding", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "date_understanding", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 369, | |
"effective_num_docs": 369, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:disambiguation_qa": { | |
"name": "bigbench:disambiguation_qa", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "disambiguation_qa", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 258, | |
"effective_num_docs": 258, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:geometric_shapes": { | |
"name": "bigbench:geometric_shapes", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "geometric_shapes", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 360, | |
"effective_num_docs": 360, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:logical_deduction_five_objects": { | |
"name": "bigbench:logical_deduction_five_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "logical_deduction_five_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 500, | |
"effective_num_docs": 500, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:logical_deduction_seven_objects": { | |
"name": "bigbench:logical_deduction_seven_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "logical_deduction_seven_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 700, | |
"effective_num_docs": 700, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:logical_deduction_three_objects": { | |
"name": "bigbench:logical_deduction_three_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "logical_deduction_three_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 300, | |
"effective_num_docs": 300, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:movie_recommendation": { | |
"name": "bigbench:movie_recommendation", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "movie_recommendation", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 500, | |
"effective_num_docs": 500, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:navigate": { | |
"name": "bigbench:navigate", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "navigate", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 1000, | |
"effective_num_docs": 1000, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:reasoning_about_colored_objects": { | |
"name": "bigbench:reasoning_about_colored_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "reasoning_about_colored_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 2000, | |
"effective_num_docs": 2000, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:ruin_names": { | |
"name": "bigbench:ruin_names", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "ruin_names", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 448, | |
"effective_num_docs": 448, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:salient_translation_error_detection": { | |
"name": "bigbench:salient_translation_error_detection", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "salient_translation_error_detection", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 998, | |
"effective_num_docs": 998, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:snarks": { | |
"name": "bigbench:snarks", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "snarks", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 181, | |
"effective_num_docs": 181, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:sports_understanding": { | |
"name": "bigbench:sports_understanding", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "sports_understanding", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 1000, | |
"effective_num_docs": 1000, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:temporal_sequences": { | |
"name": "bigbench:temporal_sequences", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "temporal_sequences", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 1000, | |
"effective_num_docs": 1000, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_five_objects": { | |
"name": "bigbench:tracking_shuffled_objects_five_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "tracking_shuffled_objects_five_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 1250, | |
"effective_num_docs": 1250, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_seven_objects": { | |
"name": "bigbench:tracking_shuffled_objects_seven_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "tracking_shuffled_objects_seven_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 1750, | |
"effective_num_docs": 1750, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_three_objects": { | |
"name": "bigbench:tracking_shuffled_objects_three_objects", | |
"prompt_function": "bbh_lighteval", | |
"hf_repo": "lighteval/bbh", | |
"hf_subset": "tracking_shuffled_objects_three_objects", | |
"metric": [ | |
"loglikelihood_acc_single_token" | |
], | |
"hf_avail_splits": [ | |
"train" | |
], | |
"evaluation_splits": [ | |
"train" | |
], | |
"few_shots_split": null, | |
"few_shots_select": null, | |
"generation_size": -1, | |
"stop_sequence": [ | |
"</s>", | |
"Q:", | |
"\n\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 300, | |
"effective_num_docs": 300, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
} | |
}, | |
"summary_tasks": { | |
"lighteval|bigbench:causal_judgment|0": { | |
"hashes": { | |
"hash_examples": "dfb1ae47218f2850", | |
"hash_full_prompts": "3267f89e916494a1", | |
"hash_input_tokens": "551f06b7209adc90", | |
"hash_cont_tokens": "f81e6c56654a19cf" | |
}, | |
"truncated": 0, | |
"non_truncated": 190, | |
"padded": 189, | |
"non_padded": 1, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:date_understanding|0": { | |
"hashes": { | |
"hash_examples": "2b823c41500a6ec2", | |
"hash_full_prompts": "343ea3c4f61a90c0", | |
"hash_input_tokens": "44353352781224bd", | |
"hash_cont_tokens": "495d23b33427ae6c" | |
}, | |
"truncated": 0, | |
"non_truncated": 369, | |
"padded": 369, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:disambiguation_qa|0": { | |
"hashes": { | |
"hash_examples": "2a4c3d41db198cea", | |
"hash_full_prompts": "b1890e35960e97a7", | |
"hash_input_tokens": "2441cb049a9d2208", | |
"hash_cont_tokens": "6dee300a0dbe5313" | |
}, | |
"truncated": 0, | |
"non_truncated": 258, | |
"padded": 245, | |
"non_padded": 13, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:geometric_shapes|0": { | |
"hashes": { | |
"hash_examples": "24aa261103911b72", | |
"hash_full_prompts": "23f158937304157a", | |
"hash_input_tokens": "a472d27bda102f13", | |
"hash_cont_tokens": "5110dc8ff998b2ee" | |
}, | |
"truncated": 0, | |
"non_truncated": 360, | |
"padded": 360, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:logical_deduction_five_objects|0": { | |
"hashes": { | |
"hash_examples": "cb5bdc92afc41f83", | |
"hash_full_prompts": "0cc35d5773991738", | |
"hash_input_tokens": "e0fe863a118f6d8b", | |
"hash_cont_tokens": "a644b000a112ebc2" | |
}, | |
"truncated": 0, | |
"non_truncated": 500, | |
"padded": 496, | |
"non_padded": 4, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:logical_deduction_seven_objects|0": { | |
"hashes": { | |
"hash_examples": "b6805ea696739f9f", | |
"hash_full_prompts": "c7b7686724c858fd", | |
"hash_input_tokens": "34c7501b7e6d78fe", | |
"hash_cont_tokens": "f12b97ac6e28ea3d" | |
}, | |
"truncated": 0, | |
"non_truncated": 700, | |
"padded": 700, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:logical_deduction_three_objects|0": { | |
"hashes": { | |
"hash_examples": "0509e5712ab9bcdb", | |
"hash_full_prompts": "bcc24281a028c785", | |
"hash_input_tokens": "9d4b9314ca135e4e", | |
"hash_cont_tokens": "f62e1a7656c207aa" | |
}, | |
"truncated": 0, | |
"non_truncated": 300, | |
"padded": 264, | |
"non_padded": 36, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:movie_recommendation|0": { | |
"hashes": { | |
"hash_examples": "530cc6f737830f45", | |
"hash_full_prompts": "15f7a8899e7ba75a", | |
"hash_input_tokens": "8bd81d504e8e688f", | |
"hash_cont_tokens": "337988ba0f6b6159" | |
}, | |
"truncated": 0, | |
"non_truncated": 500, | |
"padded": 497, | |
"non_padded": 3, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:navigate|0": { | |
"hashes": { | |
"hash_examples": "7962ef85d0058b9a", | |
"hash_full_prompts": "f71a25ebd38cdfa7", | |
"hash_input_tokens": "98b7793fe8d25de5", | |
"hash_cont_tokens": "52126cbdda5b7719" | |
}, | |
"truncated": 0, | |
"non_truncated": 1000, | |
"padded": 978, | |
"non_padded": 22, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:reasoning_about_colored_objects|0": { | |
"hashes": { | |
"hash_examples": "39be1ab1677a651d", | |
"hash_full_prompts": "1f5d03fba2ad5b7a", | |
"hash_input_tokens": "c59b64e65afdd37a", | |
"hash_cont_tokens": "f08abeb367e8b2c4" | |
}, | |
"truncated": 0, | |
"non_truncated": 2000, | |
"padded": 1980, | |
"non_padded": 20, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:ruin_names|0": { | |
"hashes": { | |
"hash_examples": "e9b96b31d2154941", | |
"hash_full_prompts": "37edbe93a8709ff0", | |
"hash_input_tokens": "5cb87272dc424d37", | |
"hash_cont_tokens": "4e632cbeace7c04a" | |
}, | |
"truncated": 0, | |
"non_truncated": 448, | |
"padded": 444, | |
"non_padded": 4, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:salient_translation_error_detection|0": { | |
"hashes": { | |
"hash_examples": "951ac59f7ad0427d", | |
"hash_full_prompts": "2686b351cac866f7", | |
"hash_input_tokens": "a2808c8a427480da", | |
"hash_cont_tokens": "c5fdca3e6ef9c743" | |
}, | |
"truncated": 0, | |
"non_truncated": 998, | |
"padded": 998, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:snarks|0": { | |
"hashes": { | |
"hash_examples": "3a53eb9b9d758534", | |
"hash_full_prompts": "6f8eece5fd0dabc9", | |
"hash_input_tokens": "dee5792e4032a046", | |
"hash_cont_tokens": "9e61eef34693f0a4" | |
}, | |
"truncated": 0, | |
"non_truncated": 181, | |
"padded": 180, | |
"non_padded": 1, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:sports_understanding|0": { | |
"hashes": { | |
"hash_examples": "bd65741f00770373", | |
"hash_full_prompts": "91322620078e3db5", | |
"hash_input_tokens": "047ff3bce135ceeb", | |
"hash_cont_tokens": "be1bbc43d3727d88" | |
}, | |
"truncated": 0, | |
"non_truncated": 1000, | |
"padded": 1000, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:temporal_sequences|0": { | |
"hashes": { | |
"hash_examples": "1d13139f47cb2df7", | |
"hash_full_prompts": "763570b80e9dafec", | |
"hash_input_tokens": "c2ee956ced8b5255", | |
"hash_cont_tokens": "4af1d64788ed1c7d" | |
}, | |
"truncated": 0, | |
"non_truncated": 1000, | |
"padded": 1000, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_five_objects|0": { | |
"hashes": { | |
"hash_examples": "8770a702a9646648", | |
"hash_full_prompts": "b9f4f902cee3b466", | |
"hash_input_tokens": "348e8701e58c07f6", | |
"hash_cont_tokens": "753dcf65729047fb" | |
}, | |
"truncated": 0, | |
"non_truncated": 1250, | |
"padded": 1180, | |
"non_padded": 70, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_seven_objects|0": { | |
"hashes": { | |
"hash_examples": "b469b7d073824a59", | |
"hash_full_prompts": "7b8ae4e5acd55b67", | |
"hash_input_tokens": "8cb38aea04e2b133", | |
"hash_cont_tokens": "b4d99448b3461186" | |
}, | |
"truncated": 0, | |
"non_truncated": 1750, | |
"padded": 1363, | |
"non_padded": 387, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|bigbench:tracking_shuffled_objects_three_objects|0": { | |
"hashes": { | |
"hash_examples": "0509e5712ab9bcdb", | |
"hash_full_prompts": "bcc24281a028c785", | |
"hash_input_tokens": "9d4b9314ca135e4e", | |
"hash_cont_tokens": "341f7abb6a0b7f12" | |
}, | |
"truncated": 0, | |
"non_truncated": 300, | |
"padded": 264, | |
"non_padded": 36, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "51a30c4501ba4586", | |
"hash_full_prompts": "259243f822a8efed", | |
"hash_input_tokens": "947abaf2981ff5af", | |
"hash_cont_tokens": "adb97ea2bc1a2656" | |
}, | |
"truncated": 0, | |
"non_truncated": 13104, | |
"padded": 12507, | |
"non_padded": 597, | |
"num_truncated_few_shots": 0 | |
} | |
} |