open-r1-eval-leaderboard/eval_results · abhishek/autotrain-llama3-70b-orpo-v1 · main · mmlu/results_2024-05-03T10-27-59.823724.json
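
Two consistency checks suggest themselves before reading the numbers below: the "leaderboard|mmlu:_average|5" entry (repeated verbatim under "all") appears to be the unweighted mean of the 57 per-task accuracies, and every reported "acc_stderr" matches the binomial standard error sqrt(p * (1 - p) / (n - 1)) with n taken from the per-task test-set sizes in "config_tasks" (e.g. abstract_algebra: sqrt(0.49 * 0.51 / 99) ≈ 0.050242, as reported). A minimal sketch of both checks, assuming the file has been downloaded locally under its original name; the stderr formula is inferred from the reported values, not taken from lighteval's source:

import json
import math

with open("results_2024-05-03T10-27-59.823724.json") as f:
    data = json.load(f)

# Drop the two aggregate entries; keep the 57 per-task scores.
per_task = {
    name: scores
    for name, scores in data["results"].items()
    if name not in ("all", "leaderboard|mmlu:_average|5")
}

# Unweighted macro average; the reported value is 0.7957951766493738.
macro = sum(s["acc"] for s in per_task.values()) / len(per_task)
print(f"macro average acc: {macro:.16f}")

# Cross-check each stderr against sqrt(p*(1-p)/(n-1)); the config key is
# the results key minus its "|5" few-shot suffix.
for name, scores in per_task.items():
    n = data["config_tasks"][name.rsplit("|", 1)[0]]["effective_num_docs"]
    p = scores["acc"]
    expected = math.sqrt(p * (1 - p) / (n - 1))
    assert math.isclose(expected, scores["acc_stderr"], rel_tol=1e-6), name
print("all 57 acc_stderr values consistent with the binomial formula")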
{
  "config_general": {
    "lighteval_sha": "?",
    "num_fewshot_seeds": 1,
    "override_batch_size": 4,
    "max_samples": null,
    "job_id": "",
    "start_time": 19631.841670262,
    "end_time": 32395.446210489,
    "total_evaluation_time_secondes": "12763.604540227",
    "model_name": "abhishek/autotrain-llama3-70b-orpo-v1",
    "model_sha": "053236c6846cc561c1503ba05e2b28c94855a432",
    "model_dtype": "torch.float16",
    "model_size": "131.73 GB",
    "config": null
  },
"results": { | |
"leaderboard|mmlu:abstract_algebra|5": { | |
"acc": 0.49, | |
"acc_stderr": 0.05024183937956913 | |
}, | |
"leaderboard|mmlu:anatomy|5": { | |
"acc": 0.7703703703703704, | |
"acc_stderr": 0.036333844140734636 | |
}, | |
"leaderboard|mmlu:astronomy|5": { | |
"acc": 0.9210526315789473, | |
"acc_stderr": 0.02194434281824793 | |
}, | |
"leaderboard|mmlu:business_ethics|5": { | |
"acc": 0.85, | |
"acc_stderr": 0.03588702812826371 | |
}, | |
"leaderboard|mmlu:clinical_knowledge|5": { | |
"acc": 0.8528301886792453, | |
"acc_stderr": 0.02180412613479738 | |
}, | |
"leaderboard|mmlu:college_biology|5": { | |
"acc": 0.9236111111111112, | |
"acc_stderr": 0.022212203938345918 | |
}, | |
"leaderboard|mmlu:college_chemistry|5": { | |
"acc": 0.58, | |
"acc_stderr": 0.049604496374885836 | |
}, | |
"leaderboard|mmlu:college_computer_science|5": { | |
"acc": 0.67, | |
"acc_stderr": 0.04725815626252606 | |
}, | |
"leaderboard|mmlu:college_mathematics|5": { | |
"acc": 0.55, | |
"acc_stderr": 0.049999999999999996 | |
}, | |
"leaderboard|mmlu:college_medicine|5": { | |
"acc": 0.7861271676300579, | |
"acc_stderr": 0.03126511206173044 | |
}, | |
"leaderboard|mmlu:college_physics|5": { | |
"acc": 0.5490196078431373, | |
"acc_stderr": 0.049512182523962604 | |
}, | |
"leaderboard|mmlu:computer_security|5": { | |
"acc": 0.8, | |
"acc_stderr": 0.04020151261036846 | |
}, | |
"leaderboard|mmlu:conceptual_physics|5": { | |
"acc": 0.8127659574468085, | |
"acc_stderr": 0.025501588341883596 | |
}, | |
"leaderboard|mmlu:econometrics|5": { | |
"acc": 0.7192982456140351, | |
"acc_stderr": 0.04227054451232199 | |
}, | |
"leaderboard|mmlu:electrical_engineering|5": { | |
"acc": 0.7517241379310344, | |
"acc_stderr": 0.03600105692727771 | |
}, | |
"leaderboard|mmlu:elementary_mathematics|5": { | |
"acc": 0.6746031746031746, | |
"acc_stderr": 0.024130158299762616 | |
}, | |
"leaderboard|mmlu:formal_logic|5": { | |
"acc": 0.6428571428571429, | |
"acc_stderr": 0.04285714285714281 | |
}, | |
"leaderboard|mmlu:global_facts|5": { | |
"acc": 0.59, | |
"acc_stderr": 0.04943110704237101 | |
}, | |
"leaderboard|mmlu:high_school_biology|5": { | |
"acc": 0.9129032258064517, | |
"acc_stderr": 0.01604110074169669 | |
}, | |
"leaderboard|mmlu:high_school_chemistry|5": { | |
"acc": 0.6600985221674877, | |
"acc_stderr": 0.033327690684107895 | |
}, | |
"leaderboard|mmlu:high_school_computer_science|5": { | |
"acc": 0.9, | |
"acc_stderr": 0.030151134457776348 | |
}, | |
"leaderboard|mmlu:high_school_european_history|5": { | |
"acc": 0.8787878787878788, | |
"acc_stderr": 0.025485498373343237 | |
}, | |
"leaderboard|mmlu:high_school_geography|5": { | |
"acc": 0.9242424242424242, | |
"acc_stderr": 0.018852670234993093 | |
}, | |
"leaderboard|mmlu:high_school_government_and_politics|5": { | |
"acc": 0.9844559585492227, | |
"acc_stderr": 0.00892749271508434 | |
}, | |
"leaderboard|mmlu:high_school_macroeconomics|5": { | |
"acc": 0.8358974358974359, | |
"acc_stderr": 0.01877843431342372 | |
}, | |
"leaderboard|mmlu:high_school_mathematics|5": { | |
"acc": 0.5259259259259259, | |
"acc_stderr": 0.03044452852881074 | |
}, | |
"leaderboard|mmlu:high_school_microeconomics|5": { | |
"acc": 0.8949579831932774, | |
"acc_stderr": 0.019916300758805225 | |
}, | |
"leaderboard|mmlu:high_school_physics|5": { | |
"acc": 0.6158940397350994, | |
"acc_stderr": 0.03971301814719197 | |
}, | |
"leaderboard|mmlu:high_school_psychology|5": { | |
"acc": 0.9486238532110092, | |
"acc_stderr": 0.009465168181022974 | |
}, | |
"leaderboard|mmlu:high_school_statistics|5": { | |
"acc": 0.7361111111111112, | |
"acc_stderr": 0.030058202704309846 | |
}, | |
"leaderboard|mmlu:high_school_us_history|5": { | |
"acc": 0.9313725490196079, | |
"acc_stderr": 0.017744453647073322 | |
}, | |
"leaderboard|mmlu:high_school_world_history|5": { | |
"acc": 0.9240506329113924, | |
"acc_stderr": 0.01724463325106569 | |
}, | |
"leaderboard|mmlu:human_aging|5": { | |
"acc": 0.8161434977578476, | |
"acc_stderr": 0.025998379092356517 | |
}, | |
"leaderboard|mmlu:human_sexuality|5": { | |
"acc": 0.8625954198473282, | |
"acc_stderr": 0.030194823996804475 | |
}, | |
"leaderboard|mmlu:international_law|5": { | |
"acc": 0.9090909090909091, | |
"acc_stderr": 0.026243194054073885 | |
}, | |
"leaderboard|mmlu:jurisprudence|5": { | |
"acc": 0.8703703703703703, | |
"acc_stderr": 0.03247224389917946 | |
}, | |
"leaderboard|mmlu:logical_fallacies|5": { | |
"acc": 0.8466257668711656, | |
"acc_stderr": 0.0283116014414386 | |
}, | |
"leaderboard|mmlu:machine_learning|5": { | |
"acc": 0.7142857142857143, | |
"acc_stderr": 0.042878587513404544 | |
}, | |
"leaderboard|mmlu:management|5": { | |
"acc": 0.912621359223301, | |
"acc_stderr": 0.027960689125970654 | |
}, | |
"leaderboard|mmlu:marketing|5": { | |
"acc": 0.9358974358974359, | |
"acc_stderr": 0.016046261631673137 | |
}, | |
"leaderboard|mmlu:medical_genetics|5": { | |
"acc": 0.93, | |
"acc_stderr": 0.0256432399976243 | |
}, | |
"leaderboard|mmlu:miscellaneous|5": { | |
"acc": 0.9233716475095786, | |
"acc_stderr": 0.009512170699323858 | |
}, | |
"leaderboard|mmlu:moral_disputes|5": { | |
"acc": 0.8208092485549133, | |
"acc_stderr": 0.020647590029679332 | |
}, | |
"leaderboard|mmlu:moral_scenarios|5": { | |
"acc": 0.7463687150837989, | |
"acc_stderr": 0.014551553659369918 | |
}, | |
"leaderboard|mmlu:nutrition|5": { | |
"acc": 0.869281045751634, | |
"acc_stderr": 0.019301873624215267 | |
}, | |
"leaderboard|mmlu:philosophy|5": { | |
"acc": 0.797427652733119, | |
"acc_stderr": 0.022827317491059693 | |
}, | |
"leaderboard|mmlu:prehistory|5": { | |
"acc": 0.9012345679012346, | |
"acc_stderr": 0.01660046080164534 | |
}, | |
"leaderboard|mmlu:professional_accounting|5": { | |
"acc": 0.6595744680851063, | |
"acc_stderr": 0.02826765748265014 | |
}, | |
"leaderboard|mmlu:professional_law|5": { | |
"acc": 0.6323337679269883, | |
"acc_stderr": 0.012314845910071712 | |
}, | |
"leaderboard|mmlu:professional_medicine|5": { | |
"acc": 0.8933823529411765, | |
"acc_stderr": 0.018747725509716835 | |
}, | |
"leaderboard|mmlu:professional_psychology|5": { | |
"acc": 0.8611111111111112, | |
"acc_stderr": 0.013990806277040208 | |
}, | |
"leaderboard|mmlu:public_relations|5": { | |
"acc": 0.7454545454545455, | |
"acc_stderr": 0.041723430387053825 | |
}, | |
"leaderboard|mmlu:security_studies|5": { | |
"acc": 0.8081632653061225, | |
"acc_stderr": 0.0252069631542254 | |
}, | |
"leaderboard|mmlu:sociology|5": { | |
"acc": 0.9154228855721394, | |
"acc_stderr": 0.019675343217199173 | |
}, | |
"leaderboard|mmlu:us_foreign_policy|5": { | |
"acc": 0.92, | |
"acc_stderr": 0.0272659924344291 | |
}, | |
"leaderboard|mmlu:virology|5": { | |
"acc": 0.572289156626506, | |
"acc_stderr": 0.03851597683718533 | |
}, | |
"leaderboard|mmlu:world_religions|5": { | |
"acc": 0.8888888888888888, | |
"acc_stderr": 0.024103384202072864 | |
}, | |
"leaderboard|mmlu:_average|5": { | |
"acc": 0.7957951766493738, | |
"acc_stderr": 0.0280984014309186 | |
}, | |
"all": { | |
"acc": 0.7957951766493738, | |
"acc_stderr": 0.0280984014309186 | |
} | |
}, | |
"versions": { | |
"leaderboard|mmlu:abstract_algebra|5": 0, | |
"leaderboard|mmlu:anatomy|5": 0, | |
"leaderboard|mmlu:astronomy|5": 0, | |
"leaderboard|mmlu:business_ethics|5": 0, | |
"leaderboard|mmlu:clinical_knowledge|5": 0, | |
"leaderboard|mmlu:college_biology|5": 0, | |
"leaderboard|mmlu:college_chemistry|5": 0, | |
"leaderboard|mmlu:college_computer_science|5": 0, | |
"leaderboard|mmlu:college_mathematics|5": 0, | |
"leaderboard|mmlu:college_medicine|5": 0, | |
"leaderboard|mmlu:college_physics|5": 0, | |
"leaderboard|mmlu:computer_security|5": 0, | |
"leaderboard|mmlu:conceptual_physics|5": 0, | |
"leaderboard|mmlu:econometrics|5": 0, | |
"leaderboard|mmlu:electrical_engineering|5": 0, | |
"leaderboard|mmlu:elementary_mathematics|5": 0, | |
"leaderboard|mmlu:formal_logic|5": 0, | |
"leaderboard|mmlu:global_facts|5": 0, | |
"leaderboard|mmlu:high_school_biology|5": 0, | |
"leaderboard|mmlu:high_school_chemistry|5": 0, | |
"leaderboard|mmlu:high_school_computer_science|5": 0, | |
"leaderboard|mmlu:high_school_european_history|5": 0, | |
"leaderboard|mmlu:high_school_geography|5": 0, | |
"leaderboard|mmlu:high_school_government_and_politics|5": 0, | |
"leaderboard|mmlu:high_school_macroeconomics|5": 0, | |
"leaderboard|mmlu:high_school_mathematics|5": 0, | |
"leaderboard|mmlu:high_school_microeconomics|5": 0, | |
"leaderboard|mmlu:high_school_physics|5": 0, | |
"leaderboard|mmlu:high_school_psychology|5": 0, | |
"leaderboard|mmlu:high_school_statistics|5": 0, | |
"leaderboard|mmlu:high_school_us_history|5": 0, | |
"leaderboard|mmlu:high_school_world_history|5": 0, | |
"leaderboard|mmlu:human_aging|5": 0, | |
"leaderboard|mmlu:human_sexuality|5": 0, | |
"leaderboard|mmlu:international_law|5": 0, | |
"leaderboard|mmlu:jurisprudence|5": 0, | |
"leaderboard|mmlu:logical_fallacies|5": 0, | |
"leaderboard|mmlu:machine_learning|5": 0, | |
"leaderboard|mmlu:management|5": 0, | |
"leaderboard|mmlu:marketing|5": 0, | |
"leaderboard|mmlu:medical_genetics|5": 0, | |
"leaderboard|mmlu:miscellaneous|5": 0, | |
"leaderboard|mmlu:moral_disputes|5": 0, | |
"leaderboard|mmlu:moral_scenarios|5": 0, | |
"leaderboard|mmlu:nutrition|5": 0, | |
"leaderboard|mmlu:philosophy|5": 0, | |
"leaderboard|mmlu:prehistory|5": 0, | |
"leaderboard|mmlu:professional_accounting|5": 0, | |
"leaderboard|mmlu:professional_law|5": 0, | |
"leaderboard|mmlu:professional_medicine|5": 0, | |
"leaderboard|mmlu:professional_psychology|5": 0, | |
"leaderboard|mmlu:public_relations|5": 0, | |
"leaderboard|mmlu:security_studies|5": 0, | |
"leaderboard|mmlu:sociology|5": 0, | |
"leaderboard|mmlu:us_foreign_policy|5": 0, | |
"leaderboard|mmlu:virology|5": 0, | |
"leaderboard|mmlu:world_religions|5": 0 | |
}, | |
"config_tasks": { | |
"leaderboard|mmlu:abstract_algebra": { | |
"name": "mmlu:abstract_algebra", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "abstract_algebra", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:anatomy": { | |
"name": "mmlu:anatomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "anatomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 135, | |
"effective_num_docs": 135, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:astronomy": { | |
"name": "mmlu:astronomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "astronomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 152, | |
"effective_num_docs": 152, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:business_ethics": { | |
"name": "mmlu:business_ethics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "business_ethics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:clinical_knowledge": { | |
"name": "mmlu:clinical_knowledge", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "clinical_knowledge", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 265, | |
"effective_num_docs": 265, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:college_biology": { | |
"name": "mmlu:college_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 144, | |
"effective_num_docs": 144, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:college_chemistry": { | |
"name": "mmlu:college_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:college_computer_science": { | |
"name": "mmlu:college_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:college_mathematics": { | |
"name": "mmlu:college_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:college_medicine": { | |
"name": "mmlu:college_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 173, | |
"effective_num_docs": 173, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:college_physics": { | |
"name": "mmlu:college_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 102, | |
"effective_num_docs": 102, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:computer_security": { | |
"name": "mmlu:computer_security", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "computer_security", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:conceptual_physics": { | |
"name": "mmlu:conceptual_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "conceptual_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 235, | |
"effective_num_docs": 235, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:econometrics": { | |
"name": "mmlu:econometrics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "econometrics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 114, | |
"effective_num_docs": 114, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:electrical_engineering": { | |
"name": "mmlu:electrical_engineering", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "electrical_engineering", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 145, | |
"effective_num_docs": 145, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:elementary_mathematics": { | |
"name": "mmlu:elementary_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "elementary_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 378, | |
"effective_num_docs": 378, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:formal_logic": { | |
"name": "mmlu:formal_logic", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "formal_logic", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 126, | |
"effective_num_docs": 126, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:global_facts": { | |
"name": "mmlu:global_facts", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "global_facts", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_biology": { | |
"name": "mmlu:high_school_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 310, | |
"effective_num_docs": 310, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_chemistry": { | |
"name": "mmlu:high_school_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 203, | |
"effective_num_docs": 203, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_computer_science": { | |
"name": "mmlu:high_school_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_european_history": { | |
"name": "mmlu:high_school_european_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_european_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 165, | |
"effective_num_docs": 165, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_geography": { | |
"name": "mmlu:high_school_geography", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_geography", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 198, | |
"effective_num_docs": 198, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_government_and_politics": { | |
"name": "mmlu:high_school_government_and_politics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_government_and_politics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 193, | |
"effective_num_docs": 193, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_macroeconomics": { | |
"name": "mmlu:high_school_macroeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_macroeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 390, | |
"effective_num_docs": 390, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_mathematics": { | |
"name": "mmlu:high_school_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 270, | |
"effective_num_docs": 270, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_microeconomics": { | |
"name": "mmlu:high_school_microeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_microeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 238, | |
"effective_num_docs": 238, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_physics": { | |
"name": "mmlu:high_school_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 151, | |
"effective_num_docs": 151, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_psychology": { | |
"name": "mmlu:high_school_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 545, | |
"effective_num_docs": 545, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_statistics": { | |
"name": "mmlu:high_school_statistics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_statistics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 216, | |
"effective_num_docs": 216, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_us_history": { | |
"name": "mmlu:high_school_us_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_us_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 204, | |
"effective_num_docs": 204, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:high_school_world_history": { | |
"name": "mmlu:high_school_world_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_world_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 237, | |
"effective_num_docs": 237, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:human_aging": { | |
"name": "mmlu:human_aging", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_aging", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 223, | |
"effective_num_docs": 223, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:human_sexuality": { | |
"name": "mmlu:human_sexuality", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_sexuality", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 131, | |
"effective_num_docs": 131, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:international_law": { | |
"name": "mmlu:international_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "international_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 121, | |
"effective_num_docs": 121, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:jurisprudence": { | |
"name": "mmlu:jurisprudence", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "jurisprudence", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 108, | |
"effective_num_docs": 108, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:logical_fallacies": { | |
"name": "mmlu:logical_fallacies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "logical_fallacies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 163, | |
"effective_num_docs": 163, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:machine_learning": { | |
"name": "mmlu:machine_learning", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "machine_learning", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 112, | |
"effective_num_docs": 112, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:management": { | |
"name": "mmlu:management", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "management", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 103, | |
"effective_num_docs": 103, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:marketing": { | |
"name": "mmlu:marketing", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "marketing", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 234, | |
"effective_num_docs": 234, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:medical_genetics": { | |
"name": "mmlu:medical_genetics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "medical_genetics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:miscellaneous": { | |
"name": "mmlu:miscellaneous", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "miscellaneous", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 783, | |
"effective_num_docs": 783, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:moral_disputes": { | |
"name": "mmlu:moral_disputes", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_disputes", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 346, | |
"effective_num_docs": 346, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:moral_scenarios": { | |
"name": "mmlu:moral_scenarios", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_scenarios", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 895, | |
"effective_num_docs": 895, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:nutrition": { | |
"name": "mmlu:nutrition", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "nutrition", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 306, | |
"effective_num_docs": 306, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:philosophy": { | |
"name": "mmlu:philosophy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "philosophy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 311, | |
"effective_num_docs": 311, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:prehistory": { | |
"name": "mmlu:prehistory", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "prehistory", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 324, | |
"effective_num_docs": 324, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:professional_accounting": { | |
"name": "mmlu:professional_accounting", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_accounting", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 282, | |
"effective_num_docs": 282, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:professional_law": { | |
"name": "mmlu:professional_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 1534, | |
"effective_num_docs": 1534, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:professional_medicine": { | |
"name": "mmlu:professional_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 272, | |
"effective_num_docs": 272, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:professional_psychology": { | |
"name": "mmlu:professional_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 612, | |
"effective_num_docs": 612, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:public_relations": { | |
"name": "mmlu:public_relations", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "public_relations", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 110, | |
"effective_num_docs": 110, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:security_studies": { | |
"name": "mmlu:security_studies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "security_studies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 245, | |
"effective_num_docs": 245, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:sociology": { | |
"name": "mmlu:sociology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "sociology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 201, | |
"effective_num_docs": 201, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:us_foreign_policy": { | |
"name": "mmlu:us_foreign_policy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "us_foreign_policy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:virology": { | |
"name": "mmlu:virology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "virology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 166, | |
"effective_num_docs": 166, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"leaderboard|mmlu:world_religions": { | |
"name": "mmlu:world_religions", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "world_religions", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 171, | |
"effective_num_docs": 171, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
} | |
}, | |
"summary_tasks": { | |
"leaderboard|mmlu:abstract_algebra|5": { | |
"hashes": { | |
"hash_examples": "4c76229e00c9c0e9", | |
"hash_full_prompts": "a45d01c3409c889c", | |
"hash_input_tokens": "c58f21ad388e41a4", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:anatomy|5": { | |
"hashes": { | |
"hash_examples": "6a1f8104dccbd33b", | |
"hash_full_prompts": "e245c6600e03cc32", | |
"hash_input_tokens": "664ad983d943ad07", | |
"hash_cont_tokens": "eb0c9a1e487e77a6" | |
}, | |
"truncated": 0, | |
"non_truncated": 135, | |
"padded": 540, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:astronomy|5": { | |
"hashes": { | |
"hash_examples": "1302effa3a76ce4c", | |
"hash_full_prompts": "390f9bddf857ad04", | |
"hash_input_tokens": "6b8419ce1ca61ae8", | |
"hash_cont_tokens": "2c8a49864c3d99c2" | |
}, | |
"truncated": 0, | |
"non_truncated": 152, | |
"padded": 608, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:business_ethics|5": { | |
"hashes": { | |
"hash_examples": "03cb8bce5336419a", | |
"hash_full_prompts": "5504f893bc4f2fa1", | |
"hash_input_tokens": "bf7a56022072a446", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:clinical_knowledge|5": { | |
"hashes": { | |
"hash_examples": "ffbb9c7b2be257f9", | |
"hash_full_prompts": "106ad0bab4b90b78", | |
"hash_input_tokens": "1d310792e0aaf29c", | |
"hash_cont_tokens": "f2bfcea369926d68" | |
}, | |
"truncated": 0, | |
"non_truncated": 265, | |
"padded": 1060, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_biology|5": { | |
"hashes": { | |
"hash_examples": "3ee77f176f38eb8e", | |
"hash_full_prompts": "59f9bdf2695cb226", | |
"hash_input_tokens": "1027babc822bd1c5", | |
"hash_cont_tokens": "061b1b91fc518400" | |
}, | |
"truncated": 0, | |
"non_truncated": 144, | |
"padded": 576, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_chemistry|5": { | |
"hashes": { | |
"hash_examples": "ce61a69c46d47aeb", | |
"hash_full_prompts": "3cac9b759fcff7a0", | |
"hash_input_tokens": "4fa05a1d43eaf942", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_computer_science|5": { | |
"hashes": { | |
"hash_examples": "32805b52d7d5daab", | |
"hash_full_prompts": "010b0cca35070130", | |
"hash_input_tokens": "c78e59615689a133", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_mathematics|5": { | |
"hashes": { | |
"hash_examples": "55da1a0a0bd33722", | |
"hash_full_prompts": "511422eb9eefc773", | |
"hash_input_tokens": "f62164431ea60a6d", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_medicine|5": { | |
"hashes": { | |
"hash_examples": "c33e143163049176", | |
"hash_full_prompts": "c8cc1a82a51a046e", | |
"hash_input_tokens": "f3ffc86e05b4abab", | |
"hash_cont_tokens": "96f7e09eeaf3577a" | |
}, | |
"truncated": 0, | |
"non_truncated": 173, | |
"padded": 692, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_physics|5": { | |
"hashes": { | |
"hash_examples": "ebdab1cdb7e555df", | |
"hash_full_prompts": "e40721b5059c5818", | |
"hash_input_tokens": "bfec18d4c2cf7331", | |
"hash_cont_tokens": "fb74f245268780f7" | |
}, | |
"truncated": 0, | |
"non_truncated": 102, | |
"padded": 408, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:computer_security|5": { | |
"hashes": { | |
"hash_examples": "a24fd7d08a560921", | |
"hash_full_prompts": "946c9be5964ac44a", | |
"hash_input_tokens": "acde342892ed7ff9", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:conceptual_physics|5": { | |
"hashes": { | |
"hash_examples": "8300977a79386993", | |
"hash_full_prompts": "506a4f6094cc40c9", | |
"hash_input_tokens": "524121d9ddd4bf6a", | |
"hash_cont_tokens": "1f4e1e92f33812b5" | |
}, | |
"truncated": 0, | |
"non_truncated": 235, | |
"padded": 940, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:econometrics|5": { | |
"hashes": { | |
"hash_examples": "ddde36788a04a46f", | |
"hash_full_prompts": "4ed2703f27f1ed05", | |
"hash_input_tokens": "a1dedb7b847d6b19", | |
"hash_cont_tokens": "4060f7b36b2f4140" | |
}, | |
"truncated": 0, | |
"non_truncated": 114, | |
"padded": 456, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:electrical_engineering|5": { | |
"hashes": { | |
"hash_examples": "acbc5def98c19b3f", | |
"hash_full_prompts": "d8f4b3e11c23653c", | |
"hash_input_tokens": "825d9d44e9ca39de", | |
"hash_cont_tokens": "06e5a56f4fae638e" | |
}, | |
"truncated": 0, | |
"non_truncated": 145, | |
"padded": 580, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:elementary_mathematics|5": { | |
"hashes": { | |
"hash_examples": "146e61d07497a9bd", | |
"hash_full_prompts": "256d111bd15647ff", | |
"hash_input_tokens": "6ef65f0c30222cfa", | |
"hash_cont_tokens": "c3385a2b3ed50305" | |
}, | |
"truncated": 0, | |
"non_truncated": 378, | |
"padded": 1506, | |
"non_padded": 6, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:formal_logic|5": { | |
"hashes": { | |
"hash_examples": "8635216e1909a03f", | |
"hash_full_prompts": "1171d04f3b1a11f5", | |
"hash_input_tokens": "1c20d8fd0dcd2b9e", | |
"hash_cont_tokens": "e259fd8e83f5eabe" | |
}, | |
"truncated": 0, | |
"non_truncated": 126, | |
"padded": 504, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:global_facts|5": { | |
"hashes": { | |
"hash_examples": "30b315aa6353ee47", | |
"hash_full_prompts": "a7e56dbc074c7529", | |
"hash_input_tokens": "4fe2c5925dbc174a", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_biology|5": { | |
"hashes": { | |
"hash_examples": "c9136373af2180de", | |
"hash_full_prompts": "ad6e859ed978e04a", | |
"hash_input_tokens": "ea52b862d9b7af2b", | |
"hash_cont_tokens": "6251059b06be9d97" | |
}, | |
"truncated": 0, | |
"non_truncated": 310, | |
"padded": 1236, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_chemistry|5": { | |
"hashes": { | |
"hash_examples": "b0661bfa1add6404", | |
"hash_full_prompts": "6eb9c04bcc8a8f2a", | |
"hash_input_tokens": "057712c3fd4fe5dc", | |
"hash_cont_tokens": "ce467bdc2825a0b2" | |
}, | |
"truncated": 0, | |
"non_truncated": 203, | |
"padded": 808, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_computer_science|5": { | |
"hashes": { | |
"hash_examples": "80fc1d623a3d665f", | |
"hash_full_prompts": "8e51bc91c81cf8dd", | |
"hash_input_tokens": "efbba49c91a4a950", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_european_history|5": { | |
"hashes": { | |
"hash_examples": "854da6e5af0fe1a1", | |
"hash_full_prompts": "664a1f16c9f3195c", | |
"hash_input_tokens": "26218866485baf4e", | |
"hash_cont_tokens": "f02ad2401d7aa667" | |
}, | |
"truncated": 0, | |
"non_truncated": 165, | |
"padded": 656, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_geography|5": { | |
"hashes": { | |
"hash_examples": "7dc963c7acd19ad8", | |
"hash_full_prompts": "f3acf911f4023c8a", | |
"hash_input_tokens": "56cbe629d18ead5d", | |
"hash_cont_tokens": "26d9256a0ab4eece" | |
}, | |
"truncated": 0, | |
"non_truncated": 198, | |
"padded": 785, | |
"non_padded": 7, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_government_and_politics|5": { | |
"hashes": { | |
"hash_examples": "1f675dcdebc9758f", | |
"hash_full_prompts": "066254feaa3158ae", | |
"hash_input_tokens": "b86d166b67953159", | |
"hash_cont_tokens": "990c9084748f34ab" | |
}, | |
"truncated": 0, | |
"non_truncated": 193, | |
"padded": 772, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_macroeconomics|5": { | |
"hashes": { | |
"hash_examples": "2fb32cf2d80f0b35", | |
"hash_full_prompts": "19a7fa502aa85c95", | |
"hash_input_tokens": "75b5836573d4418d", | |
"hash_cont_tokens": "76312c2ea8fc4f71" | |
}, | |
"truncated": 0, | |
"non_truncated": 390, | |
"padded": 1552, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_mathematics|5": { | |
"hashes": { | |
"hash_examples": "fd6646fdb5d58a1f", | |
"hash_full_prompts": "4f704e369778b5b0", | |
"hash_input_tokens": "5bc4d612b64cb82b", | |
"hash_cont_tokens": "369a1a933960fad5" | |
}, | |
"truncated": 0, | |
"non_truncated": 270, | |
"padded": 1048, | |
"non_padded": 32, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_microeconomics|5": { | |
"hashes": { | |
"hash_examples": "2118f21f71d87d84", | |
"hash_full_prompts": "4350f9e2240f8010", | |
"hash_input_tokens": "f53ae91ff33dea98", | |
"hash_cont_tokens": "ce39343f06b04c0c" | |
}, | |
"truncated": 0, | |
"non_truncated": 238, | |
"padded": 924, | |
"non_padded": 28, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_physics|5": { | |
"hashes": { | |
"hash_examples": "dc3ce06378548565", | |
"hash_full_prompts": "5dc0d6831b66188f", | |
"hash_input_tokens": "23181b0f6dc1e876", | |
"hash_cont_tokens": "34c4d04275713047" | |
}, | |
"truncated": 0, | |
"non_truncated": 151, | |
"padded": 604, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_psychology|5": { | |
"hashes": { | |
"hash_examples": "c8d1d98a40e11f2f", | |
"hash_full_prompts": "af2b097da6d50365", | |
"hash_input_tokens": "1c03a6aa2ccd7497", | |
"hash_cont_tokens": "76367d535c896191" | |
}, | |
"truncated": 0, | |
"non_truncated": 545, | |
"padded": 2176, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_statistics|5": { | |
"hashes": { | |
"hash_examples": "666c8759b98ee4ff", | |
"hash_full_prompts": "c757694421d6d68d", | |
"hash_input_tokens": "6c3d4c89ebb17624", | |
"hash_cont_tokens": "a076a9a5529c4701" | |
}, | |
"truncated": 0, | |
"non_truncated": 216, | |
"padded": 864, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_us_history|5": { | |
"hashes": { | |
"hash_examples": "95fef1c4b7d3f81e", | |
"hash_full_prompts": "e34a028d0ddeec5e", | |
"hash_input_tokens": "3789cc86ffa04ee6", | |
"hash_cont_tokens": "ff9e65faaa6206d3" | |
}, | |
"truncated": 0, | |
"non_truncated": 204, | |
"padded": 816, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_world_history|5": { | |
"hashes": { | |
"hash_examples": "7e5085b6184b0322", | |
"hash_full_prompts": "1fa3d51392765601", | |
"hash_input_tokens": "e2ade10b727cc567", | |
"hash_cont_tokens": "91d0b99f637c395d" | |
}, | |
"truncated": 0, | |
"non_truncated": 237, | |
"padded": 948, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:human_aging|5": { | |
"hashes": { | |
"hash_examples": "c17333e7c7c10797", | |
"hash_full_prompts": "cac900721f9a1a94", | |
"hash_input_tokens": "e73491e153435aef", | |
"hash_cont_tokens": "503a59d0c8fd9fda" | |
}, | |
"truncated": 0, | |
"non_truncated": 223, | |
"padded": 892, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:human_sexuality|5": { | |
"hashes": { | |
"hash_examples": "4edd1e9045df5e3d", | |
"hash_full_prompts": "0d6567bafee0a13c", | |
"hash_input_tokens": "5aac17e145c73388", | |
"hash_cont_tokens": "2ef8023fd099e328" | |
}, | |
"truncated": 0, | |
"non_truncated": 131, | |
"padded": 524, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:international_law|5": { | |
"hashes": { | |
"hash_examples": "db2fa00d771a062a", | |
"hash_full_prompts": "d018f9116479795e", | |
"hash_input_tokens": "f823a8f71decfd4e", | |
"hash_cont_tokens": "1d135acf09cc77d7" | |
}, | |
"truncated": 0, | |
"non_truncated": 121, | |
"padded": 484, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:jurisprudence|5": { | |
"hashes": { | |
"hash_examples": "e956f86b124076fe", | |
"hash_full_prompts": "1487e89a10ec58b7", | |
"hash_input_tokens": "1296fb073d25fdbd", | |
"hash_cont_tokens": "2bc5403ae73a42ee" | |
}, | |
"truncated": 0, | |
"non_truncated": 108, | |
"padded": 420, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:logical_fallacies|5": { | |
"hashes": { | |
"hash_examples": "956e0e6365ab79f1", | |
"hash_full_prompts": "677785b2181f9243", | |
"hash_input_tokens": "5af73357ea4a33af", | |
"hash_cont_tokens": "343532d46d0dd784" | |
}, | |
"truncated": 0, | |
"non_truncated": 163, | |
"padded": 648, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:machine_learning|5": { | |
"hashes": { | |
"hash_examples": "397997cc6f4d581e", | |
"hash_full_prompts": "769ee14a2aea49bb", | |
"hash_input_tokens": "36e7ee7692d6cb84", | |
"hash_cont_tokens": "ffa678813759b3dc" | |
}, | |
"truncated": 0, | |
"non_truncated": 112, | |
"padded": 448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:management|5": { | |
"hashes": { | |
"hash_examples": "2bcbe6f6ca63d740", | |
"hash_full_prompts": "cb1ff9dac9582144", | |
"hash_input_tokens": "b6132d503a6b6fe1", | |
"hash_cont_tokens": "c86a690085fd954f" | |
}, | |
"truncated": 0, | |
"non_truncated": 103, | |
"padded": 412, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:marketing|5": { | |
"hashes": { | |
"hash_examples": "8ddb20d964a1b065", | |
"hash_full_prompts": "9fc2114a187ad9a2", | |
"hash_input_tokens": "cea9c7023072b6de", | |
"hash_cont_tokens": "a59b0611811f3b0d" | |
}, | |
"truncated": 0, | |
"non_truncated": 234, | |
"padded": 892, | |
"non_padded": 44, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:medical_genetics|5": { | |
"hashes": { | |
"hash_examples": "182a71f4763d2cea", | |
"hash_full_prompts": "46a616fa51878959", | |
"hash_input_tokens": "26a5d3aa11ad9928", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:miscellaneous|5": { | |
"hashes": { | |
"hash_examples": "4c404fdbb4ca57fc", | |
"hash_full_prompts": "0813e1be36dbaae1", | |
"hash_input_tokens": "9368b6cccdca09c2", | |
"hash_cont_tokens": "468b0141d91cbe14" | |
}, | |
"truncated": 0, | |
"non_truncated": 783, | |
"padded": 3132, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:moral_disputes|5": { | |
"hashes": { | |
"hash_examples": "60cbd2baa3fea5c9", | |
"hash_full_prompts": "1d14adebb9b62519", | |
"hash_input_tokens": "24ac7096ff952fb3", | |
"hash_cont_tokens": "5513cb6f0e3d0039" | |
}, | |
"truncated": 0, | |
"non_truncated": 346, | |
"padded": 1380, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:moral_scenarios|5": { | |
"hashes": { | |
"hash_examples": "fd8b0431fbdd75ef", | |
"hash_full_prompts": "b80d3d236165e3de", | |
"hash_input_tokens": "c388cdbaa92ff35d", | |
"hash_cont_tokens": "9fd58c2ac3e72795" | |
}, | |
"truncated": 0, | |
"non_truncated": 895, | |
"padded": 3447, | |
"non_padded": 133, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:nutrition|5": { | |
"hashes": { | |
"hash_examples": "71e55e2b829b6528", | |
"hash_full_prompts": "2bfb18e5fab8dea7", | |
"hash_input_tokens": "61d681ff34503244", | |
"hash_cont_tokens": "3294c626ca103ea9" | |
}, | |
"truncated": 0, | |
"non_truncated": 306, | |
"padded": 1224, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:philosophy|5": { | |
"hashes": { | |
"hash_examples": "a6d489a8d208fa4b", | |
"hash_full_prompts": "e8c0d5b6dae3ccc8", | |
"hash_input_tokens": "99dccba498573ce7", | |
"hash_cont_tokens": "84fab74fb9dbe7cc" | |
}, | |
"truncated": 0, | |
"non_truncated": 311, | |
"padded": 1244, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:prehistory|5": { | |
"hashes": { | |
"hash_examples": "6cc50f032a19acaa", | |
"hash_full_prompts": "4a6a1d3ab1bf28e4", | |
"hash_input_tokens": "0b5b291e21537231", | |
"hash_cont_tokens": "65ac3d5bc3a7107a" | |
}, | |
"truncated": 0, | |
"non_truncated": 324, | |
"padded": 1256, | |
"non_padded": 40, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_accounting|5": { | |
"hashes": { | |
"hash_examples": "50f57ab32f5f6cea", | |
"hash_full_prompts": "e60129bd2d82ffc6", | |
"hash_input_tokens": "dfdbfa66c3879e04", | |
"hash_cont_tokens": "3408ef87473c956e" | |
}, | |
"truncated": 0, | |
"non_truncated": 282, | |
"padded": 1108, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_law|5": { | |
"hashes": { | |
"hash_examples": "a8fdc85c64f4b215", | |
"hash_full_prompts": "0dbb1d9b72dcea03", | |
"hash_input_tokens": "c5e40216c766fc5d", | |
"hash_cont_tokens": "ee53bac4bdeb7c6f" | |
}, | |
"truncated": 0, | |
"non_truncated": 1534, | |
"padded": 6136, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_medicine|5": { | |
"hashes": { | |
"hash_examples": "c373a28a3050a73a", | |
"hash_full_prompts": "5e040f9ca68b089e", | |
"hash_input_tokens": "6bb2fb0a41e6e74a", | |
"hash_cont_tokens": "5ba90d13b887dd10" | |
}, | |
"truncated": 0, | |
"non_truncated": 272, | |
"padded": 1088, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_psychology|5": { | |
"hashes": { | |
"hash_examples": "bf5254fe818356af", | |
"hash_full_prompts": "b386ecda8b87150e", | |
"hash_input_tokens": "d5882dcb2ba36239", | |
"hash_cont_tokens": "81d224ca3a7cd1f8" | |
}, | |
"truncated": 0, | |
"non_truncated": 612, | |
"padded": 2448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:public_relations|5": { | |
"hashes": { | |
"hash_examples": "b66d52e28e7d14e0", | |
"hash_full_prompts": "fe43562263e25677", | |
"hash_input_tokens": "89e2f611b0e690d5", | |
"hash_cont_tokens": "97fc092d5801cddc" | |
}, | |
"truncated": 0, | |
"non_truncated": 110, | |
"padded": 432, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:security_studies|5": { | |
"hashes": { | |
"hash_examples": "514c14feaf000ad9", | |
"hash_full_prompts": "27d4a2ac541ef4b9", | |
"hash_input_tokens": "da3c969306757935", | |
"hash_cont_tokens": "084e0267d5a5a853" | |
}, | |
"truncated": 0, | |
"non_truncated": 245, | |
"padded": 980, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:sociology|5": { | |
"hashes": { | |
"hash_examples": "f6c9bc9d18c80870", | |
"hash_full_prompts": "c072ea7d1a1524f2", | |
"hash_input_tokens": "cba49ace6cf739f2", | |
"hash_cont_tokens": "c654d898bec36354" | |
}, | |
"truncated": 0, | |
"non_truncated": 201, | |
"padded": 804, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:us_foreign_policy|5": { | |
"hashes": { | |
"hash_examples": "ed7b78629db6678f", | |
"hash_full_prompts": "341a97ca3e4d699d", | |
"hash_input_tokens": "3f5014206033d5c8", | |
"hash_cont_tokens": "b057f3f1d84e35e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:virology|5": { | |
"hashes": { | |
"hash_examples": "bc52ffdc3f9b994a", | |
"hash_full_prompts": "651d471e2eb8b5e9", | |
"hash_input_tokens": "21ca3e01eff77c5a", | |
"hash_cont_tokens": "648f351cffb42342" | |
}, | |
"truncated": 0, | |
"non_truncated": 166, | |
"padded": 664, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:world_religions|5": { | |
"hashes": { | |
"hash_examples": "ecdb4a4f94f62930", | |
"hash_full_prompts": "3773f03542ce44a3", | |
"hash_input_tokens": "7904d43bc25add76", | |
"hash_cont_tokens": "6cee5406c3f21c2e" | |
}, | |
"truncated": 0, | |
"non_truncated": 171, | |
"padded": 684, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "341a076d0beb7048", | |
"hash_full_prompts": "a5c8f2b7ff4f5ae2", | |
"hash_input_tokens": "74ed818d997ed10b", | |
"hash_cont_tokens": "edf1783519f209b9" | |
}, | |
"truncated": 0, | |
"non_truncated": 14042, | |
"padded": 55806, | |
"non_padded": 362, | |
"num_truncated_few_shots": 0 | |
} | |
} |