open-r1-eval-leaderboard
/
eval_results
/HuggingFaceH4
/mistral-7b-ift
/v41.9
/mmlu
/results_2024-03-08T13-23-07.880881.json

edbeeching
HF staff
Upload eval_results/HuggingFaceH4/mistral-7b-ift/v41.9/mmlu/results_2024-03-08T13-23-07.880881.json with huggingface_hub
fac999c
verified
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 53704.05701775, | |
"end_time": 54916.910386431, | |
"total_evaluation_time_secondes": "1212.8533686810042", | |
"model_name": "HuggingFaceH4/mistral-7b-ift", | |
"model_sha": "699d4ad4c36fefeddb6e990475f94d87847ec54c", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "13.99 GB", | |
"config": null | |
}, | |
"results": { | |
"lighteval|mmlu:abstract_algebra|5": { | |
"acc": 0.28, | |
"acc_stderr": 0.04512608598542128 | |
}, | |
"lighteval|mmlu:anatomy|5": { | |
"acc": 0.5333333333333333, | |
"acc_stderr": 0.043097329010363554 | |
}, | |
"lighteval|mmlu:astronomy|5": { | |
"acc": 0.618421052631579, | |
"acc_stderr": 0.03953173377749194 | |
}, | |
"lighteval|mmlu:business_ethics|5": { | |
"acc": 0.52, | |
"acc_stderr": 0.050211673156867795 | |
}, | |
"lighteval|mmlu:clinical_knowledge|5": { | |
"acc": 0.6566037735849056, | |
"acc_stderr": 0.029224526469124792 | |
}, | |
"lighteval|mmlu:college_biology|5": { | |
"acc": 0.6666666666666666, | |
"acc_stderr": 0.03942082639927213 | |
}, | |
"lighteval|mmlu:college_chemistry|5": { | |
"acc": 0.44, | |
"acc_stderr": 0.04988876515698589 | |
}, | |
"lighteval|mmlu:college_computer_science|5": { | |
"acc": 0.47, | |
"acc_stderr": 0.05016135580465919 | |
}, | |
"lighteval|mmlu:college_mathematics|5": { | |
"acc": 0.38, | |
"acc_stderr": 0.04878317312145633 | |
}, | |
"lighteval|mmlu:college_medicine|5": { | |
"acc": 0.630057803468208, | |
"acc_stderr": 0.0368122963339432 | |
}, | |
"lighteval|mmlu:college_physics|5": { | |
"acc": 0.4019607843137255, | |
"acc_stderr": 0.04878608714466996 | |
}, | |
"lighteval|mmlu:computer_security|5": { | |
"acc": 0.73, | |
"acc_stderr": 0.04461960433384739 | |
}, | |
"lighteval|mmlu:conceptual_physics|5": { | |
"acc": 0.4851063829787234, | |
"acc_stderr": 0.032671518489247764 | |
}, | |
"lighteval|mmlu:econometrics|5": { | |
"acc": 0.4298245614035088, | |
"acc_stderr": 0.046570472605949625 | |
}, | |
"lighteval|mmlu:electrical_engineering|5": { | |
"acc": 0.5655172413793104, | |
"acc_stderr": 0.04130740879555498 | |
}, | |
"lighteval|mmlu:elementary_mathematics|5": { | |
"acc": 0.3941798941798942, | |
"acc_stderr": 0.025167982333894143 | |
}, | |
"lighteval|mmlu:formal_logic|5": { | |
"acc": 0.3333333333333333, | |
"acc_stderr": 0.04216370213557836 | |
}, | |
"lighteval|mmlu:global_facts|5": { | |
"acc": 0.36, | |
"acc_stderr": 0.04824181513244218 | |
}, | |
"lighteval|mmlu:high_school_biology|5": { | |
"acc": 0.7096774193548387, | |
"acc_stderr": 0.025822106119415898 | |
}, | |
"lighteval|mmlu:high_school_chemistry|5": { | |
"acc": 0.4630541871921182, | |
"acc_stderr": 0.035083705204426656 | |
}, | |
"lighteval|mmlu:high_school_computer_science|5": { | |
"acc": 0.6, | |
"acc_stderr": 0.049236596391733084 | |
}, | |
"lighteval|mmlu:high_school_european_history|5": { | |
"acc": 0.7151515151515152, | |
"acc_stderr": 0.03524390844511781 | |
}, | |
"lighteval|mmlu:high_school_geography|5": { | |
"acc": 0.7626262626262627, | |
"acc_stderr": 0.0303137105381989 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics|5": { | |
"acc": 0.8393782383419689, | |
"acc_stderr": 0.026499057701397443 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics|5": { | |
"acc": 0.6128205128205129, | |
"acc_stderr": 0.024697216930878937 | |
}, | |
"lighteval|mmlu:high_school_mathematics|5": { | |
"acc": 0.3111111111111111, | |
"acc_stderr": 0.028226446749683515 | |
}, | |
"lighteval|mmlu:high_school_microeconomics|5": { | |
"acc": 0.6260504201680672, | |
"acc_stderr": 0.03142946637883708 | |
}, | |
"lighteval|mmlu:high_school_physics|5": { | |
"acc": 0.3443708609271523, | |
"acc_stderr": 0.038796870240733264 | |
}, | |
"lighteval|mmlu:high_school_psychology|5": { | |
"acc": 0.7834862385321101, | |
"acc_stderr": 0.017658710594443124 | |
}, | |
"lighteval|mmlu:high_school_statistics|5": { | |
"acc": 0.5277777777777778, | |
"acc_stderr": 0.0340470532865388 | |
}, | |
"lighteval|mmlu:high_school_us_history|5": { | |
"acc": 0.7303921568627451, | |
"acc_stderr": 0.031145570659486782 | |
}, | |
"lighteval|mmlu:high_school_world_history|5": { | |
"acc": 0.7046413502109705, | |
"acc_stderr": 0.029696338713422882 | |
}, | |
"lighteval|mmlu:human_aging|5": { | |
"acc": 0.6322869955156951, | |
"acc_stderr": 0.03236198350928275 | |
}, | |
"lighteval|mmlu:human_sexuality|5": { | |
"acc": 0.7404580152671756, | |
"acc_stderr": 0.03844876139785271 | |
}, | |
"lighteval|mmlu:international_law|5": { | |
"acc": 0.7355371900826446, | |
"acc_stderr": 0.040261875275912046 | |
}, | |
"lighteval|mmlu:jurisprudence|5": { | |
"acc": 0.6759259259259259, | |
"acc_stderr": 0.045245960070300476 | |
}, | |
"lighteval|mmlu:logical_fallacies|5": { | |
"acc": 0.6993865030674846, | |
"acc_stderr": 0.03602511318806771 | |
}, | |
"lighteval|mmlu:machine_learning|5": { | |
"acc": 0.41964285714285715, | |
"acc_stderr": 0.046840993210771065 | |
}, | |
"lighteval|mmlu:management|5": { | |
"acc": 0.7766990291262136, | |
"acc_stderr": 0.04123553189891431 | |
}, | |
"lighteval|mmlu:marketing|5": { | |
"acc": 0.8162393162393162, | |
"acc_stderr": 0.025372139671722933 | |
}, | |
"lighteval|mmlu:medical_genetics|5": { | |
"acc": 0.68, | |
"acc_stderr": 0.046882617226215034 | |
}, | |
"lighteval|mmlu:miscellaneous|5": { | |
"acc": 0.7726692209450831, | |
"acc_stderr": 0.014987270640946019 | |
}, | |
"lighteval|mmlu:moral_disputes|5": { | |
"acc": 0.615606936416185, | |
"acc_stderr": 0.02618966696627204 | |
}, | |
"lighteval|mmlu:moral_scenarios|5": { | |
"acc": 0.3139664804469274, | |
"acc_stderr": 0.01552192393352364 | |
}, | |
"lighteval|mmlu:nutrition|5": { | |
"acc": 0.6928104575163399, | |
"acc_stderr": 0.026415601914389002 | |
}, | |
"lighteval|mmlu:philosophy|5": { | |
"acc": 0.6784565916398714, | |
"acc_stderr": 0.026527724079528872 | |
}, | |
"lighteval|mmlu:prehistory|5": { | |
"acc": 0.6327160493827161, | |
"acc_stderr": 0.026822801759507894 | |
}, | |
"lighteval|mmlu:professional_accounting|5": { | |
"acc": 0.4219858156028369, | |
"acc_stderr": 0.029462189233370597 | |
}, | |
"lighteval|mmlu:professional_law|5": { | |
"acc": 0.4172099087353325, | |
"acc_stderr": 0.012593959992906419 | |
}, | |
"lighteval|mmlu:professional_medicine|5": { | |
"acc": 0.6397058823529411, | |
"acc_stderr": 0.02916312857067073 | |
}, | |
"lighteval|mmlu:professional_psychology|5": { | |
"acc": 0.6062091503267973, | |
"acc_stderr": 0.01976621199107307 | |
}, | |
"lighteval|mmlu:public_relations|5": { | |
"acc": 0.6181818181818182, | |
"acc_stderr": 0.046534298079135075 | |
}, | |
"lighteval|mmlu:security_studies|5": { | |
"acc": 0.636734693877551, | |
"acc_stderr": 0.030789051139030806 | |
}, | |
"lighteval|mmlu:sociology|5": { | |
"acc": 0.7960199004975125, | |
"acc_stderr": 0.02849317624532607 | |
}, | |
"lighteval|mmlu:us_foreign_policy|5": { | |
"acc": 0.81, | |
"acc_stderr": 0.039427724440366234 | |
}, | |
"lighteval|mmlu:virology|5": { | |
"acc": 0.5060240963855421, | |
"acc_stderr": 0.03892212195333045 | |
}, | |
"lighteval|mmlu:world_religions|5": { | |
"acc": 0.8070175438596491, | |
"acc_stderr": 0.030267457554898458 | |
}, | |
"lighteval|mmlu:_average|5": { | |
"acc": 0.5924040922260484, | |
"acc_stderr": 0.0349867087383228 | |
} | |
}, | |
"versions": { | |
"lighteval|mmlu:abstract_algebra|5": 0, | |
"lighteval|mmlu:anatomy|5": 0, | |
"lighteval|mmlu:astronomy|5": 0, | |
"lighteval|mmlu:business_ethics|5": 0, | |
"lighteval|mmlu:clinical_knowledge|5": 0, | |
"lighteval|mmlu:college_biology|5": 0, | |
"lighteval|mmlu:college_chemistry|5": 0, | |
"lighteval|mmlu:college_computer_science|5": 0, | |
"lighteval|mmlu:college_mathematics|5": 0, | |
"lighteval|mmlu:college_medicine|5": 0, | |
"lighteval|mmlu:college_physics|5": 0, | |
"lighteval|mmlu:computer_security|5": 0, | |
"lighteval|mmlu:conceptual_physics|5": 0, | |
"lighteval|mmlu:econometrics|5": 0, | |
"lighteval|mmlu:electrical_engineering|5": 0, | |
"lighteval|mmlu:elementary_mathematics|5": 0, | |
"lighteval|mmlu:formal_logic|5": 0, | |
"lighteval|mmlu:global_facts|5": 0, | |
"lighteval|mmlu:high_school_biology|5": 0, | |
"lighteval|mmlu:high_school_chemistry|5": 0, | |
"lighteval|mmlu:high_school_computer_science|5": 0, | |
"lighteval|mmlu:high_school_european_history|5": 0, | |
"lighteval|mmlu:high_school_geography|5": 0, | |
"lighteval|mmlu:high_school_government_and_politics|5": 0, | |
"lighteval|mmlu:high_school_macroeconomics|5": 0, | |
"lighteval|mmlu:high_school_mathematics|5": 0, | |
"lighteval|mmlu:high_school_microeconomics|5": 0, | |
"lighteval|mmlu:high_school_physics|5": 0, | |
"lighteval|mmlu:high_school_psychology|5": 0, | |
"lighteval|mmlu:high_school_statistics|5": 0, | |
"lighteval|mmlu:high_school_us_history|5": 0, | |
"lighteval|mmlu:high_school_world_history|5": 0, | |
"lighteval|mmlu:human_aging|5": 0, | |
"lighteval|mmlu:human_sexuality|5": 0, | |
"lighteval|mmlu:international_law|5": 0, | |
"lighteval|mmlu:jurisprudence|5": 0, | |
"lighteval|mmlu:logical_fallacies|5": 0, | |
"lighteval|mmlu:machine_learning|5": 0, | |
"lighteval|mmlu:management|5": 0, | |
"lighteval|mmlu:marketing|5": 0, | |
"lighteval|mmlu:medical_genetics|5": 0, | |
"lighteval|mmlu:miscellaneous|5": 0, | |
"lighteval|mmlu:moral_disputes|5": 0, | |
"lighteval|mmlu:moral_scenarios|5": 0, | |
"lighteval|mmlu:nutrition|5": 0, | |
"lighteval|mmlu:philosophy|5": 0, | |
"lighteval|mmlu:prehistory|5": 0, | |
"lighteval|mmlu:professional_accounting|5": 0, | |
"lighteval|mmlu:professional_law|5": 0, | |
"lighteval|mmlu:professional_medicine|5": 0, | |
"lighteval|mmlu:professional_psychology|5": 0, | |
"lighteval|mmlu:public_relations|5": 0, | |
"lighteval|mmlu:security_studies|5": 0, | |
"lighteval|mmlu:sociology|5": 0, | |
"lighteval|mmlu:us_foreign_policy|5": 0, | |
"lighteval|mmlu:virology|5": 0, | |
"lighteval|mmlu:world_religions|5": 0 | |
}, | |
"config_tasks": { | |
"lighteval|mmlu:abstract_algebra": { | |
"name": "mmlu:abstract_algebra", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "abstract_algebra", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:anatomy": { | |
"name": "mmlu:anatomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "anatomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 135, | |
"effective_num_docs": 135 | |
}, | |
"lighteval|mmlu:astronomy": { | |
"name": "mmlu:astronomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "astronomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 152, | |
"effective_num_docs": 152 | |
}, | |
"lighteval|mmlu:business_ethics": { | |
"name": "mmlu:business_ethics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "business_ethics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:clinical_knowledge": { | |
"name": "mmlu:clinical_knowledge", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "clinical_knowledge", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 265, | |
"effective_num_docs": 265 | |
}, | |
"lighteval|mmlu:college_biology": { | |
"name": "mmlu:college_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 144, | |
"effective_num_docs": 144 | |
}, | |
"lighteval|mmlu:college_chemistry": { | |
"name": "mmlu:college_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_computer_science": { | |
"name": "mmlu:college_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_mathematics": { | |
"name": "mmlu:college_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_medicine": { | |
"name": "mmlu:college_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 173, | |
"effective_num_docs": 173 | |
}, | |
"lighteval|mmlu:college_physics": { | |
"name": "mmlu:college_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 102, | |
"effective_num_docs": 102 | |
}, | |
"lighteval|mmlu:computer_security": { | |
"name": "mmlu:computer_security", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "computer_security", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:conceptual_physics": { | |
"name": "mmlu:conceptual_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "conceptual_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 235, | |
"effective_num_docs": 235 | |
}, | |
"lighteval|mmlu:econometrics": { | |
"name": "mmlu:econometrics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "econometrics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 114, | |
"effective_num_docs": 114 | |
}, | |
"lighteval|mmlu:electrical_engineering": { | |
"name": "mmlu:electrical_engineering", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "electrical_engineering", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 145, | |
"effective_num_docs": 145 | |
}, | |
"lighteval|mmlu:elementary_mathematics": { | |
"name": "mmlu:elementary_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "elementary_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 378, | |
"effective_num_docs": 378 | |
}, | |
"lighteval|mmlu:formal_logic": { | |
"name": "mmlu:formal_logic", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "formal_logic", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 126, | |
"effective_num_docs": 126 | |
}, | |
"lighteval|mmlu:global_facts": { | |
"name": "mmlu:global_facts", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "global_facts", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:high_school_biology": { | |
"name": "mmlu:high_school_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 310, | |
"effective_num_docs": 310 | |
}, | |
"lighteval|mmlu:high_school_chemistry": { | |
"name": "mmlu:high_school_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 203, | |
"effective_num_docs": 203 | |
}, | |
"lighteval|mmlu:high_school_computer_science": { | |
"name": "mmlu:high_school_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:high_school_european_history": { | |
"name": "mmlu:high_school_european_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_european_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 165, | |
"effective_num_docs": 165 | |
}, | |
"lighteval|mmlu:high_school_geography": { | |
"name": "mmlu:high_school_geography", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_geography", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 198, | |
"effective_num_docs": 198 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics": { | |
"name": "mmlu:high_school_government_and_politics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_government_and_politics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 193, | |
"effective_num_docs": 193 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics": { | |
"name": "mmlu:high_school_macroeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_macroeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 390, | |
"effective_num_docs": 390 | |
}, | |
"lighteval|mmlu:high_school_mathematics": { | |
"name": "mmlu:high_school_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 270, | |
"effective_num_docs": 270 | |
}, | |
"lighteval|mmlu:high_school_microeconomics": { | |
"name": "mmlu:high_school_microeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_microeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 238, | |
"effective_num_docs": 238 | |
}, | |
"lighteval|mmlu:high_school_physics": { | |
"name": "mmlu:high_school_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 151, | |
"effective_num_docs": 151 | |
}, | |
"lighteval|mmlu:high_school_psychology": { | |
"name": "mmlu:high_school_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 545, | |
"effective_num_docs": 545 | |
}, | |
"lighteval|mmlu:high_school_statistics": { | |
"name": "mmlu:high_school_statistics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_statistics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 216, | |
"effective_num_docs": 216 | |
}, | |
"lighteval|mmlu:high_school_us_history": { | |
"name": "mmlu:high_school_us_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_us_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 204, | |
"effective_num_docs": 204 | |
}, | |
"lighteval|mmlu:high_school_world_history": { | |
"name": "mmlu:high_school_world_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_world_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 237, | |
"effective_num_docs": 237 | |
}, | |
"lighteval|mmlu:human_aging": { | |
"name": "mmlu:human_aging", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_aging", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 223, | |
"effective_num_docs": 223 | |
}, | |
"lighteval|mmlu:human_sexuality": { | |
"name": "mmlu:human_sexuality", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_sexuality", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 131, | |
"effective_num_docs": 131 | |
}, | |
"lighteval|mmlu:international_law": { | |
"name": "mmlu:international_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "international_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 121, | |
"effective_num_docs": 121 | |
}, | |
"lighteval|mmlu:jurisprudence": { | |
"name": "mmlu:jurisprudence", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "jurisprudence", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 108, | |
"effective_num_docs": 108 | |
}, | |
"lighteval|mmlu:logical_fallacies": { | |
"name": "mmlu:logical_fallacies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "logical_fallacies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 163, | |
"effective_num_docs": 163 | |
}, | |
"lighteval|mmlu:machine_learning": { | |
"name": "mmlu:machine_learning", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "machine_learning", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 112, | |
"effective_num_docs": 112 | |
}, | |
"lighteval|mmlu:management": { | |
"name": "mmlu:management", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "management", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 103, | |
"effective_num_docs": 103 | |
}, | |
"lighteval|mmlu:marketing": { | |
"name": "mmlu:marketing", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "marketing", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 234, | |
"effective_num_docs": 234 | |
}, | |
"lighteval|mmlu:medical_genetics": { | |
"name": "mmlu:medical_genetics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "medical_genetics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:miscellaneous": { | |
"name": "mmlu:miscellaneous", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "miscellaneous", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 783, | |
"effective_num_docs": 783 | |
}, | |
"lighteval|mmlu:moral_disputes": { | |
"name": "mmlu:moral_disputes", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_disputes", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 346, | |
"effective_num_docs": 346 | |
}, | |
"lighteval|mmlu:moral_scenarios": { | |
"name": "mmlu:moral_scenarios", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_scenarios", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 895, | |
"effective_num_docs": 895 | |
}, | |
"lighteval|mmlu:nutrition": { | |
"name": "mmlu:nutrition", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "nutrition", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 306, | |
"effective_num_docs": 306 | |
}, | |
"lighteval|mmlu:philosophy": { | |
"name": "mmlu:philosophy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "philosophy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 311, | |
"effective_num_docs": 311 | |
}, | |
"lighteval|mmlu:prehistory": { | |
"name": "mmlu:prehistory", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "prehistory", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 324, | |
"effective_num_docs": 324 | |
}, | |
"lighteval|mmlu:professional_accounting": { | |
"name": "mmlu:professional_accounting", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_accounting", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 282, | |
"effective_num_docs": 282 | |
}, | |
"lighteval|mmlu:professional_law": { | |
"name": "mmlu:professional_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 1534, | |
"effective_num_docs": 1534 | |
}, | |
"lighteval|mmlu:professional_medicine": { | |
"name": "mmlu:professional_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 272, | |
"effective_num_docs": 272 | |
}, | |
"lighteval|mmlu:professional_psychology": { | |
"name": "mmlu:professional_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 612, | |
"effective_num_docs": 612 | |
}, | |
"lighteval|mmlu:public_relations": { | |
"name": "mmlu:public_relations", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "public_relations", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 110, | |
"effective_num_docs": 110 | |
}, | |
"lighteval|mmlu:security_studies": { | |
"name": "mmlu:security_studies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "security_studies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 245, | |
"effective_num_docs": 245 | |
}, | |
"lighteval|mmlu:sociology": { | |
"name": "mmlu:sociology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "sociology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 201, | |
"effective_num_docs": 201 | |
}, | |
"lighteval|mmlu:us_foreign_policy": { | |
"name": "mmlu:us_foreign_policy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "us_foreign_policy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:virology": { | |
"name": "mmlu:virology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "virology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 166, | |
"effective_num_docs": 166 | |
}, | |
"lighteval|mmlu:world_religions": { | |
"name": "mmlu:world_religions", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "world_religions", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 171, | |
"effective_num_docs": 171 | |
} | |
}, | |
"summary_tasks": { | |
"lighteval|mmlu:abstract_algebra|5": { | |
"hashes": { | |
"hash_examples": "4c76229e00c9c0e9", | |
"hash_full_prompts": "c3130662e7cc91d3", | |
"hash_input_tokens": "b617a339eb3b3eb7", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:anatomy|5": { | |
"hashes": { | |
"hash_examples": "6a1f8104dccbd33b", | |
"hash_full_prompts": "05a97165c871964d", | |
"hash_input_tokens": "14e9962d3b1706ea", | |
"hash_cont_tokens": "025910e68cf29c3d" | |
}, | |
"truncated": 0, | |
"non_truncated": 135, | |
"padded": 540, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:astronomy|5": { | |
"hashes": { | |
"hash_examples": "1302effa3a76ce4c", | |
"hash_full_prompts": "68355efd63c4de09", | |
"hash_input_tokens": "44bd837a633de965", | |
"hash_cont_tokens": "1a66fd04f03e0517" | |
}, | |
"truncated": 0, | |
"non_truncated": 152, | |
"padded": 608, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:business_ethics|5": { | |
"hashes": { | |
"hash_examples": "03cb8bce5336419a", | |
"hash_full_prompts": "8f440e0924442390", | |
"hash_input_tokens": "16217026443317e4", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:clinical_knowledge|5": { | |
"hashes": { | |
"hash_examples": "ffbb9c7b2be257f9", | |
"hash_full_prompts": "595feee698057167", | |
"hash_input_tokens": "896539d33768791a", | |
"hash_cont_tokens": "de872053260a1588" | |
}, | |
"truncated": 0, | |
"non_truncated": 265, | |
"padded": 1060, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_biology|5": { | |
"hashes": { | |
"hash_examples": "3ee77f176f38eb8e", | |
"hash_full_prompts": "dcd354e231c805ee", | |
"hash_input_tokens": "56c8c2aa3e63f094", | |
"hash_cont_tokens": "9ace296b3e00bba3" | |
}, | |
"truncated": 0, | |
"non_truncated": 144, | |
"padded": 576, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_chemistry|5": { | |
"hashes": { | |
"hash_examples": "ce61a69c46d47aeb", | |
"hash_full_prompts": "a520ca0fd7868631", | |
"hash_input_tokens": "0049443634b997e3", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_computer_science|5": { | |
"hashes": { | |
"hash_examples": "32805b52d7d5daab", | |
"hash_full_prompts": "ae8f53adf4b6a6e3", | |
"hash_input_tokens": "894bbabad16b75a1", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_mathematics|5": { | |
"hashes": { | |
"hash_examples": "55da1a0a0bd33722", | |
"hash_full_prompts": "39cd3169534550f3", | |
"hash_input_tokens": "5bfda6d5c7af507c", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_medicine|5": { | |
"hashes": { | |
"hash_examples": "c33e143163049176", | |
"hash_full_prompts": "bca31c5d5f3a0e4a", | |
"hash_input_tokens": "13452a8f3d9b4b3d", | |
"hash_cont_tokens": "c80c0b5489bdbc5a" | |
}, | |
"truncated": 0, | |
"non_truncated": 173, | |
"padded": 692, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_physics|5": { | |
"hashes": { | |
"hash_examples": "ebdab1cdb7e555df", | |
"hash_full_prompts": "f819d74029f4a018", | |
"hash_input_tokens": "57c45bd30a378407", | |
"hash_cont_tokens": "569fcb9ac44734ae" | |
}, | |
"truncated": 0, | |
"non_truncated": 102, | |
"padded": 408, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:computer_security|5": { | |
"hashes": { | |
"hash_examples": "a24fd7d08a560921", | |
"hash_full_prompts": "d0f4d31508009cd6", | |
"hash_input_tokens": "0af9499b3cb67d95", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:conceptual_physics|5": { | |
"hashes": { | |
"hash_examples": "8300977a79386993", | |
"hash_full_prompts": "6e2f619c2f0da087", | |
"hash_input_tokens": "00b0c9ac0fc683e8", | |
"hash_cont_tokens": "6e88c64c1a76752a" | |
}, | |
"truncated": 0, | |
"non_truncated": 235, | |
"padded": 940, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:econometrics|5": { | |
"hashes": { | |
"hash_examples": "ddde36788a04a46f", | |
"hash_full_prompts": "3f81ad69c49e1691", | |
"hash_input_tokens": "9314d720a35c62b6", | |
"hash_cont_tokens": "a315e0e16c922c3c" | |
}, | |
"truncated": 0, | |
"non_truncated": 114, | |
"padded": 456, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:electrical_engineering|5": { | |
"hashes": { | |
"hash_examples": "acbc5def98c19b3f", | |
"hash_full_prompts": "f5ab31c3b1d51682", | |
"hash_input_tokens": "863125c49d60d6a4", | |
"hash_cont_tokens": "44c72e6a7422c304" | |
}, | |
"truncated": 0, | |
"non_truncated": 145, | |
"padded": 580, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:elementary_mathematics|5": { | |
"hashes": { | |
"hash_examples": "146e61d07497a9bd", | |
"hash_full_prompts": "3e6f38a631108730", | |
"hash_input_tokens": "ed58bf384a932c74", | |
"hash_cont_tokens": "cac0a6c304791bb7" | |
}, | |
"truncated": 0, | |
"non_truncated": 378, | |
"padded": 1512, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:formal_logic|5": { | |
"hashes": { | |
"hash_examples": "8635216e1909a03f", | |
"hash_full_prompts": "2db73981fed3cf02", | |
"hash_input_tokens": "78b4957033a990a3", | |
"hash_cont_tokens": "8801fad3bbc72e57" | |
}, | |
"truncated": 0, | |
"non_truncated": 126, | |
"padded": 504, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:global_facts|5": { | |
"hashes": { | |
"hash_examples": "30b315aa6353ee47", | |
"hash_full_prompts": "3b5eef82483c02a6", | |
"hash_input_tokens": "65cf7f73e20e1bc1", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_biology|5": { | |
"hashes": { | |
"hash_examples": "c9136373af2180de", | |
"hash_full_prompts": "97a500550ada1104", | |
"hash_input_tokens": "1c299ee1038cf043", | |
"hash_cont_tokens": "2d57d9e2c5a1fd64" | |
}, | |
"truncated": 0, | |
"non_truncated": 310, | |
"padded": 1240, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_chemistry|5": { | |
"hashes": { | |
"hash_examples": "b0661bfa1add6404", | |
"hash_full_prompts": "7d42623066fb1e8e", | |
"hash_input_tokens": "38aa4f175383a891", | |
"hash_cont_tokens": "bb0fd92673ddfb31" | |
}, | |
"truncated": 0, | |
"non_truncated": 203, | |
"padded": 812, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_computer_science|5": { | |
"hashes": { | |
"hash_examples": "80fc1d623a3d665f", | |
"hash_full_prompts": "2af192ae1faf8c63", | |
"hash_input_tokens": "5a1229c044a91023", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_european_history|5": { | |
"hashes": { | |
"hash_examples": "854da6e5af0fe1a1", | |
"hash_full_prompts": "189af6182c551e23", | |
"hash_input_tokens": "f0e54538395a12c1", | |
"hash_cont_tokens": "16e494cddccc4a04" | |
}, | |
"truncated": 0, | |
"non_truncated": 165, | |
"padded": 656, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_geography|5": { | |
"hashes": { | |
"hash_examples": "7dc963c7acd19ad8", | |
"hash_full_prompts": "0906f591b7f79a10", | |
"hash_input_tokens": "40aceb5dde64fe64", | |
"hash_cont_tokens": "16b7f65a07b3d47b" | |
}, | |
"truncated": 0, | |
"non_truncated": 198, | |
"padded": 792, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics|5": { | |
"hashes": { | |
"hash_examples": "1f675dcdebc9758f", | |
"hash_full_prompts": "7223a4aebabcdcbd", | |
"hash_input_tokens": "96a4444be05f5ede", | |
"hash_cont_tokens": "476e87fd675136aa" | |
}, | |
"truncated": 0, | |
"non_truncated": 193, | |
"padded": 772, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics|5": { | |
"hashes": { | |
"hash_examples": "2fb32cf2d80f0b35", | |
"hash_full_prompts": "9c32c005a808c453", | |
"hash_input_tokens": "a78ba4100d84ecc5", | |
"hash_cont_tokens": "b0c7b4c5f7bdf3e7" | |
}, | |
"truncated": 0, | |
"non_truncated": 390, | |
"padded": 1560, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_mathematics|5": { | |
"hashes": { | |
"hash_examples": "fd6646fdb5d58a1f", | |
"hash_full_prompts": "61845b4e3d0eafe9", | |
"hash_input_tokens": "72e903543d60e864", | |
"hash_cont_tokens": "1a05d6ff49846fd1" | |
}, | |
"truncated": 0, | |
"non_truncated": 270, | |
"padded": 1080, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_microeconomics|5": { | |
"hashes": { | |
"hash_examples": "2118f21f71d87d84", | |
"hash_full_prompts": "020f7f6e77a6b641", | |
"hash_input_tokens": "8b428c95ab32cdeb", | |
"hash_cont_tokens": "0e7f0645ffffd6cd" | |
}, | |
"truncated": 0, | |
"non_truncated": 238, | |
"padded": 949, | |
"non_padded": 3, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_physics|5": { | |
"hashes": { | |
"hash_examples": "dc3ce06378548565", | |
"hash_full_prompts": "571b28c0f53b90a0", | |
"hash_input_tokens": "0862d9ba4184f5e6", | |
"hash_cont_tokens": "41ca6560b8c10183" | |
}, | |
"truncated": 0, | |
"non_truncated": 151, | |
"padded": 604, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_psychology|5": { | |
"hashes": { | |
"hash_examples": "c8d1d98a40e11f2f", | |
"hash_full_prompts": "896e9a19476b90ed", | |
"hash_input_tokens": "539679e51cf0dadf", | |
"hash_cont_tokens": "53a17ff85c607844" | |
}, | |
"truncated": 0, | |
"non_truncated": 545, | |
"padded": 2178, | |
"non_padded": 2, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_statistics|5": { | |
"hashes": { | |
"hash_examples": "666c8759b98ee4ff", | |
"hash_full_prompts": "9ca986b471235e07", | |
"hash_input_tokens": "d2df2e9ec9cc5ff9", | |
"hash_cont_tokens": "bc9063ad140cc941" | |
}, | |
"truncated": 0, | |
"non_truncated": 216, | |
"padded": 864, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_us_history|5": { | |
"hashes": { | |
"hash_examples": "95fef1c4b7d3f81e", | |
"hash_full_prompts": "b4616b587c96945d", | |
"hash_input_tokens": "1b9a891fe1e28335", | |
"hash_cont_tokens": "5cf777085ba01096" | |
}, | |
"truncated": 0, | |
"non_truncated": 204, | |
"padded": 816, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_world_history|5": { | |
"hashes": { | |
"hash_examples": "7e5085b6184b0322", | |
"hash_full_prompts": "e790690fb05fa0d1", | |
"hash_input_tokens": "60fc90341eab6ac2", | |
"hash_cont_tokens": "152af2d9e4830517" | |
}, | |
"truncated": 0, | |
"non_truncated": 237, | |
"padded": 948, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:human_aging|5": { | |
"hashes": { | |
"hash_examples": "c17333e7c7c10797", | |
"hash_full_prompts": "327f9f213650f977", | |
"hash_input_tokens": "3527cd9b1efd6b7c", | |
"hash_cont_tokens": "da4d9eaa044021dd" | |
}, | |
"truncated": 0, | |
"non_truncated": 223, | |
"padded": 892, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:human_sexuality|5": { | |
"hashes": { | |
"hash_examples": "4edd1e9045df5e3d", | |
"hash_full_prompts": "0b6a52b3d3863745", | |
"hash_input_tokens": "7a97714c98ec3df0", | |
"hash_cont_tokens": "1b99e384258a4eeb" | |
}, | |
"truncated": 0, | |
"non_truncated": 131, | |
"padded": 524, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:international_law|5": { | |
"hashes": { | |
"hash_examples": "db2fa00d771a062a", | |
"hash_full_prompts": "429b8d84640cdf75", | |
"hash_input_tokens": "7e572d7ea1a3e509", | |
"hash_cont_tokens": "cbf02c30cdded208" | |
}, | |
"truncated": 0, | |
"non_truncated": 121, | |
"padded": 484, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:jurisprudence|5": { | |
"hashes": { | |
"hash_examples": "e956f86b124076fe", | |
"hash_full_prompts": "571f9505d9f6fa3d", | |
"hash_input_tokens": "e771bba2041d48e1", | |
"hash_cont_tokens": "4b248cf879d97a50" | |
}, | |
"truncated": 0, | |
"non_truncated": 108, | |
"padded": 424, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:logical_fallacies|5": { | |
"hashes": { | |
"hash_examples": "956e0e6365ab79f1", | |
"hash_full_prompts": "abf6d18a0245c552", | |
"hash_input_tokens": "7016f4de62d61e8f", | |
"hash_cont_tokens": "6d9c35172b158838" | |
}, | |
"truncated": 0, | |
"non_truncated": 163, | |
"padded": 632, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:machine_learning|5": { | |
"hashes": { | |
"hash_examples": "397997cc6f4d581e", | |
"hash_full_prompts": "8b9115560a815fab", | |
"hash_input_tokens": "a718bd4f9fb8eab0", | |
"hash_cont_tokens": "66c3ec85fee2fc98" | |
}, | |
"truncated": 0, | |
"non_truncated": 112, | |
"padded": 448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:management|5": { | |
"hashes": { | |
"hash_examples": "2bcbe6f6ca63d740", | |
"hash_full_prompts": "f18191cecdc130be", | |
"hash_input_tokens": "dd6a99048a822e5a", | |
"hash_cont_tokens": "5e2470abd1fb9d10" | |
}, | |
"truncated": 0, | |
"non_truncated": 103, | |
"padded": 412, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:marketing|5": { | |
"hashes": { | |
"hash_examples": "8ddb20d964a1b065", | |
"hash_full_prompts": "ad9ff50246bf7d49", | |
"hash_input_tokens": "fb59075fb468b035", | |
"hash_cont_tokens": "27fe68d9630f8999" | |
}, | |
"truncated": 0, | |
"non_truncated": 234, | |
"padded": 916, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:medical_genetics|5": { | |
"hashes": { | |
"hash_examples": "182a71f4763d2cea", | |
"hash_full_prompts": "e95c568978da29c1", | |
"hash_input_tokens": "6ec76fde9dca6553", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:miscellaneous|5": { | |
"hashes": { | |
"hash_examples": "4c404fdbb4ca57fc", | |
"hash_full_prompts": "468305dc71aa217c", | |
"hash_input_tokens": "9ab5ce7430aeeff7", | |
"hash_cont_tokens": "dfa423a160edd337" | |
}, | |
"truncated": 0, | |
"non_truncated": 783, | |
"padded": 3128, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:moral_disputes|5": { | |
"hashes": { | |
"hash_examples": "60cbd2baa3fea5c9", | |
"hash_full_prompts": "7a24f9c6f83420f2", | |
"hash_input_tokens": "17712020d9c38d0f", | |
"hash_cont_tokens": "bef966e6669349be" | |
}, | |
"truncated": 0, | |
"non_truncated": 346, | |
"padded": 1380, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:moral_scenarios|5": { | |
"hashes": { | |
"hash_examples": "fd8b0431fbdd75ef", | |
"hash_full_prompts": "8723c262038898c8", | |
"hash_input_tokens": "a4a16b58339a1b08", | |
"hash_cont_tokens": "a7bfdd944d86bcb5" | |
}, | |
"truncated": 0, | |
"non_truncated": 895, | |
"padded": 3575, | |
"non_padded": 5, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:nutrition|5": { | |
"hashes": { | |
"hash_examples": "71e55e2b829b6528", | |
"hash_full_prompts": "cc3034694d476c82", | |
"hash_input_tokens": "4589c74e55901b66", | |
"hash_cont_tokens": "fcda7736026f2449" | |
}, | |
"truncated": 0, | |
"non_truncated": 306, | |
"padded": 1224, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:philosophy|5": { | |
"hashes": { | |
"hash_examples": "a6d489a8d208fa4b", | |
"hash_full_prompts": "d92988a447a6ce08", | |
"hash_input_tokens": "fa85837aaec1aef6", | |
"hash_cont_tokens": "0f39b851342e8986" | |
}, | |
"truncated": 0, | |
"non_truncated": 311, | |
"padded": 1244, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:prehistory|5": { | |
"hashes": { | |
"hash_examples": "6cc50f032a19acaa", | |
"hash_full_prompts": "0d0d33c8f9bed861", | |
"hash_input_tokens": "735ed41425466729", | |
"hash_cont_tokens": "b60e45d3e9856b35" | |
}, | |
"truncated": 0, | |
"non_truncated": 324, | |
"padded": 1280, | |
"non_padded": 16, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_accounting|5": { | |
"hashes": { | |
"hash_examples": "50f57ab32f5f6cea", | |
"hash_full_prompts": "9c809e7b8ca8ec1f", | |
"hash_input_tokens": "b0c851d675e5355b", | |
"hash_cont_tokens": "a0c4e121b7293818" | |
}, | |
"truncated": 0, | |
"non_truncated": 282, | |
"padded": 1112, | |
"non_padded": 16, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_law|5": { | |
"hashes": { | |
"hash_examples": "a8fdc85c64f4b215", | |
"hash_full_prompts": "246b3e8a9054a5de", | |
"hash_input_tokens": "c27b16ef17f69218", | |
"hash_cont_tokens": "68b662abeba54fbc" | |
}, | |
"truncated": 0, | |
"non_truncated": 1534, | |
"padded": 6136, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_medicine|5": { | |
"hashes": { | |
"hash_examples": "c373a28a3050a73a", | |
"hash_full_prompts": "f66dd653b5c5022b", | |
"hash_input_tokens": "955343929a6793cb", | |
"hash_cont_tokens": "6caeac5412bb4a09" | |
}, | |
"truncated": 0, | |
"non_truncated": 272, | |
"padded": 1088, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_psychology|5": { | |
"hashes": { | |
"hash_examples": "bf5254fe818356af", | |
"hash_full_prompts": "03228f18e58fb42c", | |
"hash_input_tokens": "a18463f8187e4322", | |
"hash_cont_tokens": "79b091252a1095a9" | |
}, | |
"truncated": 0, | |
"non_truncated": 612, | |
"padded": 2448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:public_relations|5": { | |
"hashes": { | |
"hash_examples": "b66d52e28e7d14e0", | |
"hash_full_prompts": "2717ec2f9cc3ea3f", | |
"hash_input_tokens": "3118fb19254356b8", | |
"hash_cont_tokens": "987115a77c8704f0" | |
}, | |
"truncated": 0, | |
"non_truncated": 110, | |
"padded": 436, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:security_studies|5": { | |
"hashes": { | |
"hash_examples": "514c14feaf000ad9", | |
"hash_full_prompts": "fd10221b4be3bf11", | |
"hash_input_tokens": "619ae48b231f13d1", | |
"hash_cont_tokens": "6c35bc7e96074b27" | |
}, | |
"truncated": 0, | |
"non_truncated": 245, | |
"padded": 980, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:sociology|5": { | |
"hashes": { | |
"hash_examples": "f6c9bc9d18c80870", | |
"hash_full_prompts": "16bc50365bda7e74", | |
"hash_input_tokens": "e77c9db987dfeede", | |
"hash_cont_tokens": "32af622f73b2e657" | |
}, | |
"truncated": 0, | |
"non_truncated": 201, | |
"padded": 804, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:us_foreign_policy|5": { | |
"hashes": { | |
"hash_examples": "ed7b78629db6678f", | |
"hash_full_prompts": "249ca3f4999e41ad", | |
"hash_input_tokens": "0fa36661f20b1b58", | |
"hash_cont_tokens": "9e1c9ca2c51de57e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:virology|5": { | |
"hashes": { | |
"hash_examples": "bc52ffdc3f9b994a", | |
"hash_full_prompts": "09939d976cecacd7", | |
"hash_input_tokens": "b8237a5fe3c03938", | |
"hash_cont_tokens": "beded8c3660dc8f5" | |
}, | |
"truncated": 0, | |
"non_truncated": 166, | |
"padded": 664, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:world_religions|5": { | |
"hashes": { | |
"hash_examples": "ecdb4a4f94f62930", | |
"hash_full_prompts": "addabd4dc9734c08", | |
"hash_input_tokens": "23943b2941071751", | |
"hash_cont_tokens": "9b1952a4af3d6a73" | |
}, | |
"truncated": 0, | |
"non_truncated": 171, | |
"padded": 684, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "341a076d0beb7048", | |
"hash_full_prompts": "11973fef11ba4c9d", | |
"hash_input_tokens": "0e9d676b8e37ef05", | |
"hash_cont_tokens": "25e9f343d6b95644" | |
}, | |
"truncated": 0, | |
"non_truncated": 14042, | |
"padded": 56062, | |
"non_padded": 106, | |
"num_truncated_few_shots": 0 | |
} | |
} |