eval_results/HuggingFaceH4/mistral-7b-odpo/v1.14/mmlu/results_2024-03-28T14-01-38.404907.json (uploaded with huggingface_hub, commit 8b47767)
{
"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"override_batch_size": 1,
"max_samples": null,
"job_id": "",
"start_time": 804003.164058114,
"end_time": 805034.587999627,
"total_evaluation_time_secondes": "1031.4239415129414",
"model_name": "HuggingFaceH4/mistral-7b-odpo",
"model_sha": "7fe8e064e8aa0eacb19a4eed77408b333c15dfbc",
"model_dtype": "torch.bfloat16",
"model_size": "13.99 GB",
"config": null
},
"results": {
"leaderboard|mmlu:abstract_algebra|5": {
"acc": 0.37,
"acc_stderr": 0.04852365870939099
},
"leaderboard|mmlu:anatomy|5": {
"acc": 0.5777777777777777,
"acc_stderr": 0.04266763404099582
},
"leaderboard|mmlu:astronomy|5": {
"acc": 0.6052631578947368,
"acc_stderr": 0.039777499346220734
},
"leaderboard|mmlu:business_ethics|5": {
"acc": 0.56,
"acc_stderr": 0.04988876515698589
},
"leaderboard|mmlu:clinical_knowledge|5": {
"acc": 0.6830188679245283,
"acc_stderr": 0.02863723563980089
},
"leaderboard|mmlu:college_biology|5": {
"acc": 0.6875,
"acc_stderr": 0.038760854559127644
},
"leaderboard|mmlu:college_chemistry|5": {
"acc": 0.39,
"acc_stderr": 0.049020713000019756
},
"leaderboard|mmlu:college_computer_science|5": {
"acc": 0.5,
"acc_stderr": 0.050251890762960605
},
"leaderboard|mmlu:college_mathematics|5": {
"acc": 0.29,
"acc_stderr": 0.04560480215720683
},
"leaderboard|mmlu:college_medicine|5": {
"acc": 0.630057803468208,
"acc_stderr": 0.036812296333943194
},
"leaderboard|mmlu:college_physics|5": {
"acc": 0.39215686274509803,
"acc_stderr": 0.04858083574266345
},
"leaderboard|mmlu:computer_security|5": {
"acc": 0.75,
"acc_stderr": 0.04351941398892446
},
"leaderboard|mmlu:conceptual_physics|5": {
"acc": 0.48936170212765956,
"acc_stderr": 0.03267862331014063
},
"leaderboard|mmlu:econometrics|5": {
"acc": 0.39473684210526316,
"acc_stderr": 0.04598188057816542
},
"leaderboard|mmlu:electrical_engineering|5": {
"acc": 0.503448275862069,
"acc_stderr": 0.04166567577101579
},
"leaderboard|mmlu:elementary_mathematics|5": {
"acc": 0.3915343915343915,
"acc_stderr": 0.025138091388851105
},
"leaderboard|mmlu:formal_logic|5": {
"acc": 0.4126984126984127,
"acc_stderr": 0.04403438954768176
},
"leaderboard|mmlu:global_facts|5": {
"acc": 0.37,
"acc_stderr": 0.04852365870939099
},
"leaderboard|mmlu:high_school_biology|5": {
"acc": 0.7161290322580646,
"acc_stderr": 0.02564938106302926
},
"leaderboard|mmlu:high_school_chemistry|5": {
"acc": 0.46798029556650245,
"acc_stderr": 0.035107665979592154
},
"leaderboard|mmlu:high_school_computer_science|5": {
"acc": 0.62,
"acc_stderr": 0.04878317312145632
},
"leaderboard|mmlu:high_school_european_history|5": {
"acc": 0.7454545454545455,
"acc_stderr": 0.03401506715249039
},
"leaderboard|mmlu:high_school_geography|5": {
"acc": 0.7626262626262627,
"acc_stderr": 0.030313710538198913
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"acc": 0.8082901554404145,
"acc_stderr": 0.028408953626245282
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"acc": 0.558974358974359,
"acc_stderr": 0.02517404838400074
},
"leaderboard|mmlu:high_school_mathematics|5": {
"acc": 0.3111111111111111,
"acc_stderr": 0.02822644674968352
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"acc": 0.592436974789916,
"acc_stderr": 0.03191863374478466
},
"leaderboard|mmlu:high_school_physics|5": {
"acc": 0.33774834437086093,
"acc_stderr": 0.038615575462551684
},
"leaderboard|mmlu:high_school_psychology|5": {
"acc": 0.7908256880733945,
"acc_stderr": 0.017437937173343233
},
"leaderboard|mmlu:high_school_statistics|5": {
"acc": 0.47685185185185186,
"acc_stderr": 0.03406315360711507
},
"leaderboard|mmlu:high_school_us_history|5": {
"acc": 0.7598039215686274,
"acc_stderr": 0.02998373305591361
},
"leaderboard|mmlu:high_school_world_history|5": {
"acc": 0.7257383966244726,
"acc_stderr": 0.02904133351059804
},
"leaderboard|mmlu:human_aging|5": {
"acc": 0.6681614349775785,
"acc_stderr": 0.031602951437766785
},
"leaderboard|mmlu:human_sexuality|5": {
"acc": 0.6335877862595419,
"acc_stderr": 0.04225875451969638
},
"leaderboard|mmlu:international_law|5": {
"acc": 0.768595041322314,
"acc_stderr": 0.03849856098794088
},
"leaderboard|mmlu:jurisprudence|5": {
"acc": 0.7777777777777778,
"acc_stderr": 0.040191074725573483
},
"leaderboard|mmlu:logical_fallacies|5": {
"acc": 0.6503067484662577,
"acc_stderr": 0.03746668325470021
},
"leaderboard|mmlu:machine_learning|5": {
"acc": 0.4642857142857143,
"acc_stderr": 0.04733667890053756
},
"leaderboard|mmlu:management|5": {
"acc": 0.7378640776699029,
"acc_stderr": 0.04354631077260595
},
"leaderboard|mmlu:marketing|5": {
"acc": 0.8589743589743589,
"acc_stderr": 0.022801382534597518
},
"leaderboard|mmlu:medical_genetics|5": {
"acc": 0.66,
"acc_stderr": 0.04760952285695237
},
"leaderboard|mmlu:miscellaneous|5": {
"acc": 0.7905491698595147,
"acc_stderr": 0.014551310568143705
},
"leaderboard|mmlu:moral_disputes|5": {
"acc": 0.684971098265896,
"acc_stderr": 0.025009313790069692
},
"leaderboard|mmlu:moral_scenarios|5": {
"acc": 0.4100558659217877,
"acc_stderr": 0.016449708209026078
},
"leaderboard|mmlu:nutrition|5": {
"acc": 0.6470588235294118,
"acc_stderr": 0.027363593284684965
},
"leaderboard|mmlu:philosophy|5": {
"acc": 0.6784565916398714,
"acc_stderr": 0.026527724079528872
},
"leaderboard|mmlu:prehistory|5": {
"acc": 0.6759259259259259,
"acc_stderr": 0.02604176620271716
},
"leaderboard|mmlu:professional_accounting|5": {
"acc": 0.44680851063829785,
"acc_stderr": 0.02965823509766691
},
"leaderboard|mmlu:professional_law|5": {
"acc": 0.40808344198174706,
"acc_stderr": 0.012552598958563664
},
"leaderboard|mmlu:professional_medicine|5": {
"acc": 0.6213235294117647,
"acc_stderr": 0.02946513363977613
},
"leaderboard|mmlu:professional_psychology|5": {
"acc": 0.6029411764705882,
"acc_stderr": 0.019794488900024117
},
"leaderboard|mmlu:public_relations|5": {
"acc": 0.5727272727272728,
"acc_stderr": 0.04738198703545483
},
"leaderboard|mmlu:security_studies|5": {
"acc": 0.6448979591836734,
"acc_stderr": 0.030635655150387638
},
"leaderboard|mmlu:sociology|5": {
"acc": 0.8059701492537313,
"acc_stderr": 0.027962677604768924
},
"leaderboard|mmlu:us_foreign_policy|5": {
"acc": 0.82,
"acc_stderr": 0.038612291966536934
},
"leaderboard|mmlu:virology|5": {
"acc": 0.5,
"acc_stderr": 0.03892494720807614
},
"leaderboard|mmlu:world_religions|5": {
"acc": 0.8304093567251462,
"acc_stderr": 0.02878210810540171
},
"leaderboard|mmlu:_average|5": {
"acc": 0.5970395937564318,
"acc_stderr": 0.03487424897725769
}
},
"versions": {
"leaderboard|mmlu:abstract_algebra|5": 0,
"leaderboard|mmlu:anatomy|5": 0,
"leaderboard|mmlu:astronomy|5": 0,
"leaderboard|mmlu:business_ethics|5": 0,
"leaderboard|mmlu:clinical_knowledge|5": 0,
"leaderboard|mmlu:college_biology|5": 0,
"leaderboard|mmlu:college_chemistry|5": 0,
"leaderboard|mmlu:college_computer_science|5": 0,
"leaderboard|mmlu:college_mathematics|5": 0,
"leaderboard|mmlu:college_medicine|5": 0,
"leaderboard|mmlu:college_physics|5": 0,
"leaderboard|mmlu:computer_security|5": 0,
"leaderboard|mmlu:conceptual_physics|5": 0,
"leaderboard|mmlu:econometrics|5": 0,
"leaderboard|mmlu:electrical_engineering|5": 0,
"leaderboard|mmlu:elementary_mathematics|5": 0,
"leaderboard|mmlu:formal_logic|5": 0,
"leaderboard|mmlu:global_facts|5": 0,
"leaderboard|mmlu:high_school_biology|5": 0,
"leaderboard|mmlu:high_school_chemistry|5": 0,
"leaderboard|mmlu:high_school_computer_science|5": 0,
"leaderboard|mmlu:high_school_european_history|5": 0,
"leaderboard|mmlu:high_school_geography|5": 0,
"leaderboard|mmlu:high_school_government_and_politics|5": 0,
"leaderboard|mmlu:high_school_macroeconomics|5": 0,
"leaderboard|mmlu:high_school_mathematics|5": 0,
"leaderboard|mmlu:high_school_microeconomics|5": 0,
"leaderboard|mmlu:high_school_physics|5": 0,
"leaderboard|mmlu:high_school_psychology|5": 0,
"leaderboard|mmlu:high_school_statistics|5": 0,
"leaderboard|mmlu:high_school_us_history|5": 0,
"leaderboard|mmlu:high_school_world_history|5": 0,
"leaderboard|mmlu:human_aging|5": 0,
"leaderboard|mmlu:human_sexuality|5": 0,
"leaderboard|mmlu:international_law|5": 0,
"leaderboard|mmlu:jurisprudence|5": 0,
"leaderboard|mmlu:logical_fallacies|5": 0,
"leaderboard|mmlu:machine_learning|5": 0,
"leaderboard|mmlu:management|5": 0,
"leaderboard|mmlu:marketing|5": 0,
"leaderboard|mmlu:medical_genetics|5": 0,
"leaderboard|mmlu:miscellaneous|5": 0,
"leaderboard|mmlu:moral_disputes|5": 0,
"leaderboard|mmlu:moral_scenarios|5": 0,
"leaderboard|mmlu:nutrition|5": 0,
"leaderboard|mmlu:philosophy|5": 0,
"leaderboard|mmlu:prehistory|5": 0,
"leaderboard|mmlu:professional_accounting|5": 0,
"leaderboard|mmlu:professional_law|5": 0,
"leaderboard|mmlu:professional_medicine|5": 0,
"leaderboard|mmlu:professional_psychology|5": 0,
"leaderboard|mmlu:public_relations|5": 0,
"leaderboard|mmlu:security_studies|5": 0,
"leaderboard|mmlu:sociology|5": 0,
"leaderboard|mmlu:us_foreign_policy|5": 0,
"leaderboard|mmlu:virology|5": 0,
"leaderboard|mmlu:world_religions|5": 0
},
"config_tasks": {
"leaderboard|mmlu:abstract_algebra": {
"name": "mmlu:abstract_algebra",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "abstract_algebra",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:anatomy": {
"name": "mmlu:anatomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "anatomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 135,
"effective_num_docs": 135,
"trust_dataset": true
},
"leaderboard|mmlu:astronomy": {
"name": "mmlu:astronomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "astronomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 152,
"effective_num_docs": 152,
"trust_dataset": true
},
"leaderboard|mmlu:business_ethics": {
"name": "mmlu:business_ethics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "business_ethics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:clinical_knowledge": {
"name": "mmlu:clinical_knowledge",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "clinical_knowledge",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 265,
"effective_num_docs": 265,
"trust_dataset": true
},
"leaderboard|mmlu:college_biology": {
"name": "mmlu:college_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 144,
"effective_num_docs": 144,
"trust_dataset": true
},
"leaderboard|mmlu:college_chemistry": {
"name": "mmlu:college_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:college_computer_science": {
"name": "mmlu:college_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:college_mathematics": {
"name": "mmlu:college_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:college_medicine": {
"name": "mmlu:college_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 173,
"effective_num_docs": 173,
"trust_dataset": true
},
"leaderboard|mmlu:college_physics": {
"name": "mmlu:college_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 102,
"effective_num_docs": 102,
"trust_dataset": true
},
"leaderboard|mmlu:computer_security": {
"name": "mmlu:computer_security",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "computer_security",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:conceptual_physics": {
"name": "mmlu:conceptual_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "conceptual_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 235,
"effective_num_docs": 235,
"trust_dataset": true
},
"leaderboard|mmlu:econometrics": {
"name": "mmlu:econometrics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "econometrics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 114,
"effective_num_docs": 114,
"trust_dataset": true
},
"leaderboard|mmlu:electrical_engineering": {
"name": "mmlu:electrical_engineering",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "electrical_engineering",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 145,
"effective_num_docs": 145,
"trust_dataset": true
},
"leaderboard|mmlu:elementary_mathematics": {
"name": "mmlu:elementary_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "elementary_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 378,
"effective_num_docs": 378,
"trust_dataset": true
},
"leaderboard|mmlu:formal_logic": {
"name": "mmlu:formal_logic",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "formal_logic",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 126,
"effective_num_docs": 126,
"trust_dataset": true
},
"leaderboard|mmlu:global_facts": {
"name": "mmlu:global_facts",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "global_facts",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_biology": {
"name": "mmlu:high_school_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 310,
"effective_num_docs": 310,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_chemistry": {
"name": "mmlu:high_school_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 203,
"effective_num_docs": 203,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_computer_science": {
"name": "mmlu:high_school_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_european_history": {
"name": "mmlu:high_school_european_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_european_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 165,
"effective_num_docs": 165,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_geography": {
"name": "mmlu:high_school_geography",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_geography",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 198,
"effective_num_docs": 198,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_government_and_politics": {
"name": "mmlu:high_school_government_and_politics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_government_and_politics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 193,
"effective_num_docs": 193,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_macroeconomics": {
"name": "mmlu:high_school_macroeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_macroeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 390,
"effective_num_docs": 390,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_mathematics": {
"name": "mmlu:high_school_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 270,
"effective_num_docs": 270,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_microeconomics": {
"name": "mmlu:high_school_microeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_microeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 238,
"effective_num_docs": 238,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_physics": {
"name": "mmlu:high_school_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 151,
"effective_num_docs": 151,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_psychology": {
"name": "mmlu:high_school_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 545,
"effective_num_docs": 545,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_statistics": {
"name": "mmlu:high_school_statistics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_statistics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 216,
"effective_num_docs": 216,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_us_history": {
"name": "mmlu:high_school_us_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_us_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 204,
"effective_num_docs": 204,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_world_history": {
"name": "mmlu:high_school_world_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_world_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 237,
"effective_num_docs": 237,
"trust_dataset": true
},
"leaderboard|mmlu:human_aging": {
"name": "mmlu:human_aging",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_aging",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 223,
"effective_num_docs": 223,
"trust_dataset": true
},
"leaderboard|mmlu:human_sexuality": {
"name": "mmlu:human_sexuality",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_sexuality",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 131,
"effective_num_docs": 131,
"trust_dataset": true
},
"leaderboard|mmlu:international_law": {
"name": "mmlu:international_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "international_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 121,
"effective_num_docs": 121,
"trust_dataset": true
},
"leaderboard|mmlu:jurisprudence": {
"name": "mmlu:jurisprudence",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "jurisprudence",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 108,
"effective_num_docs": 108,
"trust_dataset": true
},
"leaderboard|mmlu:logical_fallacies": {
"name": "mmlu:logical_fallacies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "logical_fallacies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 163,
"effective_num_docs": 163,
"trust_dataset": true
},
"leaderboard|mmlu:machine_learning": {
"name": "mmlu:machine_learning",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "machine_learning",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 112,
"effective_num_docs": 112,
"trust_dataset": true
},
"leaderboard|mmlu:management": {
"name": "mmlu:management",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "management",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 103,
"effective_num_docs": 103,
"trust_dataset": true
},
"leaderboard|mmlu:marketing": {
"name": "mmlu:marketing",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "marketing",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 234,
"effective_num_docs": 234,
"trust_dataset": true
},
"leaderboard|mmlu:medical_genetics": {
"name": "mmlu:medical_genetics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "medical_genetics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:miscellaneous": {
"name": "mmlu:miscellaneous",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "miscellaneous",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 783,
"effective_num_docs": 783,
"trust_dataset": true
},
"leaderboard|mmlu:moral_disputes": {
"name": "mmlu:moral_disputes",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_disputes",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 346,
"effective_num_docs": 346,
"trust_dataset": true
},
"leaderboard|mmlu:moral_scenarios": {
"name": "mmlu:moral_scenarios",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_scenarios",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 895,
"effective_num_docs": 895,
"trust_dataset": true
},
"leaderboard|mmlu:nutrition": {
"name": "mmlu:nutrition",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "nutrition",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 306,
"effective_num_docs": 306,
"trust_dataset": true
},
"leaderboard|mmlu:philosophy": {
"name": "mmlu:philosophy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "philosophy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 311,
"effective_num_docs": 311,
"trust_dataset": true
},
"leaderboard|mmlu:prehistory": {
"name": "mmlu:prehistory",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "prehistory",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 324,
"effective_num_docs": 324,
"trust_dataset": true
},
"leaderboard|mmlu:professional_accounting": {
"name": "mmlu:professional_accounting",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_accounting",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 282,
"effective_num_docs": 282,
"trust_dataset": true
},
"leaderboard|mmlu:professional_law": {
"name": "mmlu:professional_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 1534,
"effective_num_docs": 1534,
"trust_dataset": true
},
"leaderboard|mmlu:professional_medicine": {
"name": "mmlu:professional_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 272,
"effective_num_docs": 272,
"trust_dataset": true
},
"leaderboard|mmlu:professional_psychology": {
"name": "mmlu:professional_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 612,
"effective_num_docs": 612,
"trust_dataset": true
},
"leaderboard|mmlu:public_relations": {
"name": "mmlu:public_relations",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "public_relations",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 110,
"effective_num_docs": 110,
"trust_dataset": true
},
"leaderboard|mmlu:security_studies": {
"name": "mmlu:security_studies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "security_studies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 245,
"effective_num_docs": 245,
"trust_dataset": true
},
"leaderboard|mmlu:sociology": {
"name": "mmlu:sociology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "sociology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 201,
"effective_num_docs": 201,
"trust_dataset": true
},
"leaderboard|mmlu:us_foreign_policy": {
"name": "mmlu:us_foreign_policy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "us_foreign_policy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:virology": {
"name": "mmlu:virology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "virology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 166,
"effective_num_docs": 166,
"trust_dataset": true
},
"leaderboard|mmlu:world_religions": {
"name": "mmlu:world_religions",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "world_religions",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 171,
"effective_num_docs": 171,
"trust_dataset": true
}
},
"summary_tasks": {
"leaderboard|mmlu:abstract_algebra|5": {
"hashes": {
"hash_examples": "4c76229e00c9c0e9",
"hash_full_prompts": "c3130662e7cc91d3",
"hash_input_tokens": "b617a339eb3b3eb7",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:anatomy|5": {
"hashes": {
"hash_examples": "6a1f8104dccbd33b",
"hash_full_prompts": "05a97165c871964d",
"hash_input_tokens": "14e9962d3b1706ea",
"hash_cont_tokens": "025910e68cf29c3d"
},
"truncated": 0,
"non_truncated": 135,
"padded": 540,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:astronomy|5": {
"hashes": {
"hash_examples": "1302effa3a76ce4c",
"hash_full_prompts": "68355efd63c4de09",
"hash_input_tokens": "44bd837a633de965",
"hash_cont_tokens": "1a66fd04f03e0517"
},
"truncated": 0,
"non_truncated": 152,
"padded": 608,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:business_ethics|5": {
"hashes": {
"hash_examples": "03cb8bce5336419a",
"hash_full_prompts": "8f440e0924442390",
"hash_input_tokens": "16217026443317e4",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:clinical_knowledge|5": {
"hashes": {
"hash_examples": "ffbb9c7b2be257f9",
"hash_full_prompts": "595feee698057167",
"hash_input_tokens": "896539d33768791a",
"hash_cont_tokens": "de872053260a1588"
},
"truncated": 0,
"non_truncated": 265,
"padded": 1060,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_biology|5": {
"hashes": {
"hash_examples": "3ee77f176f38eb8e",
"hash_full_prompts": "dcd354e231c805ee",
"hash_input_tokens": "56c8c2aa3e63f094",
"hash_cont_tokens": "9ace296b3e00bba3"
},
"truncated": 0,
"non_truncated": 144,
"padded": 576,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_chemistry|5": {
"hashes": {
"hash_examples": "ce61a69c46d47aeb",
"hash_full_prompts": "a520ca0fd7868631",
"hash_input_tokens": "0049443634b997e3",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_computer_science|5": {
"hashes": {
"hash_examples": "32805b52d7d5daab",
"hash_full_prompts": "ae8f53adf4b6a6e3",
"hash_input_tokens": "894bbabad16b75a1",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_mathematics|5": {
"hashes": {
"hash_examples": "55da1a0a0bd33722",
"hash_full_prompts": "39cd3169534550f3",
"hash_input_tokens": "5bfda6d5c7af507c",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_medicine|5": {
"hashes": {
"hash_examples": "c33e143163049176",
"hash_full_prompts": "bca31c5d5f3a0e4a",
"hash_input_tokens": "13452a8f3d9b4b3d",
"hash_cont_tokens": "c80c0b5489bdbc5a"
},
"truncated": 0,
"non_truncated": 173,
"padded": 692,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_physics|5": {
"hashes": {
"hash_examples": "ebdab1cdb7e555df",
"hash_full_prompts": "f819d74029f4a018",
"hash_input_tokens": "57c45bd30a378407",
"hash_cont_tokens": "569fcb9ac44734ae"
},
"truncated": 0,
"non_truncated": 102,
"padded": 408,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:computer_security|5": {
"hashes": {
"hash_examples": "a24fd7d08a560921",
"hash_full_prompts": "d0f4d31508009cd6",
"hash_input_tokens": "0af9499b3cb67d95",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:conceptual_physics|5": {
"hashes": {
"hash_examples": "8300977a79386993",
"hash_full_prompts": "6e2f619c2f0da087",
"hash_input_tokens": "00b0c9ac0fc683e8",
"hash_cont_tokens": "6e88c64c1a76752a"
},
"truncated": 0,
"non_truncated": 235,
"padded": 940,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:econometrics|5": {
"hashes": {
"hash_examples": "ddde36788a04a46f",
"hash_full_prompts": "3f81ad69c49e1691",
"hash_input_tokens": "9314d720a35c62b6",
"hash_cont_tokens": "a315e0e16c922c3c"
},
"truncated": 0,
"non_truncated": 114,
"padded": 456,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:electrical_engineering|5": {
"hashes": {
"hash_examples": "acbc5def98c19b3f",
"hash_full_prompts": "f5ab31c3b1d51682",
"hash_input_tokens": "863125c49d60d6a4",
"hash_cont_tokens": "44c72e6a7422c304"
},
"truncated": 0,
"non_truncated": 145,
"padded": 580,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:elementary_mathematics|5": {
"hashes": {
"hash_examples": "146e61d07497a9bd",
"hash_full_prompts": "3e6f38a631108730",
"hash_input_tokens": "ed58bf384a932c74",
"hash_cont_tokens": "cac0a6c304791bb7"
},
"truncated": 0,
"non_truncated": 378,
"padded": 1512,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:formal_logic|5": {
"hashes": {
"hash_examples": "8635216e1909a03f",
"hash_full_prompts": "2db73981fed3cf02",
"hash_input_tokens": "78b4957033a990a3",
"hash_cont_tokens": "8801fad3bbc72e57"
},
"truncated": 0,
"non_truncated": 126,
"padded": 504,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:global_facts|5": {
"hashes": {
"hash_examples": "30b315aa6353ee47",
"hash_full_prompts": "3b5eef82483c02a6",
"hash_input_tokens": "65cf7f73e20e1bc1",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_biology|5": {
"hashes": {
"hash_examples": "c9136373af2180de",
"hash_full_prompts": "97a500550ada1104",
"hash_input_tokens": "1c299ee1038cf043",
"hash_cont_tokens": "2d57d9e2c5a1fd64"
},
"truncated": 0,
"non_truncated": 310,
"padded": 1240,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_chemistry|5": {
"hashes": {
"hash_examples": "b0661bfa1add6404",
"hash_full_prompts": "7d42623066fb1e8e",
"hash_input_tokens": "38aa4f175383a891",
"hash_cont_tokens": "bb0fd92673ddfb31"
},
"truncated": 0,
"non_truncated": 203,
"padded": 812,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_computer_science|5": {
"hashes": {
"hash_examples": "80fc1d623a3d665f",
"hash_full_prompts": "2af192ae1faf8c63",
"hash_input_tokens": "5a1229c044a91023",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_european_history|5": {
"hashes": {
"hash_examples": "854da6e5af0fe1a1",
"hash_full_prompts": "189af6182c551e23",
"hash_input_tokens": "f0e54538395a12c1",
"hash_cont_tokens": "16e494cddccc4a04"
},
"truncated": 0,
"non_truncated": 165,
"padded": 656,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_geography|5": {
"hashes": {
"hash_examples": "7dc963c7acd19ad8",
"hash_full_prompts": "0906f591b7f79a10",
"hash_input_tokens": "40aceb5dde64fe64",
"hash_cont_tokens": "16b7f65a07b3d47b"
},
"truncated": 0,
"non_truncated": 198,
"padded": 792,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"hashes": {
"hash_examples": "1f675dcdebc9758f",
"hash_full_prompts": "7223a4aebabcdcbd",
"hash_input_tokens": "96a4444be05f5ede",
"hash_cont_tokens": "476e87fd675136aa"
},
"truncated": 0,
"non_truncated": 193,
"padded": 772,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"hashes": {
"hash_examples": "2fb32cf2d80f0b35",
"hash_full_prompts": "9c32c005a808c453",
"hash_input_tokens": "a78ba4100d84ecc5",
"hash_cont_tokens": "b0c7b4c5f7bdf3e7"
},
"truncated": 0,
"non_truncated": 390,
"padded": 1560,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_mathematics|5": {
"hashes": {
"hash_examples": "fd6646fdb5d58a1f",
"hash_full_prompts": "61845b4e3d0eafe9",
"hash_input_tokens": "72e903543d60e864",
"hash_cont_tokens": "1a05d6ff49846fd1"
},
"truncated": 0,
"non_truncated": 270,
"padded": 1080,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"hashes": {
"hash_examples": "2118f21f71d87d84",
"hash_full_prompts": "020f7f6e77a6b641",
"hash_input_tokens": "8b428c95ab32cdeb",
"hash_cont_tokens": "0e7f0645ffffd6cd"
},
"truncated": 0,
"non_truncated": 238,
"padded": 949,
"non_padded": 3,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_physics|5": {
"hashes": {
"hash_examples": "dc3ce06378548565",
"hash_full_prompts": "571b28c0f53b90a0",
"hash_input_tokens": "0862d9ba4184f5e6",
"hash_cont_tokens": "41ca6560b8c10183"
},
"truncated": 0,
"non_truncated": 151,
"padded": 604,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_psychology|5": {
"hashes": {
"hash_examples": "c8d1d98a40e11f2f",
"hash_full_prompts": "896e9a19476b90ed",
"hash_input_tokens": "539679e51cf0dadf",
"hash_cont_tokens": "53a17ff85c607844"
},
"truncated": 0,
"non_truncated": 545,
"padded": 2178,
"non_padded": 2,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_statistics|5": {
"hashes": {
"hash_examples": "666c8759b98ee4ff",
"hash_full_prompts": "9ca986b471235e07",
"hash_input_tokens": "d2df2e9ec9cc5ff9",
"hash_cont_tokens": "bc9063ad140cc941"
},
"truncated": 0,
"non_truncated": 216,
"padded": 864,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_us_history|5": {
"hashes": {
"hash_examples": "95fef1c4b7d3f81e",
"hash_full_prompts": "b4616b587c96945d",
"hash_input_tokens": "1b9a891fe1e28335",
"hash_cont_tokens": "5cf777085ba01096"
},
"truncated": 0,
"non_truncated": 204,
"padded": 816,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_world_history|5": {
"hashes": {
"hash_examples": "7e5085b6184b0322",
"hash_full_prompts": "e790690fb05fa0d1",
"hash_input_tokens": "60fc90341eab6ac2",
"hash_cont_tokens": "152af2d9e4830517"
},
"truncated": 0,
"non_truncated": 237,
"padded": 948,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_aging|5": {
"hashes": {
"hash_examples": "c17333e7c7c10797",
"hash_full_prompts": "327f9f213650f977",
"hash_input_tokens": "3527cd9b1efd6b7c",
"hash_cont_tokens": "da4d9eaa044021dd"
},
"truncated": 0,
"non_truncated": 223,
"padded": 892,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_sexuality|5": {
"hashes": {
"hash_examples": "4edd1e9045df5e3d",
"hash_full_prompts": "0b6a52b3d3863745",
"hash_input_tokens": "7a97714c98ec3df0",
"hash_cont_tokens": "1b99e384258a4eeb"
},
"truncated": 0,
"non_truncated": 131,
"padded": 524,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:international_law|5": {
"hashes": {
"hash_examples": "db2fa00d771a062a",
"hash_full_prompts": "429b8d84640cdf75",
"hash_input_tokens": "7e572d7ea1a3e509",
"hash_cont_tokens": "cbf02c30cdded208"
},
"truncated": 0,
"non_truncated": 121,
"padded": 484,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:jurisprudence|5": {
"hashes": {
"hash_examples": "e956f86b124076fe",
"hash_full_prompts": "571f9505d9f6fa3d",
"hash_input_tokens": "e771bba2041d48e1",
"hash_cont_tokens": "4b248cf879d97a50"
},
"truncated": 0,
"non_truncated": 108,
"padded": 424,
"non_padded": 8,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:logical_fallacies|5": {
"hashes": {
"hash_examples": "956e0e6365ab79f1",
"hash_full_prompts": "abf6d18a0245c552",
"hash_input_tokens": "7016f4de62d61e8f",
"hash_cont_tokens": "6d9c35172b158838"
},
"truncated": 0,
"non_truncated": 163,
"padded": 632,
"non_padded": 20,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:machine_learning|5": {
"hashes": {
"hash_examples": "397997cc6f4d581e",
"hash_full_prompts": "8b9115560a815fab",
"hash_input_tokens": "a718bd4f9fb8eab0",
"hash_cont_tokens": "66c3ec85fee2fc98"
},
"truncated": 0,
"non_truncated": 112,
"padded": 448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:management|5": {
"hashes": {
"hash_examples": "2bcbe6f6ca63d740",
"hash_full_prompts": "f18191cecdc130be",
"hash_input_tokens": "dd6a99048a822e5a",
"hash_cont_tokens": "5e2470abd1fb9d10"
},
"truncated": 0,
"non_truncated": 103,
"padded": 412,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:marketing|5": {
"hashes": {
"hash_examples": "8ddb20d964a1b065",
"hash_full_prompts": "ad9ff50246bf7d49",
"hash_input_tokens": "fb59075fb468b035",
"hash_cont_tokens": "27fe68d9630f8999"
},
"truncated": 0,
"non_truncated": 234,
"padded": 916,
"non_padded": 20,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:medical_genetics|5": {
"hashes": {
"hash_examples": "182a71f4763d2cea",
"hash_full_prompts": "e95c568978da29c1",
"hash_input_tokens": "6ec76fde9dca6553",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:miscellaneous|5": {
"hashes": {
"hash_examples": "4c404fdbb4ca57fc",
"hash_full_prompts": "468305dc71aa217c",
"hash_input_tokens": "9ab5ce7430aeeff7",
"hash_cont_tokens": "dfa423a160edd337"
},
"truncated": 0,
"non_truncated": 783,
"padded": 3128,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_disputes|5": {
"hashes": {
"hash_examples": "60cbd2baa3fea5c9",
"hash_full_prompts": "7a24f9c6f83420f2",
"hash_input_tokens": "17712020d9c38d0f",
"hash_cont_tokens": "bef966e6669349be"
},
"truncated": 0,
"non_truncated": 346,
"padded": 1380,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_scenarios|5": {
"hashes": {
"hash_examples": "fd8b0431fbdd75ef",
"hash_full_prompts": "8723c262038898c8",
"hash_input_tokens": "a4a16b58339a1b08",
"hash_cont_tokens": "a7bfdd944d86bcb5"
},
"truncated": 0,
"non_truncated": 895,
"padded": 3575,
"non_padded": 5,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:nutrition|5": {
"hashes": {
"hash_examples": "71e55e2b829b6528",
"hash_full_prompts": "cc3034694d476c82",
"hash_input_tokens": "4589c74e55901b66",
"hash_cont_tokens": "fcda7736026f2449"
},
"truncated": 0,
"non_truncated": 306,
"padded": 1224,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:philosophy|5": {
"hashes": {
"hash_examples": "a6d489a8d208fa4b",
"hash_full_prompts": "d92988a447a6ce08",
"hash_input_tokens": "fa85837aaec1aef6",
"hash_cont_tokens": "0f39b851342e8986"
},
"truncated": 0,
"non_truncated": 311,
"padded": 1244,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:prehistory|5": {
"hashes": {
"hash_examples": "6cc50f032a19acaa",
"hash_full_prompts": "0d0d33c8f9bed861",
"hash_input_tokens": "735ed41425466729",
"hash_cont_tokens": "b60e45d3e9856b35"
},
"truncated": 0,
"non_truncated": 324,
"padded": 1280,
"non_padded": 16,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_accounting|5": {
"hashes": {
"hash_examples": "50f57ab32f5f6cea",
"hash_full_prompts": "9c809e7b8ca8ec1f",
"hash_input_tokens": "b0c851d675e5355b",
"hash_cont_tokens": "a0c4e121b7293818"
},
"truncated": 0,
"non_truncated": 282,
"padded": 1112,
"non_padded": 16,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_law|5": {
"hashes": {
"hash_examples": "a8fdc85c64f4b215",
"hash_full_prompts": "246b3e8a9054a5de",
"hash_input_tokens": "c27b16ef17f69218",
"hash_cont_tokens": "68b662abeba54fbc"
},
"truncated": 0,
"non_truncated": 1534,
"padded": 6136,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_medicine|5": {
"hashes": {
"hash_examples": "c373a28a3050a73a",
"hash_full_prompts": "f66dd653b5c5022b",
"hash_input_tokens": "955343929a6793cb",
"hash_cont_tokens": "6caeac5412bb4a09"
},
"truncated": 0,
"non_truncated": 272,
"padded": 1088,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_psychology|5": {
"hashes": {
"hash_examples": "bf5254fe818356af",
"hash_full_prompts": "03228f18e58fb42c",
"hash_input_tokens": "a18463f8187e4322",
"hash_cont_tokens": "79b091252a1095a9"
},
"truncated": 0,
"non_truncated": 612,
"padded": 2448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:public_relations|5": {
"hashes": {
"hash_examples": "b66d52e28e7d14e0",
"hash_full_prompts": "2717ec2f9cc3ea3f",
"hash_input_tokens": "3118fb19254356b8",
"hash_cont_tokens": "987115a77c8704f0"
},
"truncated": 0,
"non_truncated": 110,
"padded": 436,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:security_studies|5": {
"hashes": {
"hash_examples": "514c14feaf000ad9",
"hash_full_prompts": "fd10221b4be3bf11",
"hash_input_tokens": "619ae48b231f13d1",
"hash_cont_tokens": "6c35bc7e96074b27"
},
"truncated": 0,
"non_truncated": 245,
"padded": 980,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:sociology|5": {
"hashes": {
"hash_examples": "f6c9bc9d18c80870",
"hash_full_prompts": "16bc50365bda7e74",
"hash_input_tokens": "e77c9db987dfeede",
"hash_cont_tokens": "32af622f73b2e657"
},
"truncated": 0,
"non_truncated": 201,
"padded": 804,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:us_foreign_policy|5": {
"hashes": {
"hash_examples": "ed7b78629db6678f",
"hash_full_prompts": "249ca3f4999e41ad",
"hash_input_tokens": "0fa36661f20b1b58",
"hash_cont_tokens": "9e1c9ca2c51de57e"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:virology|5": {
"hashes": {
"hash_examples": "bc52ffdc3f9b994a",
"hash_full_prompts": "09939d976cecacd7",
"hash_input_tokens": "b8237a5fe3c03938",
"hash_cont_tokens": "beded8c3660dc8f5"
},
"truncated": 0,
"non_truncated": 166,
"padded": 664,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:world_religions|5": {
"hashes": {
"hash_examples": "ecdb4a4f94f62930",
"hash_full_prompts": "addabd4dc9734c08",
"hash_input_tokens": "23943b2941071751",
"hash_cont_tokens": "9b1952a4af3d6a73"
},
"truncated": 0,
"non_truncated": 171,
"padded": 684,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "341a076d0beb7048",
"hash_full_prompts": "11973fef11ba4c9d",
"hash_input_tokens": "0e9d676b8e37ef05",
"hash_cont_tokens": "25e9f343d6b95644"
},
"truncated": 0,
"non_truncated": 14042,
"padded": 56062,
"non_padded": 106,
"num_truncated_few_shots": 0
}
}
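
A minimal sketch of how one might consume this file, assuming it has been saved locally under its upload name (the path is a placeholder; any local copy works): load the JSON, recompute the macro-average accuracy over the 57 per-subtask entries in `results`, and compare against the reported `leaderboard|mmlu:_average|5` value. This is illustrative only, not part of the lighteval output itself.

```python
import json

# Placeholder path: point this at wherever the results JSON is saved locally.
with open("results_2024-03-28T14-01-38.404907.json") as f:
    data = json.load(f)

results = data["results"]

# Drop the precomputed average entry; everything else is a per-subtask result.
AVERAGE_KEY = "leaderboard|mmlu:_average|5"
subtasks = {k: v for k, v in results.items() if k != AVERAGE_KEY}

# Macro-average: unweighted mean of per-subtask accuracies.
mean_acc = sum(v["acc"] for v in subtasks.values()) / len(subtasks)

print(f"{len(subtasks)} subtasks, recomputed macro-average acc = {mean_acc:.4f}")
print(f"reported average acc = {results[AVERAGE_KEY]['acc']:.4f}")
```

Note that this reproduces the unweighted mean over subtasks that the file reports; a micro-average weighted by each subtask's `effective_num_docs` (from `config_tasks` or `summary_tasks`) would give a different number, since subtask sizes range from 100 to 1534 documents.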