Upload eval_results/abhishek/autotrain-mixtral-8x7b-orpo-v1/main/mmlu/results_2024-05-01T18-50-41.016030.json with huggingface_hub
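The rest of this file is the raw lighteval report: "config_general" holds the run metadata, "results" holds per-subset 5-shot loglikelihood accuracy with standard errors (the "all" entry duplicates the unweighted "leaderboard|mmlu:_average|5" mean), and "versions", "config_tasks", and "summary_tasks" record task configs and prompt/token hashes. A minimal sketch of how one might summarize it, assuming the JSON below has been saved locally as results.json (hypothetical filename):

    import json

    # Load the lighteval report (assumed saved as "results.json" -- hypothetical name).
    with open("results.json") as f:
        report = json.load(f)

    results = report["results"]

    # Per-subject accuracy +/- stderr, weakest subjects first; skip the aggregates.
    per_task = {k: v for k, v in results.items() if k != "all" and "_average" not in k}
    for task, m in sorted(per_task.items(), key=lambda kv: kv[1]["acc"]):
        print(f"{task:60s} acc={m['acc']:.3f} +/- {m['acc_stderr']:.3f}")

    print(f"\nmacro-average accuracy: {results['all']['acc']:.4f}")

On this file that prints college_mathematics (0.340) and high_school_mathematics (0.363) at the bottom and marketing (0.932) at the top, with a macro-average of 0.6794.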
{
"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"override_batch_size": 4,
"max_samples": null,
"job_id": "",
"start_time": 133363.920262817,
"end_time": 136440.010917352,
"total_evaluation_time_secondes": "3076.0906545349862",
"model_name": "abhishek/autotrain-mixtral-8x7b-orpo-v1",
"model_sha": "a8be37cf01ad767a0c71e0ba3af29c0b3ebcb559",
"model_dtype": "torch.bfloat16",
"model_size": "87.49 GB",
"config": null
},
"results": {
"leaderboard|mmlu:abstract_algebra|5": {
"acc": 0.45,
"acc_stderr": 0.04999999999999999
},
"leaderboard|mmlu:anatomy|5": {
"acc": 0.6666666666666666,
"acc_stderr": 0.04072314811876837
},
"leaderboard|mmlu:astronomy|5": {
"acc": 0.7828947368421053,
"acc_stderr": 0.03355045304882924
},
"leaderboard|mmlu:business_ethics|5": {
"acc": 0.66,
"acc_stderr": 0.04760952285695237
},
"leaderboard|mmlu:clinical_knowledge|5": {
"acc": 0.7358490566037735,
"acc_stderr": 0.02713429162874171
},
"leaderboard|mmlu:college_biology|5": {
"acc": 0.7708333333333334,
"acc_stderr": 0.03514697467862388
},
"leaderboard|mmlu:college_chemistry|5": {
"acc": 0.52,
"acc_stderr": 0.050211673156867795
},
"leaderboard|mmlu:college_computer_science|5": {
"acc": 0.62,
"acc_stderr": 0.04878317312145633
},
"leaderboard|mmlu:college_mathematics|5": {
"acc": 0.34,
"acc_stderr": 0.04760952285695235
},
"leaderboard|mmlu:college_medicine|5": {
"acc": 0.6763005780346821,
"acc_stderr": 0.0356760379963917
},
"leaderboard|mmlu:college_physics|5": {
"acc": 0.4215686274509804,
"acc_stderr": 0.049135952012744975
},
"leaderboard|mmlu:computer_security|5": {
"acc": 0.78,
"acc_stderr": 0.04163331998932261
},
"leaderboard|mmlu:conceptual_physics|5": {
"acc": 0.6340425531914894,
"acc_stderr": 0.0314895582974553
},
"leaderboard|mmlu:econometrics|5": {
"acc": 0.6052631578947368,
"acc_stderr": 0.04598188057816543
},
"leaderboard|mmlu:electrical_engineering|5": {
"acc": 0.6482758620689655,
"acc_stderr": 0.039792366374974096
},
"leaderboard|mmlu:elementary_mathematics|5": {
"acc": 0.46296296296296297,
"acc_stderr": 0.025680564640056882
},
"leaderboard|mmlu:formal_logic|5": {
"acc": 0.48412698412698413,
"acc_stderr": 0.04469881854072606
},
"leaderboard|mmlu:global_facts|5": {
"acc": 0.44,
"acc_stderr": 0.04988876515698589
},
"leaderboard|mmlu:high_school_biology|5": {
"acc": 0.8290322580645161,
"acc_stderr": 0.021417242936321582
},
"leaderboard|mmlu:high_school_chemistry|5": {
"acc": 0.5517241379310345,
"acc_stderr": 0.034991131376767445
},
"leaderboard|mmlu:high_school_computer_science|5": {
"acc": 0.71,
"acc_stderr": 0.045604802157206845
},
"leaderboard|mmlu:high_school_european_history|5": {
"acc": 0.8,
"acc_stderr": 0.031234752377721175
},
"leaderboard|mmlu:high_school_geography|5": {
"acc": 0.8535353535353535,
"acc_stderr": 0.025190921114603918
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"acc": 0.927461139896373,
"acc_stderr": 0.018718998520678185
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"acc": 0.6923076923076923,
"acc_stderr": 0.02340092891831049
},
"leaderboard|mmlu:high_school_mathematics|5": {
"acc": 0.362962962962963,
"acc_stderr": 0.02931820364520686
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"acc": 0.7521008403361344,
"acc_stderr": 0.028047967224176896
},
"leaderboard|mmlu:high_school_physics|5": {
"acc": 0.45695364238410596,
"acc_stderr": 0.04067325174247443
},
"leaderboard|mmlu:high_school_psychology|5": {
"acc": 0.8605504587155963,
"acc_stderr": 0.014852421490033055
},
"leaderboard|mmlu:high_school_statistics|5": {
"acc": 0.5555555555555556,
"acc_stderr": 0.03388857118502325
},
"leaderboard|mmlu:high_school_us_history|5": {
"acc": 0.8431372549019608,
"acc_stderr": 0.025524722324553353
},
"leaderboard|mmlu:high_school_world_history|5": {
"acc": 0.8565400843881856,
"acc_stderr": 0.022818291821017012
},
"leaderboard|mmlu:human_aging|5": {
"acc": 0.7533632286995515,
"acc_stderr": 0.028930413120910884
},
"leaderboard|mmlu:human_sexuality|5": {
"acc": 0.7709923664122137,
"acc_stderr": 0.036853466317118506
},
"leaderboard|mmlu:international_law|5": {
"acc": 0.8677685950413223,
"acc_stderr": 0.030922788320445784
},
"leaderboard|mmlu:jurisprudence|5": {
"acc": 0.7870370370370371,
"acc_stderr": 0.0395783547198098
},
"leaderboard|mmlu:logical_fallacies|5": {
"acc": 0.7791411042944786,
"acc_stderr": 0.03259177392742178
},
"leaderboard|mmlu:machine_learning|5": {
"acc": 0.5625,
"acc_stderr": 0.04708567521880525
},
"leaderboard|mmlu:management|5": {
"acc": 0.8349514563106796,
"acc_stderr": 0.036756688322331886
},
"leaderboard|mmlu:marketing|5": {
"acc": 0.9316239316239316,
"acc_stderr": 0.01653462768431136
},
"leaderboard|mmlu:medical_genetics|5": {
"acc": 0.76,
"acc_stderr": 0.04292346959909282
},
"leaderboard|mmlu:miscellaneous|5": {
"acc": 0.8467432950191571,
"acc_stderr": 0.012881968968303275
},
"leaderboard|mmlu:moral_disputes|5": {
"acc": 0.7254335260115607,
"acc_stderr": 0.02402774515526502
},
"leaderboard|mmlu:moral_scenarios|5": {
"acc": 0.45139664804469276,
"acc_stderr": 0.01664330737231586
},
"leaderboard|mmlu:nutrition|5": {
"acc": 0.7712418300653595,
"acc_stderr": 0.024051029739912248
},
"leaderboard|mmlu:philosophy|5": {
"acc": 0.729903536977492,
"acc_stderr": 0.025218040373410633
},
"leaderboard|mmlu:prehistory|5": {
"acc": 0.7932098765432098,
"acc_stderr": 0.022535006705942835
},
"leaderboard|mmlu:professional_accounting|5": {
"acc": 0.4929078014184397,
"acc_stderr": 0.02982449855912901
},
"leaderboard|mmlu:professional_law|5": {
"acc": 0.5039113428943938,
"acc_stderr": 0.0127698453664412
},
"leaderboard|mmlu:professional_medicine|5": {
"acc": 0.7536764705882353,
"acc_stderr": 0.02617343857052
},
"leaderboard|mmlu:professional_psychology|5": {
"acc": 0.36764705882352944,
"acc_stderr": 0.019506291693954847
},
"leaderboard|mmlu:public_relations|5": {
"acc": 0.6818181818181818,
"acc_stderr": 0.04461272175910508
},
"leaderboard|mmlu:security_studies|5": {
"acc": 0.7551020408163265,
"acc_stderr": 0.027529637440174923
},
"leaderboard|mmlu:sociology|5": {
"acc": 0.8507462686567164,
"acc_stderr": 0.025196929874827058
},
"leaderboard|mmlu:us_foreign_policy|5": {
"acc": 0.87,
"acc_stderr": 0.03379976689896309
},
"leaderboard|mmlu:virology|5": {
"acc": 0.5120481927710844,
"acc_stderr": 0.03891364495835816
},
"leaderboard|mmlu:world_religions|5": {
"acc": 0.8245614035087719,
"acc_stderr": 0.02917088550072767
},
"leaderboard|mmlu:_average|5": {
"acc": 0.6794451069040791,
"acc_stderr": 0.03272737273781944
},
"all": {
"acc": 0.6794451069040791,
"acc_stderr": 0.03272737273781944
}
},
"versions": {
"leaderboard|mmlu:abstract_algebra|5": 0,
"leaderboard|mmlu:anatomy|5": 0,
"leaderboard|mmlu:astronomy|5": 0,
"leaderboard|mmlu:business_ethics|5": 0,
"leaderboard|mmlu:clinical_knowledge|5": 0,
"leaderboard|mmlu:college_biology|5": 0,
"leaderboard|mmlu:college_chemistry|5": 0,
"leaderboard|mmlu:college_computer_science|5": 0,
"leaderboard|mmlu:college_mathematics|5": 0,
"leaderboard|mmlu:college_medicine|5": 0,
"leaderboard|mmlu:college_physics|5": 0,
"leaderboard|mmlu:computer_security|5": 0,
"leaderboard|mmlu:conceptual_physics|5": 0,
"leaderboard|mmlu:econometrics|5": 0,
"leaderboard|mmlu:electrical_engineering|5": 0,
"leaderboard|mmlu:elementary_mathematics|5": 0,
"leaderboard|mmlu:formal_logic|5": 0,
"leaderboard|mmlu:global_facts|5": 0,
"leaderboard|mmlu:high_school_biology|5": 0,
"leaderboard|mmlu:high_school_chemistry|5": 0,
"leaderboard|mmlu:high_school_computer_science|5": 0,
"leaderboard|mmlu:high_school_european_history|5": 0,
"leaderboard|mmlu:high_school_geography|5": 0,
"leaderboard|mmlu:high_school_government_and_politics|5": 0,
"leaderboard|mmlu:high_school_macroeconomics|5": 0,
"leaderboard|mmlu:high_school_mathematics|5": 0,
"leaderboard|mmlu:high_school_microeconomics|5": 0,
"leaderboard|mmlu:high_school_physics|5": 0,
"leaderboard|mmlu:high_school_psychology|5": 0,
"leaderboard|mmlu:high_school_statistics|5": 0,
"leaderboard|mmlu:high_school_us_history|5": 0,
"leaderboard|mmlu:high_school_world_history|5": 0,
"leaderboard|mmlu:human_aging|5": 0,
"leaderboard|mmlu:human_sexuality|5": 0,
"leaderboard|mmlu:international_law|5": 0,
"leaderboard|mmlu:jurisprudence|5": 0,
"leaderboard|mmlu:logical_fallacies|5": 0,
"leaderboard|mmlu:machine_learning|5": 0,
"leaderboard|mmlu:management|5": 0,
"leaderboard|mmlu:marketing|5": 0,
"leaderboard|mmlu:medical_genetics|5": 0,
"leaderboard|mmlu:miscellaneous|5": 0,
"leaderboard|mmlu:moral_disputes|5": 0,
"leaderboard|mmlu:moral_scenarios|5": 0,
"leaderboard|mmlu:nutrition|5": 0,
"leaderboard|mmlu:philosophy|5": 0,
"leaderboard|mmlu:prehistory|5": 0,
"leaderboard|mmlu:professional_accounting|5": 0,
"leaderboard|mmlu:professional_law|5": 0,
"leaderboard|mmlu:professional_medicine|5": 0,
"leaderboard|mmlu:professional_psychology|5": 0,
"leaderboard|mmlu:public_relations|5": 0,
"leaderboard|mmlu:security_studies|5": 0,
"leaderboard|mmlu:sociology|5": 0,
"leaderboard|mmlu:us_foreign_policy|5": 0,
"leaderboard|mmlu:virology|5": 0,
"leaderboard|mmlu:world_religions|5": 0
},
"config_tasks": {
"leaderboard|mmlu:abstract_algebra": {
"name": "mmlu:abstract_algebra",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "abstract_algebra",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:anatomy": {
"name": "mmlu:anatomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "anatomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 135,
"effective_num_docs": 135,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:astronomy": {
"name": "mmlu:astronomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "astronomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 152,
"effective_num_docs": 152,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:business_ethics": {
"name": "mmlu:business_ethics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "business_ethics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:clinical_knowledge": {
"name": "mmlu:clinical_knowledge",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "clinical_knowledge",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 265,
"effective_num_docs": 265,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_biology": {
"name": "mmlu:college_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 144,
"effective_num_docs": 144,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_chemistry": {
"name": "mmlu:college_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_computer_science": {
"name": "mmlu:college_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_mathematics": {
"name": "mmlu:college_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_medicine": {
"name": "mmlu:college_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 173,
"effective_num_docs": 173,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_physics": {
"name": "mmlu:college_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 102,
"effective_num_docs": 102,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:computer_security": {
"name": "mmlu:computer_security",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "computer_security",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:conceptual_physics": {
"name": "mmlu:conceptual_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "conceptual_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 235,
"effective_num_docs": 235,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:econometrics": {
"name": "mmlu:econometrics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "econometrics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 114,
"effective_num_docs": 114,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:electrical_engineering": {
"name": "mmlu:electrical_engineering",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "electrical_engineering",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 145,
"effective_num_docs": 145,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:elementary_mathematics": {
"name": "mmlu:elementary_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "elementary_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 378,
"effective_num_docs": 378,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:formal_logic": {
"name": "mmlu:formal_logic",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "formal_logic",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 126,
"effective_num_docs": 126,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:global_facts": {
"name": "mmlu:global_facts",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "global_facts",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_biology": {
"name": "mmlu:high_school_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 310,
"effective_num_docs": 310,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_chemistry": {
"name": "mmlu:high_school_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 203,
"effective_num_docs": 203,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_computer_science": {
"name": "mmlu:high_school_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_european_history": {
"name": "mmlu:high_school_european_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_european_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 165,
"effective_num_docs": 165,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_geography": {
"name": "mmlu:high_school_geography",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_geography",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 198,
"effective_num_docs": 198,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_government_and_politics": {
"name": "mmlu:high_school_government_and_politics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_government_and_politics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 193,
"effective_num_docs": 193,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_macroeconomics": {
"name": "mmlu:high_school_macroeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_macroeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 390,
"effective_num_docs": 390,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_mathematics": {
"name": "mmlu:high_school_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 270,
"effective_num_docs": 270,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_microeconomics": {
"name": "mmlu:high_school_microeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_microeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 238,
"effective_num_docs": 238,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_physics": {
"name": "mmlu:high_school_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 151,
"effective_num_docs": 151,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_psychology": {
"name": "mmlu:high_school_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 545,
"effective_num_docs": 545,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_statistics": {
"name": "mmlu:high_school_statistics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_statistics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 216,
"effective_num_docs": 216,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_us_history": {
"name": "mmlu:high_school_us_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_us_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 204,
"effective_num_docs": 204,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_world_history": {
"name": "mmlu:high_school_world_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_world_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 237,
"effective_num_docs": 237,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:human_aging": {
"name": "mmlu:human_aging",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_aging",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 223,
"effective_num_docs": 223,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:human_sexuality": {
"name": "mmlu:human_sexuality",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_sexuality",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 131,
"effective_num_docs": 131,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:international_law": {
"name": "mmlu:international_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "international_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 121,
"effective_num_docs": 121,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:jurisprudence": {
"name": "mmlu:jurisprudence",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "jurisprudence",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 108,
"effective_num_docs": 108,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:logical_fallacies": {
"name": "mmlu:logical_fallacies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "logical_fallacies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 163,
"effective_num_docs": 163,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:machine_learning": {
"name": "mmlu:machine_learning",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "machine_learning",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 112,
"effective_num_docs": 112,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:management": {
"name": "mmlu:management",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "management",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 103,
"effective_num_docs": 103,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:marketing": {
"name": "mmlu:marketing",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "marketing",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 234,
"effective_num_docs": 234,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:medical_genetics": {
"name": "mmlu:medical_genetics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "medical_genetics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:miscellaneous": {
"name": "mmlu:miscellaneous",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "miscellaneous",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 783,
"effective_num_docs": 783,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:moral_disputes": {
"name": "mmlu:moral_disputes",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_disputes",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 346,
"effective_num_docs": 346,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:moral_scenarios": {
"name": "mmlu:moral_scenarios",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_scenarios",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 895,
"effective_num_docs": 895,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:nutrition": {
"name": "mmlu:nutrition",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "nutrition",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 306,
"effective_num_docs": 306,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:philosophy": {
"name": "mmlu:philosophy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "philosophy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 311,
"effective_num_docs": 311,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:prehistory": {
"name": "mmlu:prehistory",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "prehistory",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 324,
"effective_num_docs": 324,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_accounting": {
"name": "mmlu:professional_accounting",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_accounting",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 282,
"effective_num_docs": 282,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_law": {
"name": "mmlu:professional_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 1534,
"effective_num_docs": 1534,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_medicine": {
"name": "mmlu:professional_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 272,
"effective_num_docs": 272,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_psychology": {
"name": "mmlu:professional_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 612,
"effective_num_docs": 612,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:public_relations": {
"name": "mmlu:public_relations",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "public_relations",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 110,
"effective_num_docs": 110,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:security_studies": {
"name": "mmlu:security_studies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "security_studies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 245,
"effective_num_docs": 245,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:sociology": {
"name": "mmlu:sociology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "sociology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 201,
"effective_num_docs": 201,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:us_foreign_policy": {
"name": "mmlu:us_foreign_policy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "us_foreign_policy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:virology": {
"name": "mmlu:virology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "virology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 166,
"effective_num_docs": 166,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:world_religions": {
"name": "mmlu:world_religions",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "world_religions",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 171,
"effective_num_docs": 171,
"trust_dataset": true,
"must_remove_duplicate_docs": null
}
},
"summary_tasks": {
"leaderboard|mmlu:abstract_algebra|5": {
"hashes": {
"hash_examples": "4c76229e00c9c0e9",
"hash_full_prompts": "a45d01c3409c889c",
"hash_input_tokens": "0fe5779bbfd39458",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:anatomy|5": {
"hashes": {
"hash_examples": "6a1f8104dccbd33b",
"hash_full_prompts": "e245c6600e03cc32",
"hash_input_tokens": "6985602b3df0fdf2",
"hash_cont_tokens": "4020fc250ba8855e"
},
"truncated": 0,
"non_truncated": 135,
"padded": 540,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:astronomy|5": {
"hashes": {
"hash_examples": "1302effa3a76ce4c",
"hash_full_prompts": "390f9bddf857ad04",
"hash_input_tokens": "9f47aa4a827f09ca",
"hash_cont_tokens": "2e19ba0f9d464ec7"
},
"truncated": 0,
"non_truncated": 152,
"padded": 608,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:business_ethics|5": {
"hashes": {
"hash_examples": "03cb8bce5336419a",
"hash_full_prompts": "5504f893bc4f2fa1",
"hash_input_tokens": "be1ff3eeae3168ca",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:clinical_knowledge|5": {
"hashes": {
"hash_examples": "ffbb9c7b2be257f9",
"hash_full_prompts": "106ad0bab4b90b78",
"hash_input_tokens": "281e8a4124636628",
"hash_cont_tokens": "6e942fe2858712ae"
},
"truncated": 0,
"non_truncated": 265,
"padded": 1060,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_biology|5": {
"hashes": {
"hash_examples": "3ee77f176f38eb8e",
"hash_full_prompts": "59f9bdf2695cb226",
"hash_input_tokens": "5f1c618e37182983",
"hash_cont_tokens": "750cf0dfeff046c4"
},
"truncated": 0,
"non_truncated": 144,
"padded": 576,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_chemistry|5": {
"hashes": {
"hash_examples": "ce61a69c46d47aeb",
"hash_full_prompts": "3cac9b759fcff7a0",
"hash_input_tokens": "7716b78a5b2c7766",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_computer_science|5": {
"hashes": {
"hash_examples": "32805b52d7d5daab",
"hash_full_prompts": "010b0cca35070130",
"hash_input_tokens": "2eaf06d29b70feec",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_mathematics|5": {
"hashes": {
"hash_examples": "55da1a0a0bd33722",
"hash_full_prompts": "511422eb9eefc773",
"hash_input_tokens": "2e9212af94cf016b",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_medicine|5": {
"hashes": {
"hash_examples": "c33e143163049176",
"hash_full_prompts": "c8cc1a82a51a046e",
"hash_input_tokens": "1cf3bd162e71ec93",
"hash_cont_tokens": "de458bd9f6f4c1e8"
},
"truncated": 0,
"non_truncated": 173,
"padded": 692,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_physics|5": {
"hashes": {
"hash_examples": "ebdab1cdb7e555df",
"hash_full_prompts": "e40721b5059c5818",
"hash_input_tokens": "8a3b0f963fd18269",
"hash_cont_tokens": "3ec87f548a37bddc"
},
"truncated": 0,
"non_truncated": 102,
"padded": 408,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:computer_security|5": {
"hashes": {
"hash_examples": "a24fd7d08a560921",
"hash_full_prompts": "946c9be5964ac44a",
"hash_input_tokens": "4d527e954909d404",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:conceptual_physics|5": {
"hashes": {
"hash_examples": "8300977a79386993",
"hash_full_prompts": "506a4f6094cc40c9",
"hash_input_tokens": "78097767b921e219",
"hash_cont_tokens": "13d792c5220dc0e8"
},
"truncated": 0,
"non_truncated": 235,
"padded": 940,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:econometrics|5": {
"hashes": {
"hash_examples": "ddde36788a04a46f",
"hash_full_prompts": "4ed2703f27f1ed05",
"hash_input_tokens": "75170aedf177c885",
"hash_cont_tokens": "5cd59218b163ddfd"
},
"truncated": 0,
"non_truncated": 114,
"padded": 456,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:electrical_engineering|5": {
"hashes": {
"hash_examples": "acbc5def98c19b3f",
"hash_full_prompts": "d8f4b3e11c23653c",
"hash_input_tokens": "62f8d9c4ee3b4ba3",
"hash_cont_tokens": "9287b3a50a11bdba"
},
"truncated": 0,
"non_truncated": 145,
"padded": 580,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:elementary_mathematics|5": {
"hashes": {
"hash_examples": "146e61d07497a9bd",
"hash_full_prompts": "256d111bd15647ff",
"hash_input_tokens": "0ec6d03f7194631b",
"hash_cont_tokens": "23a8931ce3aa84c9"
},
"truncated": 0,
"non_truncated": 378,
"padded": 1512,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:formal_logic|5": {
"hashes": {
"hash_examples": "8635216e1909a03f",
"hash_full_prompts": "1171d04f3b1a11f5",
"hash_input_tokens": "73487a28bc4960a8",
"hash_cont_tokens": "d3b0643c11a8cc1b"
},
"truncated": 0,
"non_truncated": 126,
"padded": 504,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:global_facts|5": {
"hashes": {
"hash_examples": "30b315aa6353ee47",
"hash_full_prompts": "a7e56dbc074c7529",
"hash_input_tokens": "388c66d3a197fa0a",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_biology|5": {
"hashes": {
"hash_examples": "c9136373af2180de",
"hash_full_prompts": "ad6e859ed978e04a",
"hash_input_tokens": "9a7dcf6401f12c9a",
"hash_cont_tokens": "9d90477d239cc6d8"
},
"truncated": 0,
"non_truncated": 310,
"padded": 1240,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_chemistry|5": {
"hashes": {
"hash_examples": "b0661bfa1add6404",
"hash_full_prompts": "6eb9c04bcc8a8f2a",
"hash_input_tokens": "95a5f5cb0ad4ae51",
"hash_cont_tokens": "d518689c1577a5bb"
},
"truncated": 0,
"non_truncated": 203,
"padded": 812,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_computer_science|5": {
"hashes": {
"hash_examples": "80fc1d623a3d665f",
"hash_full_prompts": "8e51bc91c81cf8dd",
"hash_input_tokens": "334bb7ac1e7f058c",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_european_history|5": {
"hashes": {
"hash_examples": "854da6e5af0fe1a1",
"hash_full_prompts": "664a1f16c9f3195c",
"hash_input_tokens": "fe5ac2f30a47b01e",
"hash_cont_tokens": "f27fd41b64bb6c6d"
},
"truncated": 0,
"non_truncated": 165,
"padded": 656,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_geography|5": {
"hashes": {
"hash_examples": "7dc963c7acd19ad8",
"hash_full_prompts": "f3acf911f4023c8a",
"hash_input_tokens": "dbb8fb6fa1921225",
"hash_cont_tokens": "88bb0ab56be2e694"
},
"truncated": 0,
"non_truncated": 198,
"padded": 792,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"hashes": {
"hash_examples": "1f675dcdebc9758f",
"hash_full_prompts": "066254feaa3158ae",
"hash_input_tokens": "b5cd164a3689a010",
"hash_cont_tokens": "f6854f1bb4b558c1"
},
"truncated": 0,
"non_truncated": 193,
"padded": 772,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"hashes": {
"hash_examples": "2fb32cf2d80f0b35",
"hash_full_prompts": "19a7fa502aa85c95",
"hash_input_tokens": "e5af1d29ec1375ef",
"hash_cont_tokens": "6c8e0dc09bb99e37"
},
"truncated": 0,
"non_truncated": 390,
"padded": 1560,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_mathematics|5": {
"hashes": {
"hash_examples": "fd6646fdb5d58a1f",
"hash_full_prompts": "4f704e369778b5b0",
"hash_input_tokens": "6a77d1eeaaa13f88",
"hash_cont_tokens": "6feef2732c1b2d4c"
},
"truncated": 0,
"non_truncated": 270,
"padded": 1080,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"hashes": {
"hash_examples": "2118f21f71d87d84",
"hash_full_prompts": "4350f9e2240f8010",
"hash_input_tokens": "df8ba3a19ec61286",
"hash_cont_tokens": "f9dfc942b16f5267"
},
"truncated": 0,
"non_truncated": 238,
"padded": 949,
"non_padded": 3,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_physics|5": {
"hashes": {
"hash_examples": "dc3ce06378548565",
"hash_full_prompts": "5dc0d6831b66188f",
"hash_input_tokens": "c2f89913d26b3804",
"hash_cont_tokens": "c9b6fb68f1119c6c"
},
"truncated": 0,
"non_truncated": 151,
"padded": 604,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_psychology|5": {
"hashes": {
"hash_examples": "c8d1d98a40e11f2f",
"hash_full_prompts": "af2b097da6d50365",
"hash_input_tokens": "fca84bcc94a0f457",
"hash_cont_tokens": "5024b2446e7f0d51"
},
"truncated": 0,
"non_truncated": 545,
"padded": 2178,
"non_padded": 2,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_statistics|5": {
"hashes": {
"hash_examples": "666c8759b98ee4ff",
"hash_full_prompts": "c757694421d6d68d",
"hash_input_tokens": "8f6a5c418a13d2fb",
"hash_cont_tokens": "2bb63458482cea04"
},
"truncated": 0,
"non_truncated": 216,
"padded": 864,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_us_history|5": {
"hashes": {
"hash_examples": "95fef1c4b7d3f81e",
"hash_full_prompts": "e34a028d0ddeec5e",
"hash_input_tokens": "6668b57a661aafc5",
"hash_cont_tokens": "5666f3f217d4332c"
},
"truncated": 0,
"non_truncated": 204,
"padded": 816,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_world_history|5": {
"hashes": {
"hash_examples": "7e5085b6184b0322",
"hash_full_prompts": "1fa3d51392765601",
"hash_input_tokens": "73aeb6bb3ae4d1e9",
"hash_cont_tokens": "dd7e50c17b54c08f"
},
"truncated": 0,
"non_truncated": 237,
"padded": 948,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_aging|5": {
"hashes": {
"hash_examples": "c17333e7c7c10797",
"hash_full_prompts": "cac900721f9a1a94",
"hash_input_tokens": "b03c5226fb519d7a",
"hash_cont_tokens": "66bb7b523dbd018d"
},
"truncated": 0,
"non_truncated": 223,
"padded": 892,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_sexuality|5": {
"hashes": {
"hash_examples": "4edd1e9045df5e3d",
"hash_full_prompts": "0d6567bafee0a13c",
"hash_input_tokens": "062d0f125fea9343",
"hash_cont_tokens": "9c12fbd8915b29d8"
},
"truncated": 0,
"non_truncated": 131,
"padded": 524,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:international_law|5": {
"hashes": {
"hash_examples": "db2fa00d771a062a",
"hash_full_prompts": "d018f9116479795e",
"hash_input_tokens": "af37ce0e58de6237",
"hash_cont_tokens": "939bc6141fbc2edf"
},
"truncated": 0,
"non_truncated": 121,
"padded": 484,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:jurisprudence|5": {
"hashes": {
"hash_examples": "e956f86b124076fe",
"hash_full_prompts": "1487e89a10ec58b7",
"hash_input_tokens": "49b4ea770c987d32",
"hash_cont_tokens": "0ec098526f036a8a"
},
"truncated": 0,
"non_truncated": 108,
"padded": 424,
"non_padded": 8,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:logical_fallacies|5": {
"hashes": {
"hash_examples": "956e0e6365ab79f1",
"hash_full_prompts": "677785b2181f9243",
"hash_input_tokens": "dd09600885a64129",
"hash_cont_tokens": "22cadb0152a35b33"
},
"truncated": 0,
"non_truncated": 163,
"padded": 632,
"non_padded": 20,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:machine_learning|5": {
"hashes": {
"hash_examples": "397997cc6f4d581e",
"hash_full_prompts": "769ee14a2aea49bb",
"hash_input_tokens": "de15e01f132712a1",
"hash_cont_tokens": "ba0c03916a4f8962"
},
"truncated": 0,
"non_truncated": 112,
"padded": 448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:management|5": {
"hashes": {
"hash_examples": "2bcbe6f6ca63d740",
"hash_full_prompts": "cb1ff9dac9582144",
"hash_input_tokens": "0bdd13c3e253c084",
"hash_cont_tokens": "a0189ce8f55ad1bc"
},
"truncated": 0,
"non_truncated": 103,
"padded": 412,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:marketing|5": {
"hashes": {
"hash_examples": "8ddb20d964a1b065",
"hash_full_prompts": "9fc2114a187ad9a2",
"hash_input_tokens": "11542dea73278c57",
"hash_cont_tokens": "7c475ce17ba7f995"
},
"truncated": 0,
"non_truncated": 234,
"padded": 916,
"non_padded": 20,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:medical_genetics|5": {
"hashes": {
"hash_examples": "182a71f4763d2cea",
"hash_full_prompts": "46a616fa51878959",
"hash_input_tokens": "d2cbccb05c894c9a",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:miscellaneous|5": {
"hashes": {
"hash_examples": "4c404fdbb4ca57fc",
"hash_full_prompts": "0813e1be36dbaae1",
"hash_input_tokens": "dc43c11c1d22c9cc",
"hash_cont_tokens": "fffaa9d5e21ea2af"
},
"truncated": 0,
"non_truncated": 783,
"padded": 3128,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_disputes|5": {
"hashes": {
"hash_examples": "60cbd2baa3fea5c9",
"hash_full_prompts": "1d14adebb9b62519",
"hash_input_tokens": "e14bc256c0636235",
"hash_cont_tokens": "7148539bc2747f6f"
},
"truncated": 0,
"non_truncated": 346,
"padded": 1380,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_scenarios|5": {
"hashes": {
"hash_examples": "fd8b0431fbdd75ef",
"hash_full_prompts": "b80d3d236165e3de",
"hash_input_tokens": "3ad51653e199ecb5",
"hash_cont_tokens": "ab3904f2ea05d117"
},
"truncated": 0,
"non_truncated": 895,
"padded": 3575,
"non_padded": 5,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:nutrition|5": {
"hashes": {
"hash_examples": "71e55e2b829b6528",
"hash_full_prompts": "2bfb18e5fab8dea7",
"hash_input_tokens": "9fc73099308228de",
"hash_cont_tokens": "77b06e0a3882a218"
},
"truncated": 0,
"non_truncated": 306,
"padded": 1224,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:philosophy|5": {
"hashes": {
"hash_examples": "a6d489a8d208fa4b",
"hash_full_prompts": "e8c0d5b6dae3ccc8",
"hash_input_tokens": "1f04529a01331877",
"hash_cont_tokens": "16547f8767db6b33"
},
"truncated": 0,
"non_truncated": 311,
"padded": 1244,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:prehistory|5": {
"hashes": {
"hash_examples": "6cc50f032a19acaa",
"hash_full_prompts": "4a6a1d3ab1bf28e4",
"hash_input_tokens": "b888c315b2fe260f",
"hash_cont_tokens": "d02c802ddda38fc6"
},
"truncated": 0,
"non_truncated": 324,
"padded": 1280,
"non_padded": 16,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_accounting|5": {
"hashes": {
"hash_examples": "50f57ab32f5f6cea",
"hash_full_prompts": "e60129bd2d82ffc6",
"hash_input_tokens": "f7d95500add349b4",
"hash_cont_tokens": "727a930b413e9dcc"
},
"truncated": 0,
"non_truncated": 282,
"padded": 1112,
"non_padded": 16,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_law|5": {
"hashes": {
"hash_examples": "a8fdc85c64f4b215",
"hash_full_prompts": "0dbb1d9b72dcea03",
"hash_input_tokens": "118dc3fd9f9a3ee4",
"hash_cont_tokens": "a6b7566ed4e357a4"
},
"truncated": 0,
"non_truncated": 1534,
"padded": 6136,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_medicine|5": {
"hashes": {
"hash_examples": "c373a28a3050a73a",
"hash_full_prompts": "5e040f9ca68b089e",
"hash_input_tokens": "a05e408a696a471a",
"hash_cont_tokens": "842eee9669bb319c"
},
"truncated": 0,
"non_truncated": 272,
"padded": 1088,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_psychology|5": {
"hashes": {
"hash_examples": "bf5254fe818356af",
"hash_full_prompts": "b386ecda8b87150e",
"hash_input_tokens": "24311c9a35d0d28e",
"hash_cont_tokens": "be012eab9f44677d"
},
"truncated": 0,
"non_truncated": 612,
"padded": 2448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:public_relations|5": {
"hashes": {
"hash_examples": "b66d52e28e7d14e0",
"hash_full_prompts": "fe43562263e25677",
"hash_input_tokens": "3423b8f3b48e9623",
"hash_cont_tokens": "94f3463ddfbff82d"
},
"truncated": 0,
"non_truncated": 110,
"padded": 436,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:security_studies|5": {
"hashes": {
"hash_examples": "514c14feaf000ad9",
"hash_full_prompts": "27d4a2ac541ef4b9",
"hash_input_tokens": "fda6731d7b1ee470",
"hash_cont_tokens": "8a9a00a3be4137d7"
},
"truncated": 0,
"non_truncated": 245,
"padded": 980,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:sociology|5": {
"hashes": {
"hash_examples": "f6c9bc9d18c80870",
"hash_full_prompts": "c072ea7d1a1524f2",
"hash_input_tokens": "9dc8c12d6e111a44",
"hash_cont_tokens": "b342c9aa0cc9576f"
},
"truncated": 0,
"non_truncated": 201,
"padded": 804,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:us_foreign_policy|5": {
"hashes": {
"hash_examples": "ed7b78629db6678f",
"hash_full_prompts": "341a97ca3e4d699d",
"hash_input_tokens": "c5eaba656a6b29de",
"hash_cont_tokens": "5739133e99fb8ad8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:virology|5": {
"hashes": {
"hash_examples": "bc52ffdc3f9b994a",
"hash_full_prompts": "651d471e2eb8b5e9",
"hash_input_tokens": "46fe9c766d7e8ace",
"hash_cont_tokens": "81e5cec1153bffc8"
},
"truncated": 0,
"non_truncated": 166,
"padded": 664,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:world_religions|5": {
"hashes": {
"hash_examples": "ecdb4a4f94f62930",
"hash_full_prompts": "3773f03542ce44a3",
"hash_input_tokens": "1010d6d65948506f",
"hash_cont_tokens": "1d7b5eb727cbc4c6"
},
"truncated": 0,
"non_truncated": 171,
"padded": 684,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "341a076d0beb7048",
"hash_full_prompts": "a5c8f2b7ff4f5ae2",
"hash_input_tokens": "b5af86d667921a83",
"hash_cont_tokens": "02ae1fe9bf3431a5"
},
"truncated": 0,
"non_truncated": 14042,
"padded": 56062,
"non_padded": 106,
"num_truncated_few_shots": 0
}
}