{
"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"override_batch_size": 1,
"max_samples": null,
"job_id": "",
"start_time": 1357604.561101805,
"end_time": 1358447.150633949,
"total_evaluation_time_secondes": "842.5895321441349",
"model_name": "HuggingFaceH4/Qwen1.5-1.8B-Chat-odpo",
"model_sha": "3f268359edf3bea5beb998cef7a2bff5b0720e94",
"model_dtype": "torch.bfloat16",
"model_size": "3.8 GB",
"config": null
},
"results": {
"leaderboard|mmlu:abstract_algebra|5": {
"acc": 0.35,
"acc_stderr": 0.0479372485441102
},
"leaderboard|mmlu:anatomy|5": {
"acc": 0.4074074074074074,
"acc_stderr": 0.04244633238353229
},
"leaderboard|mmlu:astronomy|5": {
"acc": 0.506578947368421,
"acc_stderr": 0.04068590050224971
},
"leaderboard|mmlu:business_ethics|5": {
"acc": 0.48,
"acc_stderr": 0.050211673156867795
},
"leaderboard|mmlu:clinical_knowledge|5": {
"acc": 0.47924528301886793,
"acc_stderr": 0.030746349975723463
},
"leaderboard|mmlu:college_biology|5": {
"acc": 0.4583333333333333,
"acc_stderr": 0.04166666666666665
},
"leaderboard|mmlu:college_chemistry|5": {
"acc": 0.29,
"acc_stderr": 0.045604802157206845
},
"leaderboard|mmlu:college_computer_science|5": {
"acc": 0.44,
"acc_stderr": 0.0498887651569859
},
"leaderboard|mmlu:college_mathematics|5": {
"acc": 0.38,
"acc_stderr": 0.048783173121456316
},
"leaderboard|mmlu:college_medicine|5": {
"acc": 0.43352601156069365,
"acc_stderr": 0.037786210790920545
},
"leaderboard|mmlu:college_physics|5": {
"acc": 0.21568627450980393,
"acc_stderr": 0.04092563958237654
},
"leaderboard|mmlu:computer_security|5": {
"acc": 0.59,
"acc_stderr": 0.04943110704237102
},
"leaderboard|mmlu:conceptual_physics|5": {
"acc": 0.4297872340425532,
"acc_stderr": 0.03236214467715563
},
"leaderboard|mmlu:econometrics|5": {
"acc": 0.2982456140350877,
"acc_stderr": 0.04303684033537314
},
"leaderboard|mmlu:electrical_engineering|5": {
"acc": 0.45517241379310347,
"acc_stderr": 0.04149886942192117
},
"leaderboard|mmlu:elementary_mathematics|5": {
"acc": 0.35185185185185186,
"acc_stderr": 0.024594975128920945
},
"leaderboard|mmlu:formal_logic|5": {
"acc": 0.2857142857142857,
"acc_stderr": 0.0404061017820884
},
"leaderboard|mmlu:global_facts|5": {
"acc": 0.33,
"acc_stderr": 0.04725815626252604
},
"leaderboard|mmlu:high_school_biology|5": {
"acc": 0.4612903225806452,
"acc_stderr": 0.028358634859836935
},
"leaderboard|mmlu:high_school_chemistry|5": {
"acc": 0.33497536945812806,
"acc_stderr": 0.033208527423483104
},
"leaderboard|mmlu:high_school_computer_science|5": {
"acc": 0.48,
"acc_stderr": 0.050211673156867795
},
"leaderboard|mmlu:high_school_european_history|5": {
"acc": 0.6303030303030303,
"acc_stderr": 0.03769430314512567
},
"leaderboard|mmlu:high_school_geography|5": {
"acc": 0.601010101010101,
"acc_stderr": 0.03488901616852731
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"acc": 0.5854922279792746,
"acc_stderr": 0.035553003195576686
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"acc": 0.4282051282051282,
"acc_stderr": 0.025088301454694834
},
"leaderboard|mmlu:high_school_mathematics|5": {
"acc": 0.32222222222222224,
"acc_stderr": 0.028493465091028593
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"acc": 0.46218487394957986,
"acc_stderr": 0.032385469487589795
},
"leaderboard|mmlu:high_school_physics|5": {
"acc": 0.2847682119205298,
"acc_stderr": 0.03684881521389023
},
"leaderboard|mmlu:high_school_psychology|5": {
"acc": 0.5926605504587156,
"acc_stderr": 0.021065986244412898
},
"leaderboard|mmlu:high_school_statistics|5": {
"acc": 0.32407407407407407,
"acc_stderr": 0.03191923445686186
},
"leaderboard|mmlu:high_school_us_history|5": {
"acc": 0.5196078431372549,
"acc_stderr": 0.03506612560524866
},
"leaderboard|mmlu:high_school_world_history|5": {
"acc": 0.6666666666666666,
"acc_stderr": 0.0306858205966108
},
"leaderboard|mmlu:human_aging|5": {
"acc": 0.49327354260089684,
"acc_stderr": 0.03355476596234353
},
"leaderboard|mmlu:human_sexuality|5": {
"acc": 0.4732824427480916,
"acc_stderr": 0.04379024936553893
},
"leaderboard|mmlu:international_law|5": {
"acc": 0.6198347107438017,
"acc_stderr": 0.04431324501968431
},
"leaderboard|mmlu:jurisprudence|5": {
"acc": 0.5740740740740741,
"acc_stderr": 0.047803436269367894
},
"leaderboard|mmlu:logical_fallacies|5": {
"acc": 0.48466257668711654,
"acc_stderr": 0.039265223787088424
},
"leaderboard|mmlu:machine_learning|5": {
"acc": 0.3392857142857143,
"acc_stderr": 0.04493949068613539
},
"leaderboard|mmlu:management|5": {
"acc": 0.6796116504854369,
"acc_stderr": 0.04620284082280041
},
"leaderboard|mmlu:marketing|5": {
"acc": 0.7435897435897436,
"acc_stderr": 0.02860595370200426
},
"leaderboard|mmlu:medical_genetics|5": {
"acc": 0.47,
"acc_stderr": 0.050161355804659205
},
"leaderboard|mmlu:miscellaneous|5": {
"acc": 0.5747126436781609,
"acc_stderr": 0.01767922548943145
},
"leaderboard|mmlu:moral_disputes|5": {
"acc": 0.5086705202312138,
"acc_stderr": 0.026915047355369804
},
"leaderboard|mmlu:moral_scenarios|5": {
"acc": 0.23575418994413408,
"acc_stderr": 0.014196375686290804
},
"leaderboard|mmlu:nutrition|5": {
"acc": 0.5424836601307189,
"acc_stderr": 0.028526383452142635
},
"leaderboard|mmlu:philosophy|5": {
"acc": 0.4887459807073955,
"acc_stderr": 0.028390897396863533
},
"leaderboard|mmlu:prehistory|5": {
"acc": 0.49691358024691357,
"acc_stderr": 0.027820214158594384
},
"leaderboard|mmlu:professional_accounting|5": {
"acc": 0.34397163120567376,
"acc_stderr": 0.028338017428611317
},
"leaderboard|mmlu:professional_law|5": {
"acc": 0.3663624511082138,
"acc_stderr": 0.012305658346838439
},
"leaderboard|mmlu:professional_medicine|5": {
"acc": 0.4117647058823529,
"acc_stderr": 0.029896163033125468
},
"leaderboard|mmlu:professional_psychology|5": {
"acc": 0.44281045751633985,
"acc_stderr": 0.020095083154577347
},
"leaderboard|mmlu:public_relations|5": {
"acc": 0.5454545454545454,
"acc_stderr": 0.04769300568972744
},
"leaderboard|mmlu:security_studies|5": {
"acc": 0.4489795918367347,
"acc_stderr": 0.0318421386668758
},
"leaderboard|mmlu:sociology|5": {
"acc": 0.5373134328358209,
"acc_stderr": 0.03525675167467974
},
"leaderboard|mmlu:us_foreign_policy|5": {
"acc": 0.69,
"acc_stderr": 0.04648231987117316
},
"leaderboard|mmlu:virology|5": {
"acc": 0.42771084337349397,
"acc_stderr": 0.038515976837185335
},
"leaderboard|mmlu:world_religions|5": {
"acc": 0.5847953216374269,
"acc_stderr": 0.03779275945503201
},
"leaderboard|mmlu:_average|5": {
"acc": 0.4631415313965751,
"acc_stderr": 0.03626529670007629
}
},
"versions": {
"leaderboard|mmlu:abstract_algebra|5": 0,
"leaderboard|mmlu:anatomy|5": 0,
"leaderboard|mmlu:astronomy|5": 0,
"leaderboard|mmlu:business_ethics|5": 0,
"leaderboard|mmlu:clinical_knowledge|5": 0,
"leaderboard|mmlu:college_biology|5": 0,
"leaderboard|mmlu:college_chemistry|5": 0,
"leaderboard|mmlu:college_computer_science|5": 0,
"leaderboard|mmlu:college_mathematics|5": 0,
"leaderboard|mmlu:college_medicine|5": 0,
"leaderboard|mmlu:college_physics|5": 0,
"leaderboard|mmlu:computer_security|5": 0,
"leaderboard|mmlu:conceptual_physics|5": 0,
"leaderboard|mmlu:econometrics|5": 0,
"leaderboard|mmlu:electrical_engineering|5": 0,
"leaderboard|mmlu:elementary_mathematics|5": 0,
"leaderboard|mmlu:formal_logic|5": 0,
"leaderboard|mmlu:global_facts|5": 0,
"leaderboard|mmlu:high_school_biology|5": 0,
"leaderboard|mmlu:high_school_chemistry|5": 0,
"leaderboard|mmlu:high_school_computer_science|5": 0,
"leaderboard|mmlu:high_school_european_history|5": 0,
"leaderboard|mmlu:high_school_geography|5": 0,
"leaderboard|mmlu:high_school_government_and_politics|5": 0,
"leaderboard|mmlu:high_school_macroeconomics|5": 0,
"leaderboard|mmlu:high_school_mathematics|5": 0,
"leaderboard|mmlu:high_school_microeconomics|5": 0,
"leaderboard|mmlu:high_school_physics|5": 0,
"leaderboard|mmlu:high_school_psychology|5": 0,
"leaderboard|mmlu:high_school_statistics|5": 0,
"leaderboard|mmlu:high_school_us_history|5": 0,
"leaderboard|mmlu:high_school_world_history|5": 0,
"leaderboard|mmlu:human_aging|5": 0,
"leaderboard|mmlu:human_sexuality|5": 0,
"leaderboard|mmlu:international_law|5": 0,
"leaderboard|mmlu:jurisprudence|5": 0,
"leaderboard|mmlu:logical_fallacies|5": 0,
"leaderboard|mmlu:machine_learning|5": 0,
"leaderboard|mmlu:management|5": 0,
"leaderboard|mmlu:marketing|5": 0,
"leaderboard|mmlu:medical_genetics|5": 0,
"leaderboard|mmlu:miscellaneous|5": 0,
"leaderboard|mmlu:moral_disputes|5": 0,
"leaderboard|mmlu:moral_scenarios|5": 0,
"leaderboard|mmlu:nutrition|5": 0,
"leaderboard|mmlu:philosophy|5": 0,
"leaderboard|mmlu:prehistory|5": 0,
"leaderboard|mmlu:professional_accounting|5": 0,
"leaderboard|mmlu:professional_law|5": 0,
"leaderboard|mmlu:professional_medicine|5": 0,
"leaderboard|mmlu:professional_psychology|5": 0,
"leaderboard|mmlu:public_relations|5": 0,
"leaderboard|mmlu:security_studies|5": 0,
"leaderboard|mmlu:sociology|5": 0,
"leaderboard|mmlu:us_foreign_policy|5": 0,
"leaderboard|mmlu:virology|5": 0,
"leaderboard|mmlu:world_religions|5": 0
},
"config_tasks": {
"leaderboard|mmlu:abstract_algebra": {
"name": "mmlu:abstract_algebra",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "abstract_algebra",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:anatomy": {
"name": "mmlu:anatomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "anatomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 135,
"effective_num_docs": 135,
"trust_dataset": true
},
"leaderboard|mmlu:astronomy": {
"name": "mmlu:astronomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "astronomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 152,
"effective_num_docs": 152,
"trust_dataset": true
},
"leaderboard|mmlu:business_ethics": {
"name": "mmlu:business_ethics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "business_ethics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:clinical_knowledge": {
"name": "mmlu:clinical_knowledge",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "clinical_knowledge",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 265,
"effective_num_docs": 265,
"trust_dataset": true
},
"leaderboard|mmlu:college_biology": {
"name": "mmlu:college_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 144,
"effective_num_docs": 144,
"trust_dataset": true
},
"leaderboard|mmlu:college_chemistry": {
"name": "mmlu:college_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:college_computer_science": {
"name": "mmlu:college_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:college_mathematics": {
"name": "mmlu:college_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:college_medicine": {
"name": "mmlu:college_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 173,
"effective_num_docs": 173,
"trust_dataset": true
},
"leaderboard|mmlu:college_physics": {
"name": "mmlu:college_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 102,
"effective_num_docs": 102,
"trust_dataset": true
},
"leaderboard|mmlu:computer_security": {
"name": "mmlu:computer_security",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "computer_security",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:conceptual_physics": {
"name": "mmlu:conceptual_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "conceptual_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 235,
"effective_num_docs": 235,
"trust_dataset": true
},
"leaderboard|mmlu:econometrics": {
"name": "mmlu:econometrics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "econometrics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 114,
"effective_num_docs": 114,
"trust_dataset": true
},
"leaderboard|mmlu:electrical_engineering": {
"name": "mmlu:electrical_engineering",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "electrical_engineering",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 145,
"effective_num_docs": 145,
"trust_dataset": true
},
"leaderboard|mmlu:elementary_mathematics": {
"name": "mmlu:elementary_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "elementary_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 378,
"effective_num_docs": 378,
"trust_dataset": true
},
"leaderboard|mmlu:formal_logic": {
"name": "mmlu:formal_logic",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "formal_logic",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 126,
"effective_num_docs": 126,
"trust_dataset": true
},
"leaderboard|mmlu:global_facts": {
"name": "mmlu:global_facts",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "global_facts",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_biology": {
"name": "mmlu:high_school_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 310,
"effective_num_docs": 310,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_chemistry": {
"name": "mmlu:high_school_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 203,
"effective_num_docs": 203,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_computer_science": {
"name": "mmlu:high_school_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_european_history": {
"name": "mmlu:high_school_european_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_european_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 165,
"effective_num_docs": 165,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_geography": {
"name": "mmlu:high_school_geography",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_geography",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 198,
"effective_num_docs": 198,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_government_and_politics": {
"name": "mmlu:high_school_government_and_politics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_government_and_politics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 193,
"effective_num_docs": 193,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_macroeconomics": {
"name": "mmlu:high_school_macroeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_macroeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 390,
"effective_num_docs": 390,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_mathematics": {
"name": "mmlu:high_school_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 270,
"effective_num_docs": 270,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_microeconomics": {
"name": "mmlu:high_school_microeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_microeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 238,
"effective_num_docs": 238,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_physics": {
"name": "mmlu:high_school_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 151,
"effective_num_docs": 151,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_psychology": {
"name": "mmlu:high_school_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 545,
"effective_num_docs": 545,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_statistics": {
"name": "mmlu:high_school_statistics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_statistics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 216,
"effective_num_docs": 216,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_us_history": {
"name": "mmlu:high_school_us_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_us_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 204,
"effective_num_docs": 204,
"trust_dataset": true
},
"leaderboard|mmlu:high_school_world_history": {
"name": "mmlu:high_school_world_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_world_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 237,
"effective_num_docs": 237,
"trust_dataset": true
},
"leaderboard|mmlu:human_aging": {
"name": "mmlu:human_aging",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_aging",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 223,
"effective_num_docs": 223,
"trust_dataset": true
},
"leaderboard|mmlu:human_sexuality": {
"name": "mmlu:human_sexuality",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_sexuality",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 131,
"effective_num_docs": 131,
"trust_dataset": true
},
"leaderboard|mmlu:international_law": {
"name": "mmlu:international_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "international_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 121,
"effective_num_docs": 121,
"trust_dataset": true
},
"leaderboard|mmlu:jurisprudence": {
"name": "mmlu:jurisprudence",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "jurisprudence",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 108,
"effective_num_docs": 108,
"trust_dataset": true
},
"leaderboard|mmlu:logical_fallacies": {
"name": "mmlu:logical_fallacies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "logical_fallacies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 163,
"effective_num_docs": 163,
"trust_dataset": true
},
"leaderboard|mmlu:machine_learning": {
"name": "mmlu:machine_learning",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "machine_learning",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 112,
"effective_num_docs": 112,
"trust_dataset": true
},
"leaderboard|mmlu:management": {
"name": "mmlu:management",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "management",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 103,
"effective_num_docs": 103,
"trust_dataset": true
},
"leaderboard|mmlu:marketing": {
"name": "mmlu:marketing",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "marketing",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 234,
"effective_num_docs": 234,
"trust_dataset": true
},
"leaderboard|mmlu:medical_genetics": {
"name": "mmlu:medical_genetics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "medical_genetics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:miscellaneous": {
"name": "mmlu:miscellaneous",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "miscellaneous",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 783,
"effective_num_docs": 783,
"trust_dataset": true
},
"leaderboard|mmlu:moral_disputes": {
"name": "mmlu:moral_disputes",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_disputes",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 346,
"effective_num_docs": 346,
"trust_dataset": true
},
"leaderboard|mmlu:moral_scenarios": {
"name": "mmlu:moral_scenarios",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_scenarios",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 895,
"effective_num_docs": 895,
"trust_dataset": true
},
"leaderboard|mmlu:nutrition": {
"name": "mmlu:nutrition",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "nutrition",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 306,
"effective_num_docs": 306,
"trust_dataset": true
},
"leaderboard|mmlu:philosophy": {
"name": "mmlu:philosophy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "philosophy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 311,
"effective_num_docs": 311,
"trust_dataset": true
},
"leaderboard|mmlu:prehistory": {
"name": "mmlu:prehistory",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "prehistory",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 324,
"effective_num_docs": 324,
"trust_dataset": true
},
"leaderboard|mmlu:professional_accounting": {
"name": "mmlu:professional_accounting",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_accounting",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 282,
"effective_num_docs": 282,
"trust_dataset": true
},
"leaderboard|mmlu:professional_law": {
"name": "mmlu:professional_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 1534,
"effective_num_docs": 1534,
"trust_dataset": true
},
"leaderboard|mmlu:professional_medicine": {
"name": "mmlu:professional_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 272,
"effective_num_docs": 272,
"trust_dataset": true
},
"leaderboard|mmlu:professional_psychology": {
"name": "mmlu:professional_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 612,
"effective_num_docs": 612,
"trust_dataset": true
},
"leaderboard|mmlu:public_relations": {
"name": "mmlu:public_relations",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "public_relations",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 110,
"effective_num_docs": 110,
"trust_dataset": true
},
"leaderboard|mmlu:security_studies": {
"name": "mmlu:security_studies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "security_studies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 245,
"effective_num_docs": 245,
"trust_dataset": true
},
"leaderboard|mmlu:sociology": {
"name": "mmlu:sociology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "sociology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 201,
"effective_num_docs": 201,
"trust_dataset": true
},
"leaderboard|mmlu:us_foreign_policy": {
"name": "mmlu:us_foreign_policy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "us_foreign_policy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true
},
"leaderboard|mmlu:virology": {
"name": "mmlu:virology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "virology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 166,
"effective_num_docs": 166,
"trust_dataset": true
},
"leaderboard|mmlu:world_religions": {
"name": "mmlu:world_religions",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "world_religions",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 171,
"effective_num_docs": 171,
"trust_dataset": true
}
},
"summary_tasks": {
"leaderboard|mmlu:abstract_algebra|5": {
"hashes": {
"hash_examples": "4c76229e00c9c0e9",
"hash_full_prompts": "273278cb9fb5ac01",
"hash_input_tokens": "caf9777ccf71eab5",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:anatomy|5": {
"hashes": {
"hash_examples": "6a1f8104dccbd33b",
"hash_full_prompts": "e77b5ebe030aabba",
"hash_input_tokens": "d192cd7584fda4dc",
"hash_cont_tokens": "263324e6ce7f9b36"
},
"truncated": 0,
"non_truncated": 135,
"padded": 540,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:astronomy|5": {
"hashes": {
"hash_examples": "1302effa3a76ce4c",
"hash_full_prompts": "0ff37ef4519e63f9",
"hash_input_tokens": "d241783f0bfdf860",
"hash_cont_tokens": "18ba399c6801138e"
},
"truncated": 0,
"non_truncated": 152,
"padded": 608,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:business_ethics|5": {
"hashes": {
"hash_examples": "03cb8bce5336419a",
"hash_full_prompts": "7c4d312a23bdd669",
"hash_input_tokens": "0aee5ed969278926",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:clinical_knowledge|5": {
"hashes": {
"hash_examples": "ffbb9c7b2be257f9",
"hash_full_prompts": "472d93369b1a8382",
"hash_input_tokens": "aa05960be77863d3",
"hash_cont_tokens": "9d7500060e0dd995"
},
"truncated": 0,
"non_truncated": 265,
"padded": 1060,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_biology|5": {
"hashes": {
"hash_examples": "3ee77f176f38eb8e",
"hash_full_prompts": "6853bf027b349083",
"hash_input_tokens": "3843b5375a04262c",
"hash_cont_tokens": "78a731af5d2f6472"
},
"truncated": 0,
"non_truncated": 144,
"padded": 576,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_chemistry|5": {
"hashes": {
"hash_examples": "ce61a69c46d47aeb",
"hash_full_prompts": "e0f8624971f7af71",
"hash_input_tokens": "2096d1652e232764",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_computer_science|5": {
"hashes": {
"hash_examples": "32805b52d7d5daab",
"hash_full_prompts": "841e9d2ecfbb104d",
"hash_input_tokens": "1e007ac047722e9b",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_mathematics|5": {
"hashes": {
"hash_examples": "55da1a0a0bd33722",
"hash_full_prompts": "696c5f73522b8706",
"hash_input_tokens": "c3061d57b5a4ad7e",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_medicine|5": {
"hashes": {
"hash_examples": "c33e143163049176",
"hash_full_prompts": "7d2530816f672426",
"hash_input_tokens": "4cddd091001776d7",
"hash_cont_tokens": "699c8eb24e3e446b"
},
"truncated": 0,
"non_truncated": 173,
"padded": 692,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_physics|5": {
"hashes": {
"hash_examples": "ebdab1cdb7e555df",
"hash_full_prompts": "66b3a61507c4c92b",
"hash_input_tokens": "821b169941167548",
"hash_cont_tokens": "075997110cbe055e"
},
"truncated": 0,
"non_truncated": 102,
"padded": 408,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:computer_security|5": {
"hashes": {
"hash_examples": "a24fd7d08a560921",
"hash_full_prompts": "f1143da88158bf03",
"hash_input_tokens": "02e64465d74344b4",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:conceptual_physics|5": {
"hashes": {
"hash_examples": "8300977a79386993",
"hash_full_prompts": "d2b4c706b65a71d9",
"hash_input_tokens": "5c7a2235529d2821",
"hash_cont_tokens": "f22daa6d4818086f"
},
"truncated": 0,
"non_truncated": 235,
"padded": 940,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:econometrics|5": {
"hashes": {
"hash_examples": "ddde36788a04a46f",
"hash_full_prompts": "aa5255d923b0e3a3",
"hash_input_tokens": "e0a79ea9e037599d",
"hash_cont_tokens": "26791a0b1941b4c4"
},
"truncated": 0,
"non_truncated": 114,
"padded": 456,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:electrical_engineering|5": {
"hashes": {
"hash_examples": "acbc5def98c19b3f",
"hash_full_prompts": "c1f9a9087987d1d7",
"hash_input_tokens": "e48ddb58b2efa8e3",
"hash_cont_tokens": "3e336577994f6c0d"
},
"truncated": 0,
"non_truncated": 145,
"padded": 580,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:elementary_mathematics|5": {
"hashes": {
"hash_examples": "146e61d07497a9bd",
"hash_full_prompts": "57fb9ddf2f814bb5",
"hash_input_tokens": "9e81373b5265da10",
"hash_cont_tokens": "1d6bbfa8a67327c8"
},
"truncated": 0,
"non_truncated": 378,
"padded": 1512,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:formal_logic|5": {
"hashes": {
"hash_examples": "8635216e1909a03f",
"hash_full_prompts": "dc7e34e04346adfd",
"hash_input_tokens": "0378ed1f1a9bb3f6",
"hash_cont_tokens": "60508d85eb7693a4"
},
"truncated": 0,
"non_truncated": 126,
"padded": 504,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:global_facts|5": {
"hashes": {
"hash_examples": "30b315aa6353ee47",
"hash_full_prompts": "7dedb5baa45f3a38",
"hash_input_tokens": "d20db9bd82fb76c1",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_biology|5": {
"hashes": {
"hash_examples": "c9136373af2180de",
"hash_full_prompts": "15157813fc668acf",
"hash_input_tokens": "c3c10eef8c477c93",
"hash_cont_tokens": "d236ce982144e65f"
},
"truncated": 0,
"non_truncated": 310,
"padded": 1240,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_chemistry|5": {
"hashes": {
"hash_examples": "b0661bfa1add6404",
"hash_full_prompts": "f51dfd92a2d6fdba",
"hash_input_tokens": "dc53c87961ef4ab5",
"hash_cont_tokens": "59f93238ec5aead6"
},
"truncated": 0,
"non_truncated": 203,
"padded": 812,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_computer_science|5": {
"hashes": {
"hash_examples": "80fc1d623a3d665f",
"hash_full_prompts": "fe432a03fe8cc766",
"hash_input_tokens": "61fa356c3ea98372",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_european_history|5": {
"hashes": {
"hash_examples": "854da6e5af0fe1a1",
"hash_full_prompts": "09a62e1560fb1171",
"hash_input_tokens": "272f8d31300ef0af",
"hash_cont_tokens": "7b7414d6a5da3d91"
},
"truncated": 0,
"non_truncated": 165,
"padded": 656,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_geography|5": {
"hashes": {
"hash_examples": "7dc963c7acd19ad8",
"hash_full_prompts": "8284151c76cee4d8",
"hash_input_tokens": "12624aed9bf6356b",
"hash_cont_tokens": "1b66289e10988f84"
},
"truncated": 0,
"non_truncated": 198,
"padded": 792,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"hashes": {
"hash_examples": "1f675dcdebc9758f",
"hash_full_prompts": "083339a69a8bfafa",
"hash_input_tokens": "32e30c43a4a5347e",
"hash_cont_tokens": "5ab3c3415b1d3a55"
},
"truncated": 0,
"non_truncated": 193,
"padded": 772,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"hashes": {
"hash_examples": "2fb32cf2d80f0b35",
"hash_full_prompts": "ececedb0c4a4ffcd",
"hash_input_tokens": "dc2cd6b398f5f86e",
"hash_cont_tokens": "2f5457058d187374"
},
"truncated": 0,
"non_truncated": 390,
"padded": 1557,
"non_padded": 3,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_mathematics|5": {
"hashes": {
"hash_examples": "fd6646fdb5d58a1f",
"hash_full_prompts": "d58a3ca5c8ed6780",
"hash_input_tokens": "6f9c5ce6428dd87d",
"hash_cont_tokens": "e35137cb972e1918"
},
"truncated": 0,
"non_truncated": 270,
"padded": 1080,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"hashes": {
"hash_examples": "2118f21f71d87d84",
"hash_full_prompts": "bd49ce8a930e3e78",
"hash_input_tokens": "44722cbe1d85e636",
"hash_cont_tokens": "f756093278ebb83e"
},
"truncated": 0,
"non_truncated": 238,
"padded": 908,
"non_padded": 44,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_physics|5": {
"hashes": {
"hash_examples": "dc3ce06378548565",
"hash_full_prompts": "3904af994b32b959",
"hash_input_tokens": "2132f616c2587937",
"hash_cont_tokens": "9cf883ebf1c82176"
},
"truncated": 0,
"non_truncated": 151,
"padded": 604,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_psychology|5": {
"hashes": {
"hash_examples": "c8d1d98a40e11f2f",
"hash_full_prompts": "d3a4d5dd3f3513f8",
"hash_input_tokens": "6cc69cf1a89e4a88",
"hash_cont_tokens": "bda0f77331ebb21a"
},
"truncated": 0,
"non_truncated": 545,
"padded": 2178,
"non_padded": 2,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_statistics|5": {
"hashes": {
"hash_examples": "666c8759b98ee4ff",
"hash_full_prompts": "1b5599f9d4edc7de",
"hash_input_tokens": "60af7a873b579818",
"hash_cont_tokens": "4d04f014105a0bad"
},
"truncated": 0,
"non_truncated": 216,
"padded": 864,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_us_history|5": {
"hashes": {
"hash_examples": "95fef1c4b7d3f81e",
"hash_full_prompts": "001f7e7cc8185618",
"hash_input_tokens": "8c2d01a0f291db69",
"hash_cont_tokens": "f4590c58f12f2766"
},
"truncated": 0,
"non_truncated": 204,
"padded": 816,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_world_history|5": {
"hashes": {
"hash_examples": "7e5085b6184b0322",
"hash_full_prompts": "6a5c2a43cf7c6cb1",
"hash_input_tokens": "612ed95e43bc21b5",
"hash_cont_tokens": "db6bcddd891df5d9"
},
"truncated": 0,
"non_truncated": 237,
"padded": 948,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_aging|5": {
"hashes": {
"hash_examples": "c17333e7c7c10797",
"hash_full_prompts": "a3ad8e679fe07bef",
"hash_input_tokens": "4c948b081b40ba31",
"hash_cont_tokens": "25cec8d640319105"
},
"truncated": 0,
"non_truncated": 223,
"padded": 892,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_sexuality|5": {
"hashes": {
"hash_examples": "4edd1e9045df5e3d",
"hash_full_prompts": "3389ffb95929a661",
"hash_input_tokens": "9e649cc80ef9f2fe",
"hash_cont_tokens": "6778302b4a10b645"
},
"truncated": 0,
"non_truncated": 131,
"padded": 524,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:international_law|5": {
"hashes": {
"hash_examples": "db2fa00d771a062a",
"hash_full_prompts": "104f48c64f6f9622",
"hash_input_tokens": "c51db1d4a2a87eed",
"hash_cont_tokens": "9eb54e1a46032749"
},
"truncated": 0,
"non_truncated": 121,
"padded": 484,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:jurisprudence|5": {
"hashes": {
"hash_examples": "e956f86b124076fe",
"hash_full_prompts": "49295d36462ddc97",
"hash_input_tokens": "a779a1b30bc13f30",
"hash_cont_tokens": "f17d9a372cfd66b1"
},
"truncated": 0,
"non_truncated": 108,
"padded": 420,
"non_padded": 12,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:logical_fallacies|5": {
"hashes": {
"hash_examples": "956e0e6365ab79f1",
"hash_full_prompts": "b64f452752d5cd23",
"hash_input_tokens": "61d99e8d4d4d8652",
"hash_cont_tokens": "cf44a68f5bca9a96"
},
"truncated": 0,
"non_truncated": 163,
"padded": 648,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:machine_learning|5": {
"hashes": {
"hash_examples": "397997cc6f4d581e",
"hash_full_prompts": "54da136ebd708042",
"hash_input_tokens": "11e6731506fcf366",
"hash_cont_tokens": "eace00d420f4f32c"
},
"truncated": 0,
"non_truncated": 112,
"padded": 448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:management|5": {
"hashes": {
"hash_examples": "2bcbe6f6ca63d740",
"hash_full_prompts": "a4b864ff27598ba3",
"hash_input_tokens": "caffa6e4e80cbd5e",
"hash_cont_tokens": "b7c51d0250c252d8"
},
"truncated": 0,
"non_truncated": 103,
"padded": 412,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:marketing|5": {
"hashes": {
"hash_examples": "8ddb20d964a1b065",
"hash_full_prompts": "c7183ac32f36104d",
"hash_input_tokens": "5cd238ac5e8f19f4",
"hash_cont_tokens": "086fb63f8b1d1339"
},
"truncated": 0,
"non_truncated": 234,
"padded": 924,
"non_padded": 12,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:medical_genetics|5": {
"hashes": {
"hash_examples": "182a71f4763d2cea",
"hash_full_prompts": "c17b0a66e3027303",
"hash_input_tokens": "46c0c8a573b43089",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:miscellaneous|5": {
"hashes": {
"hash_examples": "4c404fdbb4ca57fc",
"hash_full_prompts": "bc5fa37ce20a2503",
"hash_input_tokens": "5327cd4585062ac2",
"hash_cont_tokens": "1827274fa6537077"
},
"truncated": 0,
"non_truncated": 783,
"padded": 3132,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_disputes|5": {
"hashes": {
"hash_examples": "60cbd2baa3fea5c9",
"hash_full_prompts": "075742051236078f",
"hash_input_tokens": "a2c9da202f686839",
"hash_cont_tokens": "472c223f6f28cfc7"
},
"truncated": 0,
"non_truncated": 346,
"padded": 1384,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_scenarios|5": {
"hashes": {
"hash_examples": "fd8b0431fbdd75ef",
"hash_full_prompts": "533c4700637599a2",
"hash_input_tokens": "9a1a9f3900b372e6",
"hash_cont_tokens": "e90dade00a092f9e"
},
"truncated": 0,
"non_truncated": 895,
"padded": 3567,
"non_padded": 13,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:nutrition|5": {
"hashes": {
"hash_examples": "71e55e2b829b6528",
"hash_full_prompts": "02b6877dc5a603a6",
"hash_input_tokens": "dd91fec063272e23",
"hash_cont_tokens": "128e0ec97d96b165"
},
"truncated": 0,
"non_truncated": 306,
"padded": 1224,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:philosophy|5": {
"hashes": {
"hash_examples": "a6d489a8d208fa4b",
"hash_full_prompts": "0e65b5f40a9ceb20",
"hash_input_tokens": "2255e15265a7d96a",
"hash_cont_tokens": "cbfd7829a3e0f082"
},
"truncated": 0,
"non_truncated": 311,
"padded": 1244,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:prehistory|5": {
"hashes": {
"hash_examples": "6cc50f032a19acaa",
"hash_full_prompts": "e838e60749e4a598",
"hash_input_tokens": "1b9b906efbcc97fd",
"hash_cont_tokens": "9c0cf5a2f71afa7e"
},
"truncated": 0,
"non_truncated": 324,
"padded": 1284,
"non_padded": 12,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_accounting|5": {
"hashes": {
"hash_examples": "50f57ab32f5f6cea",
"hash_full_prompts": "9abf7319f68b7ba8",
"hash_input_tokens": "d42c8275cd4e10e1",
"hash_cont_tokens": "50f011c2453517ee"
},
"truncated": 0,
"non_truncated": 282,
"padded": 1128,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_law|5": {
"hashes": {
"hash_examples": "a8fdc85c64f4b215",
"hash_full_prompts": "4074faf1eaedcfda",
"hash_input_tokens": "215c854d27e741b8",
"hash_cont_tokens": "73527e852c24186c"
},
"truncated": 0,
"non_truncated": 1534,
"padded": 6136,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_medicine|5": {
"hashes": {
"hash_examples": "c373a28a3050a73a",
"hash_full_prompts": "e72202fc20fcab70",
"hash_input_tokens": "5a6e9aaaaea83544",
"hash_cont_tokens": "ceb7af5e2e789abc"
},
"truncated": 0,
"non_truncated": 272,
"padded": 1088,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_psychology|5": {
"hashes": {
"hash_examples": "bf5254fe818356af",
"hash_full_prompts": "4dcb71c9ef602791",
"hash_input_tokens": "316d0ba731b0de4f",
"hash_cont_tokens": "8cfdced8a9667380"
},
"truncated": 0,
"non_truncated": 612,
"padded": 2428,
"non_padded": 20,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:public_relations|5": {
"hashes": {
"hash_examples": "b66d52e28e7d14e0",
"hash_full_prompts": "c6050b1748185950",
"hash_input_tokens": "2ba1d90c95e19dce",
"hash_cont_tokens": "f8327461a9cc5123"
},
"truncated": 0,
"non_truncated": 110,
"padded": 436,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:security_studies|5": {
"hashes": {
"hash_examples": "514c14feaf000ad9",
"hash_full_prompts": "4c6786915b670d03",
"hash_input_tokens": "b92f71eccf4f89bf",
"hash_cont_tokens": "c30b0c4d52c2875d"
},
"truncated": 0,
"non_truncated": 245,
"padded": 980,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:sociology|5": {
"hashes": {
"hash_examples": "f6c9bc9d18c80870",
"hash_full_prompts": "a2e9a27e985a4e9b",
"hash_input_tokens": "e821334ab55c0d44",
"hash_cont_tokens": "eef4bd16d536fbd6"
},
"truncated": 0,
"non_truncated": 201,
"padded": 804,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:us_foreign_policy|5": {
"hashes": {
"hash_examples": "ed7b78629db6678f",
"hash_full_prompts": "46d0986398662d59",
"hash_input_tokens": "9f6b40a7b6b8a3b2",
"hash_cont_tokens": "00520b0ec06da34f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:virology|5": {
"hashes": {
"hash_examples": "bc52ffdc3f9b994a",
"hash_full_prompts": "6b591e3983159283",
"hash_input_tokens": "d7c6d39e149defc9",
"hash_cont_tokens": "f5fc195e049353c0"
},
"truncated": 0,
"non_truncated": 166,
"padded": 664,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:world_religions|5": {
"hashes": {
"hash_examples": "ecdb4a4f94f62930",
"hash_full_prompts": "8c2e37a02519af15",
"hash_input_tokens": "80b87b6e634441d6",
"hash_cont_tokens": "ada548665e87b1e0"
},
"truncated": 0,
"non_truncated": 171,
"padded": 684,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "341a076d0beb7048",
"hash_full_prompts": "7c1eeddf962b8fc9",
"hash_input_tokens": "98bef9715b6ebf74",
"hash_cont_tokens": "3672212ca582e2d0"
},
"truncated": 0,
"non_truncated": 14042,
"padded": 56038,
"non_padded": 130,
"num_truncated_few_shots": 0
}
}