{
"config_general": {
"lighteval_sha": "?",
"num_fewshot_seeds": 1,
"override_batch_size": 4,
"max_samples": null,
"job_id": "",
"start_time": 19631.841670262,
"end_time": 32395.446210489,
"total_evaluation_time_secondes": "12763.604540227",
"model_name": "abhishek/autotrain-llama3-70b-orpo-v1",
"model_sha": "053236c6846cc561c1503ba05e2b28c94855a432",
"model_dtype": "torch.float16",
"model_size": "131.73 GB",
"config": null
},
"results": {
"leaderboard|mmlu:abstract_algebra|5": {
"acc": 0.49,
"acc_stderr": 0.05024183937956913
},
"leaderboard|mmlu:anatomy|5": {
"acc": 0.7703703703703704,
"acc_stderr": 0.036333844140734636
},
"leaderboard|mmlu:astronomy|5": {
"acc": 0.9210526315789473,
"acc_stderr": 0.02194434281824793
},
"leaderboard|mmlu:business_ethics|5": {
"acc": 0.85,
"acc_stderr": 0.03588702812826371
},
"leaderboard|mmlu:clinical_knowledge|5": {
"acc": 0.8528301886792453,
"acc_stderr": 0.02180412613479738
},
"leaderboard|mmlu:college_biology|5": {
"acc": 0.9236111111111112,
"acc_stderr": 0.022212203938345918
},
"leaderboard|mmlu:college_chemistry|5": {
"acc": 0.58,
"acc_stderr": 0.049604496374885836
},
"leaderboard|mmlu:college_computer_science|5": {
"acc": 0.67,
"acc_stderr": 0.04725815626252606
},
"leaderboard|mmlu:college_mathematics|5": {
"acc": 0.55,
"acc_stderr": 0.049999999999999996
},
"leaderboard|mmlu:college_medicine|5": {
"acc": 0.7861271676300579,
"acc_stderr": 0.03126511206173044
},
"leaderboard|mmlu:college_physics|5": {
"acc": 0.5490196078431373,
"acc_stderr": 0.049512182523962604
},
"leaderboard|mmlu:computer_security|5": {
"acc": 0.8,
"acc_stderr": 0.04020151261036846
},
"leaderboard|mmlu:conceptual_physics|5": {
"acc": 0.8127659574468085,
"acc_stderr": 0.025501588341883596
},
"leaderboard|mmlu:econometrics|5": {
"acc": 0.7192982456140351,
"acc_stderr": 0.04227054451232199
},
"leaderboard|mmlu:electrical_engineering|5": {
"acc": 0.7517241379310344,
"acc_stderr": 0.03600105692727771
},
"leaderboard|mmlu:elementary_mathematics|5": {
"acc": 0.6746031746031746,
"acc_stderr": 0.024130158299762616
},
"leaderboard|mmlu:formal_logic|5": {
"acc": 0.6428571428571429,
"acc_stderr": 0.04285714285714281
},
"leaderboard|mmlu:global_facts|5": {
"acc": 0.59,
"acc_stderr": 0.04943110704237101
},
"leaderboard|mmlu:high_school_biology|5": {
"acc": 0.9129032258064517,
"acc_stderr": 0.01604110074169669
},
"leaderboard|mmlu:high_school_chemistry|5": {
"acc": 0.6600985221674877,
"acc_stderr": 0.033327690684107895
},
"leaderboard|mmlu:high_school_computer_science|5": {
"acc": 0.9,
"acc_stderr": 0.030151134457776348
},
"leaderboard|mmlu:high_school_european_history|5": {
"acc": 0.8787878787878788,
"acc_stderr": 0.025485498373343237
},
"leaderboard|mmlu:high_school_geography|5": {
"acc": 0.9242424242424242,
"acc_stderr": 0.018852670234993093
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"acc": 0.9844559585492227,
"acc_stderr": 0.00892749271508434
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"acc": 0.8358974358974359,
"acc_stderr": 0.01877843431342372
},
"leaderboard|mmlu:high_school_mathematics|5": {
"acc": 0.5259259259259259,
"acc_stderr": 0.03044452852881074
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"acc": 0.8949579831932774,
"acc_stderr": 0.019916300758805225
},
"leaderboard|mmlu:high_school_physics|5": {
"acc": 0.6158940397350994,
"acc_stderr": 0.03971301814719197
},
"leaderboard|mmlu:high_school_psychology|5": {
"acc": 0.9486238532110092,
"acc_stderr": 0.009465168181022974
},
"leaderboard|mmlu:high_school_statistics|5": {
"acc": 0.7361111111111112,
"acc_stderr": 0.030058202704309846
},
"leaderboard|mmlu:high_school_us_history|5": {
"acc": 0.9313725490196079,
"acc_stderr": 0.017744453647073322
},
"leaderboard|mmlu:high_school_world_history|5": {
"acc": 0.9240506329113924,
"acc_stderr": 0.01724463325106569
},
"leaderboard|mmlu:human_aging|5": {
"acc": 0.8161434977578476,
"acc_stderr": 0.025998379092356517
},
"leaderboard|mmlu:human_sexuality|5": {
"acc": 0.8625954198473282,
"acc_stderr": 0.030194823996804475
},
"leaderboard|mmlu:international_law|5": {
"acc": 0.9090909090909091,
"acc_stderr": 0.026243194054073885
},
"leaderboard|mmlu:jurisprudence|5": {
"acc": 0.8703703703703703,
"acc_stderr": 0.03247224389917946
},
"leaderboard|mmlu:logical_fallacies|5": {
"acc": 0.8466257668711656,
"acc_stderr": 0.0283116014414386
},
"leaderboard|mmlu:machine_learning|5": {
"acc": 0.7142857142857143,
"acc_stderr": 0.042878587513404544
},
"leaderboard|mmlu:management|5": {
"acc": 0.912621359223301,
"acc_stderr": 0.027960689125970654
},
"leaderboard|mmlu:marketing|5": {
"acc": 0.9358974358974359,
"acc_stderr": 0.016046261631673137
},
"leaderboard|mmlu:medical_genetics|5": {
"acc": 0.93,
"acc_stderr": 0.0256432399976243
},
"leaderboard|mmlu:miscellaneous|5": {
"acc": 0.9233716475095786,
"acc_stderr": 0.009512170699323858
},
"leaderboard|mmlu:moral_disputes|5": {
"acc": 0.8208092485549133,
"acc_stderr": 0.020647590029679332
},
"leaderboard|mmlu:moral_scenarios|5": {
"acc": 0.7463687150837989,
"acc_stderr": 0.014551553659369918
},
"leaderboard|mmlu:nutrition|5": {
"acc": 0.869281045751634,
"acc_stderr": 0.019301873624215267
},
"leaderboard|mmlu:philosophy|5": {
"acc": 0.797427652733119,
"acc_stderr": 0.022827317491059693
},
"leaderboard|mmlu:prehistory|5": {
"acc": 0.9012345679012346,
"acc_stderr": 0.01660046080164534
},
"leaderboard|mmlu:professional_accounting|5": {
"acc": 0.6595744680851063,
"acc_stderr": 0.02826765748265014
},
"leaderboard|mmlu:professional_law|5": {
"acc": 0.6323337679269883,
"acc_stderr": 0.012314845910071712
},
"leaderboard|mmlu:professional_medicine|5": {
"acc": 0.8933823529411765,
"acc_stderr": 0.018747725509716835
},
"leaderboard|mmlu:professional_psychology|5": {
"acc": 0.8611111111111112,
"acc_stderr": 0.013990806277040208
},
"leaderboard|mmlu:public_relations|5": {
"acc": 0.7454545454545455,
"acc_stderr": 0.041723430387053825
},
"leaderboard|mmlu:security_studies|5": {
"acc": 0.8081632653061225,
"acc_stderr": 0.0252069631542254
},
"leaderboard|mmlu:sociology|5": {
"acc": 0.9154228855721394,
"acc_stderr": 0.019675343217199173
},
"leaderboard|mmlu:us_foreign_policy|5": {
"acc": 0.92,
"acc_stderr": 0.0272659924344291
},
"leaderboard|mmlu:virology|5": {
"acc": 0.572289156626506,
"acc_stderr": 0.03851597683718533
},
"leaderboard|mmlu:world_religions|5": {
"acc": 0.8888888888888888,
"acc_stderr": 0.024103384202072864
},
"leaderboard|mmlu:_average|5": {
"acc": 0.7957951766493738,
"acc_stderr": 0.0280984014309186
},
"all": {
"acc": 0.7957951766493738,
"acc_stderr": 0.0280984014309186
}
},
"versions": {
"leaderboard|mmlu:abstract_algebra|5": 0,
"leaderboard|mmlu:anatomy|5": 0,
"leaderboard|mmlu:astronomy|5": 0,
"leaderboard|mmlu:business_ethics|5": 0,
"leaderboard|mmlu:clinical_knowledge|5": 0,
"leaderboard|mmlu:college_biology|5": 0,
"leaderboard|mmlu:college_chemistry|5": 0,
"leaderboard|mmlu:college_computer_science|5": 0,
"leaderboard|mmlu:college_mathematics|5": 0,
"leaderboard|mmlu:college_medicine|5": 0,
"leaderboard|mmlu:college_physics|5": 0,
"leaderboard|mmlu:computer_security|5": 0,
"leaderboard|mmlu:conceptual_physics|5": 0,
"leaderboard|mmlu:econometrics|5": 0,
"leaderboard|mmlu:electrical_engineering|5": 0,
"leaderboard|mmlu:elementary_mathematics|5": 0,
"leaderboard|mmlu:formal_logic|5": 0,
"leaderboard|mmlu:global_facts|5": 0,
"leaderboard|mmlu:high_school_biology|5": 0,
"leaderboard|mmlu:high_school_chemistry|5": 0,
"leaderboard|mmlu:high_school_computer_science|5": 0,
"leaderboard|mmlu:high_school_european_history|5": 0,
"leaderboard|mmlu:high_school_geography|5": 0,
"leaderboard|mmlu:high_school_government_and_politics|5": 0,
"leaderboard|mmlu:high_school_macroeconomics|5": 0,
"leaderboard|mmlu:high_school_mathematics|5": 0,
"leaderboard|mmlu:high_school_microeconomics|5": 0,
"leaderboard|mmlu:high_school_physics|5": 0,
"leaderboard|mmlu:high_school_psychology|5": 0,
"leaderboard|mmlu:high_school_statistics|5": 0,
"leaderboard|mmlu:high_school_us_history|5": 0,
"leaderboard|mmlu:high_school_world_history|5": 0,
"leaderboard|mmlu:human_aging|5": 0,
"leaderboard|mmlu:human_sexuality|5": 0,
"leaderboard|mmlu:international_law|5": 0,
"leaderboard|mmlu:jurisprudence|5": 0,
"leaderboard|mmlu:logical_fallacies|5": 0,
"leaderboard|mmlu:machine_learning|5": 0,
"leaderboard|mmlu:management|5": 0,
"leaderboard|mmlu:marketing|5": 0,
"leaderboard|mmlu:medical_genetics|5": 0,
"leaderboard|mmlu:miscellaneous|5": 0,
"leaderboard|mmlu:moral_disputes|5": 0,
"leaderboard|mmlu:moral_scenarios|5": 0,
"leaderboard|mmlu:nutrition|5": 0,
"leaderboard|mmlu:philosophy|5": 0,
"leaderboard|mmlu:prehistory|5": 0,
"leaderboard|mmlu:professional_accounting|5": 0,
"leaderboard|mmlu:professional_law|5": 0,
"leaderboard|mmlu:professional_medicine|5": 0,
"leaderboard|mmlu:professional_psychology|5": 0,
"leaderboard|mmlu:public_relations|5": 0,
"leaderboard|mmlu:security_studies|5": 0,
"leaderboard|mmlu:sociology|5": 0,
"leaderboard|mmlu:us_foreign_policy|5": 0,
"leaderboard|mmlu:virology|5": 0,
"leaderboard|mmlu:world_religions|5": 0
},
"config_tasks": {
"leaderboard|mmlu:abstract_algebra": {
"name": "mmlu:abstract_algebra",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "abstract_algebra",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:anatomy": {
"name": "mmlu:anatomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "anatomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 135,
"effective_num_docs": 135,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:astronomy": {
"name": "mmlu:astronomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "astronomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 152,
"effective_num_docs": 152,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:business_ethics": {
"name": "mmlu:business_ethics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "business_ethics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:clinical_knowledge": {
"name": "mmlu:clinical_knowledge",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "clinical_knowledge",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 265,
"effective_num_docs": 265,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_biology": {
"name": "mmlu:college_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 144,
"effective_num_docs": 144,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_chemistry": {
"name": "mmlu:college_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_computer_science": {
"name": "mmlu:college_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_mathematics": {
"name": "mmlu:college_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_medicine": {
"name": "mmlu:college_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 173,
"effective_num_docs": 173,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:college_physics": {
"name": "mmlu:college_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 102,
"effective_num_docs": 102,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:computer_security": {
"name": "mmlu:computer_security",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "computer_security",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:conceptual_physics": {
"name": "mmlu:conceptual_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "conceptual_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 235,
"effective_num_docs": 235,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:econometrics": {
"name": "mmlu:econometrics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "econometrics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 114,
"effective_num_docs": 114,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:electrical_engineering": {
"name": "mmlu:electrical_engineering",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "electrical_engineering",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 145,
"effective_num_docs": 145,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:elementary_mathematics": {
"name": "mmlu:elementary_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "elementary_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 378,
"effective_num_docs": 378,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:formal_logic": {
"name": "mmlu:formal_logic",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "formal_logic",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 126,
"effective_num_docs": 126,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:global_facts": {
"name": "mmlu:global_facts",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "global_facts",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_biology": {
"name": "mmlu:high_school_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 310,
"effective_num_docs": 310,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_chemistry": {
"name": "mmlu:high_school_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 203,
"effective_num_docs": 203,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_computer_science": {
"name": "mmlu:high_school_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_european_history": {
"name": "mmlu:high_school_european_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_european_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 165,
"effective_num_docs": 165,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_geography": {
"name": "mmlu:high_school_geography",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_geography",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 198,
"effective_num_docs": 198,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_government_and_politics": {
"name": "mmlu:high_school_government_and_politics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_government_and_politics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 193,
"effective_num_docs": 193,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_macroeconomics": {
"name": "mmlu:high_school_macroeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_macroeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 390,
"effective_num_docs": 390,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_mathematics": {
"name": "mmlu:high_school_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 270,
"effective_num_docs": 270,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_microeconomics": {
"name": "mmlu:high_school_microeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_microeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 238,
"effective_num_docs": 238,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_physics": {
"name": "mmlu:high_school_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 151,
"effective_num_docs": 151,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_psychology": {
"name": "mmlu:high_school_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 545,
"effective_num_docs": 545,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_statistics": {
"name": "mmlu:high_school_statistics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_statistics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 216,
"effective_num_docs": 216,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_us_history": {
"name": "mmlu:high_school_us_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_us_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 204,
"effective_num_docs": 204,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:high_school_world_history": {
"name": "mmlu:high_school_world_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_world_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 237,
"effective_num_docs": 237,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:human_aging": {
"name": "mmlu:human_aging",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_aging",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 223,
"effective_num_docs": 223,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:human_sexuality": {
"name": "mmlu:human_sexuality",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_sexuality",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 131,
"effective_num_docs": 131,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:international_law": {
"name": "mmlu:international_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "international_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 121,
"effective_num_docs": 121,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:jurisprudence": {
"name": "mmlu:jurisprudence",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "jurisprudence",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 108,
"effective_num_docs": 108,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:logical_fallacies": {
"name": "mmlu:logical_fallacies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "logical_fallacies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 163,
"effective_num_docs": 163,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:machine_learning": {
"name": "mmlu:machine_learning",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "machine_learning",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 112,
"effective_num_docs": 112,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:management": {
"name": "mmlu:management",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "management",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 103,
"effective_num_docs": 103,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:marketing": {
"name": "mmlu:marketing",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "marketing",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 234,
"effective_num_docs": 234,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:medical_genetics": {
"name": "mmlu:medical_genetics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "medical_genetics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:miscellaneous": {
"name": "mmlu:miscellaneous",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "miscellaneous",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 783,
"effective_num_docs": 783,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:moral_disputes": {
"name": "mmlu:moral_disputes",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_disputes",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 346,
"effective_num_docs": 346,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:moral_scenarios": {
"name": "mmlu:moral_scenarios",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_scenarios",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 895,
"effective_num_docs": 895,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:nutrition": {
"name": "mmlu:nutrition",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "nutrition",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 306,
"effective_num_docs": 306,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:philosophy": {
"name": "mmlu:philosophy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "philosophy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 311,
"effective_num_docs": 311,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:prehistory": {
"name": "mmlu:prehistory",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "prehistory",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 324,
"effective_num_docs": 324,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_accounting": {
"name": "mmlu:professional_accounting",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_accounting",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 282,
"effective_num_docs": 282,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_law": {
"name": "mmlu:professional_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 1534,
"effective_num_docs": 1534,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_medicine": {
"name": "mmlu:professional_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 272,
"effective_num_docs": 272,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:professional_psychology": {
"name": "mmlu:professional_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 612,
"effective_num_docs": 612,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:public_relations": {
"name": "mmlu:public_relations",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "public_relations",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 110,
"effective_num_docs": 110,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:security_studies": {
"name": "mmlu:security_studies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "security_studies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 245,
"effective_num_docs": 245,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:sociology": {
"name": "mmlu:sociology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "sociology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 201,
"effective_num_docs": 201,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:us_foreign_policy": {
"name": "mmlu:us_foreign_policy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "us_foreign_policy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:virology": {
"name": "mmlu:virology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "virology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 166,
"effective_num_docs": 166,
"trust_dataset": true,
"must_remove_duplicate_docs": null
},
"leaderboard|mmlu:world_religions": {
"name": "mmlu:world_religions",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "world_religions",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 171,
"effective_num_docs": 171,
"trust_dataset": true,
"must_remove_duplicate_docs": null
}
},
"summary_tasks": {
"leaderboard|mmlu:abstract_algebra|5": {
"hashes": {
"hash_examples": "4c76229e00c9c0e9",
"hash_full_prompts": "a45d01c3409c889c",
"hash_input_tokens": "c58f21ad388e41a4",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:anatomy|5": {
"hashes": {
"hash_examples": "6a1f8104dccbd33b",
"hash_full_prompts": "e245c6600e03cc32",
"hash_input_tokens": "664ad983d943ad07",
"hash_cont_tokens": "eb0c9a1e487e77a6"
},
"truncated": 0,
"non_truncated": 135,
"padded": 540,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:astronomy|5": {
"hashes": {
"hash_examples": "1302effa3a76ce4c",
"hash_full_prompts": "390f9bddf857ad04",
"hash_input_tokens": "6b8419ce1ca61ae8",
"hash_cont_tokens": "2c8a49864c3d99c2"
},
"truncated": 0,
"non_truncated": 152,
"padded": 608,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:business_ethics|5": {
"hashes": {
"hash_examples": "03cb8bce5336419a",
"hash_full_prompts": "5504f893bc4f2fa1",
"hash_input_tokens": "bf7a56022072a446",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:clinical_knowledge|5": {
"hashes": {
"hash_examples": "ffbb9c7b2be257f9",
"hash_full_prompts": "106ad0bab4b90b78",
"hash_input_tokens": "1d310792e0aaf29c",
"hash_cont_tokens": "f2bfcea369926d68"
},
"truncated": 0,
"non_truncated": 265,
"padded": 1060,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_biology|5": {
"hashes": {
"hash_examples": "3ee77f176f38eb8e",
"hash_full_prompts": "59f9bdf2695cb226",
"hash_input_tokens": "1027babc822bd1c5",
"hash_cont_tokens": "061b1b91fc518400"
},
"truncated": 0,
"non_truncated": 144,
"padded": 576,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_chemistry|5": {
"hashes": {
"hash_examples": "ce61a69c46d47aeb",
"hash_full_prompts": "3cac9b759fcff7a0",
"hash_input_tokens": "4fa05a1d43eaf942",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_computer_science|5": {
"hashes": {
"hash_examples": "32805b52d7d5daab",
"hash_full_prompts": "010b0cca35070130",
"hash_input_tokens": "c78e59615689a133",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_mathematics|5": {
"hashes": {
"hash_examples": "55da1a0a0bd33722",
"hash_full_prompts": "511422eb9eefc773",
"hash_input_tokens": "f62164431ea60a6d",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_medicine|5": {
"hashes": {
"hash_examples": "c33e143163049176",
"hash_full_prompts": "c8cc1a82a51a046e",
"hash_input_tokens": "f3ffc86e05b4abab",
"hash_cont_tokens": "96f7e09eeaf3577a"
},
"truncated": 0,
"non_truncated": 173,
"padded": 692,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_physics|5": {
"hashes": {
"hash_examples": "ebdab1cdb7e555df",
"hash_full_prompts": "e40721b5059c5818",
"hash_input_tokens": "bfec18d4c2cf7331",
"hash_cont_tokens": "fb74f245268780f7"
},
"truncated": 0,
"non_truncated": 102,
"padded": 408,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:computer_security|5": {
"hashes": {
"hash_examples": "a24fd7d08a560921",
"hash_full_prompts": "946c9be5964ac44a",
"hash_input_tokens": "acde342892ed7ff9",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:conceptual_physics|5": {
"hashes": {
"hash_examples": "8300977a79386993",
"hash_full_prompts": "506a4f6094cc40c9",
"hash_input_tokens": "524121d9ddd4bf6a",
"hash_cont_tokens": "1f4e1e92f33812b5"
},
"truncated": 0,
"non_truncated": 235,
"padded": 940,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:econometrics|5": {
"hashes": {
"hash_examples": "ddde36788a04a46f",
"hash_full_prompts": "4ed2703f27f1ed05",
"hash_input_tokens": "a1dedb7b847d6b19",
"hash_cont_tokens": "4060f7b36b2f4140"
},
"truncated": 0,
"non_truncated": 114,
"padded": 456,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:electrical_engineering|5": {
"hashes": {
"hash_examples": "acbc5def98c19b3f",
"hash_full_prompts": "d8f4b3e11c23653c",
"hash_input_tokens": "825d9d44e9ca39de",
"hash_cont_tokens": "06e5a56f4fae638e"
},
"truncated": 0,
"non_truncated": 145,
"padded": 580,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:elementary_mathematics|5": {
"hashes": {
"hash_examples": "146e61d07497a9bd",
"hash_full_prompts": "256d111bd15647ff",
"hash_input_tokens": "6ef65f0c30222cfa",
"hash_cont_tokens": "c3385a2b3ed50305"
},
"truncated": 0,
"non_truncated": 378,
"padded": 1506,
"non_padded": 6,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:formal_logic|5": {
"hashes": {
"hash_examples": "8635216e1909a03f",
"hash_full_prompts": "1171d04f3b1a11f5",
"hash_input_tokens": "1c20d8fd0dcd2b9e",
"hash_cont_tokens": "e259fd8e83f5eabe"
},
"truncated": 0,
"non_truncated": 126,
"padded": 504,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:global_facts|5": {
"hashes": {
"hash_examples": "30b315aa6353ee47",
"hash_full_prompts": "a7e56dbc074c7529",
"hash_input_tokens": "4fe2c5925dbc174a",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_biology|5": {
"hashes": {
"hash_examples": "c9136373af2180de",
"hash_full_prompts": "ad6e859ed978e04a",
"hash_input_tokens": "ea52b862d9b7af2b",
"hash_cont_tokens": "6251059b06be9d97"
},
"truncated": 0,
"non_truncated": 310,
"padded": 1236,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_chemistry|5": {
"hashes": {
"hash_examples": "b0661bfa1add6404",
"hash_full_prompts": "6eb9c04bcc8a8f2a",
"hash_input_tokens": "057712c3fd4fe5dc",
"hash_cont_tokens": "ce467bdc2825a0b2"
},
"truncated": 0,
"non_truncated": 203,
"padded": 808,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_computer_science|5": {
"hashes": {
"hash_examples": "80fc1d623a3d665f",
"hash_full_prompts": "8e51bc91c81cf8dd",
"hash_input_tokens": "efbba49c91a4a950",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_european_history|5": {
"hashes": {
"hash_examples": "854da6e5af0fe1a1",
"hash_full_prompts": "664a1f16c9f3195c",
"hash_input_tokens": "26218866485baf4e",
"hash_cont_tokens": "f02ad2401d7aa667"
},
"truncated": 0,
"non_truncated": 165,
"padded": 656,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_geography|5": {
"hashes": {
"hash_examples": "7dc963c7acd19ad8",
"hash_full_prompts": "f3acf911f4023c8a",
"hash_input_tokens": "56cbe629d18ead5d",
"hash_cont_tokens": "26d9256a0ab4eece"
},
"truncated": 0,
"non_truncated": 198,
"padded": 785,
"non_padded": 7,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"hashes": {
"hash_examples": "1f675dcdebc9758f",
"hash_full_prompts": "066254feaa3158ae",
"hash_input_tokens": "b86d166b67953159",
"hash_cont_tokens": "990c9084748f34ab"
},
"truncated": 0,
"non_truncated": 193,
"padded": 772,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"hashes": {
"hash_examples": "2fb32cf2d80f0b35",
"hash_full_prompts": "19a7fa502aa85c95",
"hash_input_tokens": "75b5836573d4418d",
"hash_cont_tokens": "76312c2ea8fc4f71"
},
"truncated": 0,
"non_truncated": 390,
"padded": 1552,
"non_padded": 8,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_mathematics|5": {
"hashes": {
"hash_examples": "fd6646fdb5d58a1f",
"hash_full_prompts": "4f704e369778b5b0",
"hash_input_tokens": "5bc4d612b64cb82b",
"hash_cont_tokens": "369a1a933960fad5"
},
"truncated": 0,
"non_truncated": 270,
"padded": 1048,
"non_padded": 32,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"hashes": {
"hash_examples": "2118f21f71d87d84",
"hash_full_prompts": "4350f9e2240f8010",
"hash_input_tokens": "f53ae91ff33dea98",
"hash_cont_tokens": "ce39343f06b04c0c"
},
"truncated": 0,
"non_truncated": 238,
"padded": 924,
"non_padded": 28,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_physics|5": {
"hashes": {
"hash_examples": "dc3ce06378548565",
"hash_full_prompts": "5dc0d6831b66188f",
"hash_input_tokens": "23181b0f6dc1e876",
"hash_cont_tokens": "34c4d04275713047"
},
"truncated": 0,
"non_truncated": 151,
"padded": 604,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_psychology|5": {
"hashes": {
"hash_examples": "c8d1d98a40e11f2f",
"hash_full_prompts": "af2b097da6d50365",
"hash_input_tokens": "1c03a6aa2ccd7497",
"hash_cont_tokens": "76367d535c896191"
},
"truncated": 0,
"non_truncated": 545,
"padded": 2176,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_statistics|5": {
"hashes": {
"hash_examples": "666c8759b98ee4ff",
"hash_full_prompts": "c757694421d6d68d",
"hash_input_tokens": "6c3d4c89ebb17624",
"hash_cont_tokens": "a076a9a5529c4701"
},
"truncated": 0,
"non_truncated": 216,
"padded": 864,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_us_history|5": {
"hashes": {
"hash_examples": "95fef1c4b7d3f81e",
"hash_full_prompts": "e34a028d0ddeec5e",
"hash_input_tokens": "3789cc86ffa04ee6",
"hash_cont_tokens": "ff9e65faaa6206d3"
},
"truncated": 0,
"non_truncated": 204,
"padded": 816,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_world_history|5": {
"hashes": {
"hash_examples": "7e5085b6184b0322",
"hash_full_prompts": "1fa3d51392765601",
"hash_input_tokens": "e2ade10b727cc567",
"hash_cont_tokens": "91d0b99f637c395d"
},
"truncated": 0,
"non_truncated": 237,
"padded": 948,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_aging|5": {
"hashes": {
"hash_examples": "c17333e7c7c10797",
"hash_full_prompts": "cac900721f9a1a94",
"hash_input_tokens": "e73491e153435aef",
"hash_cont_tokens": "503a59d0c8fd9fda"
},
"truncated": 0,
"non_truncated": 223,
"padded": 892,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_sexuality|5": {
"hashes": {
"hash_examples": "4edd1e9045df5e3d",
"hash_full_prompts": "0d6567bafee0a13c",
"hash_input_tokens": "5aac17e145c73388",
"hash_cont_tokens": "2ef8023fd099e328"
},
"truncated": 0,
"non_truncated": 131,
"padded": 524,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:international_law|5": {
"hashes": {
"hash_examples": "db2fa00d771a062a",
"hash_full_prompts": "d018f9116479795e",
"hash_input_tokens": "f823a8f71decfd4e",
"hash_cont_tokens": "1d135acf09cc77d7"
},
"truncated": 0,
"non_truncated": 121,
"padded": 484,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:jurisprudence|5": {
"hashes": {
"hash_examples": "e956f86b124076fe",
"hash_full_prompts": "1487e89a10ec58b7",
"hash_input_tokens": "1296fb073d25fdbd",
"hash_cont_tokens": "2bc5403ae73a42ee"
},
"truncated": 0,
"non_truncated": 108,
"padded": 420,
"non_padded": 12,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:logical_fallacies|5": {
"hashes": {
"hash_examples": "956e0e6365ab79f1",
"hash_full_prompts": "677785b2181f9243",
"hash_input_tokens": "5af73357ea4a33af",
"hash_cont_tokens": "343532d46d0dd784"
},
"truncated": 0,
"non_truncated": 163,
"padded": 648,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:machine_learning|5": {
"hashes": {
"hash_examples": "397997cc6f4d581e",
"hash_full_prompts": "769ee14a2aea49bb",
"hash_input_tokens": "36e7ee7692d6cb84",
"hash_cont_tokens": "ffa678813759b3dc"
},
"truncated": 0,
"non_truncated": 112,
"padded": 448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:management|5": {
"hashes": {
"hash_examples": "2bcbe6f6ca63d740",
"hash_full_prompts": "cb1ff9dac9582144",
"hash_input_tokens": "b6132d503a6b6fe1",
"hash_cont_tokens": "c86a690085fd954f"
},
"truncated": 0,
"non_truncated": 103,
"padded": 412,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:marketing|5": {
"hashes": {
"hash_examples": "8ddb20d964a1b065",
"hash_full_prompts": "9fc2114a187ad9a2",
"hash_input_tokens": "cea9c7023072b6de",
"hash_cont_tokens": "a59b0611811f3b0d"
},
"truncated": 0,
"non_truncated": 234,
"padded": 892,
"non_padded": 44,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:medical_genetics|5": {
"hashes": {
"hash_examples": "182a71f4763d2cea",
"hash_full_prompts": "46a616fa51878959",
"hash_input_tokens": "26a5d3aa11ad9928",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:miscellaneous|5": {
"hashes": {
"hash_examples": "4c404fdbb4ca57fc",
"hash_full_prompts": "0813e1be36dbaae1",
"hash_input_tokens": "9368b6cccdca09c2",
"hash_cont_tokens": "468b0141d91cbe14"
},
"truncated": 0,
"non_truncated": 783,
"padded": 3132,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_disputes|5": {
"hashes": {
"hash_examples": "60cbd2baa3fea5c9",
"hash_full_prompts": "1d14adebb9b62519",
"hash_input_tokens": "24ac7096ff952fb3",
"hash_cont_tokens": "5513cb6f0e3d0039"
},
"truncated": 0,
"non_truncated": 346,
"padded": 1380,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_scenarios|5": {
"hashes": {
"hash_examples": "fd8b0431fbdd75ef",
"hash_full_prompts": "b80d3d236165e3de",
"hash_input_tokens": "c388cdbaa92ff35d",
"hash_cont_tokens": "9fd58c2ac3e72795"
},
"truncated": 0,
"non_truncated": 895,
"padded": 3447,
"non_padded": 133,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:nutrition|5": {
"hashes": {
"hash_examples": "71e55e2b829b6528",
"hash_full_prompts": "2bfb18e5fab8dea7",
"hash_input_tokens": "61d681ff34503244",
"hash_cont_tokens": "3294c626ca103ea9"
},
"truncated": 0,
"non_truncated": 306,
"padded": 1224,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:philosophy|5": {
"hashes": {
"hash_examples": "a6d489a8d208fa4b",
"hash_full_prompts": "e8c0d5b6dae3ccc8",
"hash_input_tokens": "99dccba498573ce7",
"hash_cont_tokens": "84fab74fb9dbe7cc"
},
"truncated": 0,
"non_truncated": 311,
"padded": 1244,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:prehistory|5": {
"hashes": {
"hash_examples": "6cc50f032a19acaa",
"hash_full_prompts": "4a6a1d3ab1bf28e4",
"hash_input_tokens": "0b5b291e21537231",
"hash_cont_tokens": "65ac3d5bc3a7107a"
},
"truncated": 0,
"non_truncated": 324,
"padded": 1256,
"non_padded": 40,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_accounting|5": {
"hashes": {
"hash_examples": "50f57ab32f5f6cea",
"hash_full_prompts": "e60129bd2d82ffc6",
"hash_input_tokens": "dfdbfa66c3879e04",
"hash_cont_tokens": "3408ef87473c956e"
},
"truncated": 0,
"non_truncated": 282,
"padded": 1108,
"non_padded": 20,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_law|5": {
"hashes": {
"hash_examples": "a8fdc85c64f4b215",
"hash_full_prompts": "0dbb1d9b72dcea03",
"hash_input_tokens": "c5e40216c766fc5d",
"hash_cont_tokens": "ee53bac4bdeb7c6f"
},
"truncated": 0,
"non_truncated": 1534,
"padded": 6136,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_medicine|5": {
"hashes": {
"hash_examples": "c373a28a3050a73a",
"hash_full_prompts": "5e040f9ca68b089e",
"hash_input_tokens": "6bb2fb0a41e6e74a",
"hash_cont_tokens": "5ba90d13b887dd10"
},
"truncated": 0,
"non_truncated": 272,
"padded": 1088,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_psychology|5": {
"hashes": {
"hash_examples": "bf5254fe818356af",
"hash_full_prompts": "b386ecda8b87150e",
"hash_input_tokens": "d5882dcb2ba36239",
"hash_cont_tokens": "81d224ca3a7cd1f8"
},
"truncated": 0,
"non_truncated": 612,
"padded": 2448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:public_relations|5": {
"hashes": {
"hash_examples": "b66d52e28e7d14e0",
"hash_full_prompts": "fe43562263e25677",
"hash_input_tokens": "89e2f611b0e690d5",
"hash_cont_tokens": "97fc092d5801cddc"
},
"truncated": 0,
"non_truncated": 110,
"padded": 432,
"non_padded": 8,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:security_studies|5": {
"hashes": {
"hash_examples": "514c14feaf000ad9",
"hash_full_prompts": "27d4a2ac541ef4b9",
"hash_input_tokens": "da3c969306757935",
"hash_cont_tokens": "084e0267d5a5a853"
},
"truncated": 0,
"non_truncated": 245,
"padded": 980,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:sociology|5": {
"hashes": {
"hash_examples": "f6c9bc9d18c80870",
"hash_full_prompts": "c072ea7d1a1524f2",
"hash_input_tokens": "cba49ace6cf739f2",
"hash_cont_tokens": "c654d898bec36354"
},
"truncated": 0,
"non_truncated": 201,
"padded": 804,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:us_foreign_policy|5": {
"hashes": {
"hash_examples": "ed7b78629db6678f",
"hash_full_prompts": "341a97ca3e4d699d",
"hash_input_tokens": "3f5014206033d5c8",
"hash_cont_tokens": "b057f3f1d84e35e5"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:virology|5": {
"hashes": {
"hash_examples": "bc52ffdc3f9b994a",
"hash_full_prompts": "651d471e2eb8b5e9",
"hash_input_tokens": "21ca3e01eff77c5a",
"hash_cont_tokens": "648f351cffb42342"
},
"truncated": 0,
"non_truncated": 166,
"padded": 664,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:world_religions|5": {
"hashes": {
"hash_examples": "ecdb4a4f94f62930",
"hash_full_prompts": "3773f03542ce44a3",
"hash_input_tokens": "7904d43bc25add76",
"hash_cont_tokens": "6cee5406c3f21c2e"
},
"truncated": 0,
"non_truncated": 171,
"padded": 684,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "341a076d0beb7048",
"hash_full_prompts": "a5c8f2b7ff4f5ae2",
"hash_input_tokens": "74ed818d997ed10b",
"hash_cont_tokens": "edf1783519f209b9"
},
"truncated": 0,
"non_truncated": 14042,
"padded": 55806,
"non_padded": 362,
"num_truncated_few_shots": 0
}
}