open-r1-eval-leaderboard
/
eval_results
/NousResearch
/Nous-Hermes-2-Yi-34B
/main
/mmlu
/results_2024-03-04T23-01-23.312414.json
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 897264.124981064, | |
"end_time": 906432.519028163, | |
"total_evaluation_time_secondes": "9168.39404709905", | |
"model_name": "NousResearch/Nous-Hermes-2-Yi-34B", | |
"model_sha": "fcb0a8847e76aea14aba9aa44009d4418ad7c18f", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "64.17 GB", | |
"config": null | |
}, | |
"results": { | |
"lighteval|mmlu:abstract_algebra|5": { | |
"acc": 0.48, | |
"acc_stderr": 0.050211673156867795 | |
}, | |
"lighteval|mmlu:anatomy|5": { | |
"acc": 0.725925925925926, | |
"acc_stderr": 0.03853254836552003 | |
}, | |
"lighteval|mmlu:astronomy|5": { | |
"acc": 0.881578947368421, | |
"acc_stderr": 0.026293995855474928 | |
}, | |
"lighteval|mmlu:business_ethics|5": { | |
"acc": 0.78, | |
"acc_stderr": 0.04163331998932261 | |
}, | |
"lighteval|mmlu:clinical_knowledge|5": { | |
"acc": 0.8037735849056604, | |
"acc_stderr": 0.024442388131100817 | |
}, | |
"lighteval|mmlu:college_biology|5": { | |
"acc": 0.875, | |
"acc_stderr": 0.02765610492929436 | |
}, | |
"lighteval|mmlu:college_chemistry|5": { | |
"acc": 0.52, | |
"acc_stderr": 0.050211673156867795 | |
}, | |
"lighteval|mmlu:college_computer_science|5": { | |
"acc": 0.65, | |
"acc_stderr": 0.04793724854411018 | |
}, | |
"lighteval|mmlu:college_mathematics|5": { | |
"acc": 0.48, | |
"acc_stderr": 0.05021167315686779 | |
}, | |
"lighteval|mmlu:college_medicine|5": { | |
"acc": 0.7225433526011561, | |
"acc_stderr": 0.03414014007044036 | |
}, | |
"lighteval|mmlu:college_physics|5": { | |
"acc": 0.5, | |
"acc_stderr": 0.04975185951049946 | |
}, | |
"lighteval|mmlu:computer_security|5": { | |
"acc": 0.8, | |
"acc_stderr": 0.04020151261036845 | |
}, | |
"lighteval|mmlu:conceptual_physics|5": { | |
"acc": 0.7659574468085106, | |
"acc_stderr": 0.027678452578212387 | |
}, | |
"lighteval|mmlu:econometrics|5": { | |
"acc": 0.5877192982456141, | |
"acc_stderr": 0.04630653203366596 | |
}, | |
"lighteval|mmlu:electrical_engineering|5": { | |
"acc": 0.7655172413793103, | |
"acc_stderr": 0.035306258743465914 | |
}, | |
"lighteval|mmlu:elementary_mathematics|5": { | |
"acc": 0.6851851851851852, | |
"acc_stderr": 0.023919984164047736 | |
}, | |
"lighteval|mmlu:formal_logic|5": { | |
"acc": 0.5793650793650794, | |
"acc_stderr": 0.04415438226743745 | |
}, | |
"lighteval|mmlu:global_facts|5": { | |
"acc": 0.51, | |
"acc_stderr": 0.05024183937956911 | |
}, | |
"lighteval|mmlu:high_school_biology|5": { | |
"acc": 0.9064516129032258, | |
"acc_stderr": 0.01656575466827098 | |
}, | |
"lighteval|mmlu:high_school_chemistry|5": { | |
"acc": 0.6354679802955665, | |
"acc_stderr": 0.0338640574606209 | |
}, | |
"lighteval|mmlu:high_school_computer_science|5": { | |
"acc": 0.82, | |
"acc_stderr": 0.03861229196653694 | |
}, | |
"lighteval|mmlu:high_school_european_history|5": { | |
"acc": 0.8606060606060606, | |
"acc_stderr": 0.0270459488258654 | |
}, | |
"lighteval|mmlu:high_school_geography|5": { | |
"acc": 0.9292929292929293, | |
"acc_stderr": 0.01826310542019949 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics|5": { | |
"acc": 0.9689119170984456, | |
"acc_stderr": 0.012525310625527033 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics|5": { | |
"acc": 0.8102564102564103, | |
"acc_stderr": 0.019880165406588796 | |
}, | |
"lighteval|mmlu:high_school_mathematics|5": { | |
"acc": 0.4, | |
"acc_stderr": 0.0298696050953169 | |
}, | |
"lighteval|mmlu:high_school_microeconomics|5": { | |
"acc": 0.8571428571428571, | |
"acc_stderr": 0.02273020811930655 | |
}, | |
"lighteval|mmlu:high_school_physics|5": { | |
"acc": 0.5099337748344371, | |
"acc_stderr": 0.04081677107248436 | |
}, | |
"lighteval|mmlu:high_school_psychology|5": { | |
"acc": 0.9192660550458716, | |
"acc_stderr": 0.011680172292862088 | |
}, | |
"lighteval|mmlu:high_school_statistics|5": { | |
"acc": 0.6435185185185185, | |
"acc_stderr": 0.032664783315272714 | |
}, | |
"lighteval|mmlu:high_school_us_history|5": { | |
"acc": 0.9215686274509803, | |
"acc_stderr": 0.018869514646658928 | |
}, | |
"lighteval|mmlu:high_school_world_history|5": { | |
"acc": 0.9113924050632911, | |
"acc_stderr": 0.018498315206865384 | |
}, | |
"lighteval|mmlu:human_aging|5": { | |
"acc": 0.8026905829596412, | |
"acc_stderr": 0.02670985334496796 | |
}, | |
"lighteval|mmlu:human_sexuality|5": { | |
"acc": 0.8702290076335878, | |
"acc_stderr": 0.029473649496907065 | |
}, | |
"lighteval|mmlu:international_law|5": { | |
"acc": 0.9008264462809917, | |
"acc_stderr": 0.027285246312758957 | |
}, | |
"lighteval|mmlu:jurisprudence|5": { | |
"acc": 0.8796296296296297, | |
"acc_stderr": 0.031457038543062504 | |
}, | |
"lighteval|mmlu:logical_fallacies|5": { | |
"acc": 0.8466257668711656, | |
"acc_stderr": 0.028311601441438603 | |
}, | |
"lighteval|mmlu:machine_learning|5": { | |
"acc": 0.5803571428571429, | |
"acc_stderr": 0.04684099321077106 | |
}, | |
"lighteval|mmlu:management|5": { | |
"acc": 0.9029126213592233, | |
"acc_stderr": 0.02931596291881348 | |
}, | |
"lighteval|mmlu:marketing|5": { | |
"acc": 0.9273504273504274, | |
"acc_stderr": 0.017004368568132342 | |
}, | |
"lighteval|mmlu:medical_genetics|5": { | |
"acc": 0.89, | |
"acc_stderr": 0.03144660377352202 | |
}, | |
"lighteval|mmlu:miscellaneous|5": { | |
"acc": 0.9042145593869731, | |
"acc_stderr": 0.01052403107905583 | |
}, | |
"lighteval|mmlu:moral_disputes|5": { | |
"acc": 0.8179190751445087, | |
"acc_stderr": 0.02077676110251298 | |
}, | |
"lighteval|mmlu:moral_scenarios|5": { | |
"acc": 0.7508379888268156, | |
"acc_stderr": 0.014465893829859926 | |
}, | |
"lighteval|mmlu:nutrition|5": { | |
"acc": 0.8366013071895425, | |
"acc_stderr": 0.021170623011213502 | |
}, | |
"lighteval|mmlu:philosophy|5": { | |
"acc": 0.8167202572347267, | |
"acc_stderr": 0.02197419884826582 | |
}, | |
"lighteval|mmlu:prehistory|5": { | |
"acc": 0.8734567901234568, | |
"acc_stderr": 0.01849860055879091 | |
}, | |
"lighteval|mmlu:professional_accounting|5": { | |
"acc": 0.6347517730496454, | |
"acc_stderr": 0.02872386385328127 | |
}, | |
"lighteval|mmlu:professional_law|5": { | |
"acc": 0.6153846153846154, | |
"acc_stderr": 0.012425548416302952 | |
}, | |
"lighteval|mmlu:professional_medicine|5": { | |
"acc": 0.8382352941176471, | |
"acc_stderr": 0.02236867256288675 | |
}, | |
"lighteval|mmlu:professional_psychology|5": { | |
"acc": 0.8186274509803921, | |
"acc_stderr": 0.015588643495370464 | |
}, | |
"lighteval|mmlu:public_relations|5": { | |
"acc": 0.6818181818181818, | |
"acc_stderr": 0.04461272175910509 | |
}, | |
"lighteval|mmlu:security_studies|5": { | |
"acc": 0.8244897959183674, | |
"acc_stderr": 0.024352800722970015 | |
}, | |
"lighteval|mmlu:sociology|5": { | |
"acc": 0.8805970149253731, | |
"acc_stderr": 0.02292879327721974 | |
}, | |
"lighteval|mmlu:us_foreign_policy|5": { | |
"acc": 0.9, | |
"acc_stderr": 0.030151134457776334 | |
}, | |
"lighteval|mmlu:virology|5": { | |
"acc": 0.5843373493975904, | |
"acc_stderr": 0.03836722176598053 | |
}, | |
"lighteval|mmlu:world_religions|5": { | |
"acc": 0.8713450292397661, | |
"acc_stderr": 0.025679342723276908 | |
}, | |
"lighteval|mmlu:_average|5": { | |
"acc": 0.7623918125960152, | |
"acc_stderr": 0.029669679997188628 | |
} | |
}, | |
"versions": { | |
"lighteval|mmlu:abstract_algebra|5": 0, | |
"lighteval|mmlu:anatomy|5": 0, | |
"lighteval|mmlu:astronomy|5": 0, | |
"lighteval|mmlu:business_ethics|5": 0, | |
"lighteval|mmlu:clinical_knowledge|5": 0, | |
"lighteval|mmlu:college_biology|5": 0, | |
"lighteval|mmlu:college_chemistry|5": 0, | |
"lighteval|mmlu:college_computer_science|5": 0, | |
"lighteval|mmlu:college_mathematics|5": 0, | |
"lighteval|mmlu:college_medicine|5": 0, | |
"lighteval|mmlu:college_physics|5": 0, | |
"lighteval|mmlu:computer_security|5": 0, | |
"lighteval|mmlu:conceptual_physics|5": 0, | |
"lighteval|mmlu:econometrics|5": 0, | |
"lighteval|mmlu:electrical_engineering|5": 0, | |
"lighteval|mmlu:elementary_mathematics|5": 0, | |
"lighteval|mmlu:formal_logic|5": 0, | |
"lighteval|mmlu:global_facts|5": 0, | |
"lighteval|mmlu:high_school_biology|5": 0, | |
"lighteval|mmlu:high_school_chemistry|5": 0, | |
"lighteval|mmlu:high_school_computer_science|5": 0, | |
"lighteval|mmlu:high_school_european_history|5": 0, | |
"lighteval|mmlu:high_school_geography|5": 0, | |
"lighteval|mmlu:high_school_government_and_politics|5": 0, | |
"lighteval|mmlu:high_school_macroeconomics|5": 0, | |
"lighteval|mmlu:high_school_mathematics|5": 0, | |
"lighteval|mmlu:high_school_microeconomics|5": 0, | |
"lighteval|mmlu:high_school_physics|5": 0, | |
"lighteval|mmlu:high_school_psychology|5": 0, | |
"lighteval|mmlu:high_school_statistics|5": 0, | |
"lighteval|mmlu:high_school_us_history|5": 0, | |
"lighteval|mmlu:high_school_world_history|5": 0, | |
"lighteval|mmlu:human_aging|5": 0, | |
"lighteval|mmlu:human_sexuality|5": 0, | |
"lighteval|mmlu:international_law|5": 0, | |
"lighteval|mmlu:jurisprudence|5": 0, | |
"lighteval|mmlu:logical_fallacies|5": 0, | |
"lighteval|mmlu:machine_learning|5": 0, | |
"lighteval|mmlu:management|5": 0, | |
"lighteval|mmlu:marketing|5": 0, | |
"lighteval|mmlu:medical_genetics|5": 0, | |
"lighteval|mmlu:miscellaneous|5": 0, | |
"lighteval|mmlu:moral_disputes|5": 0, | |
"lighteval|mmlu:moral_scenarios|5": 0, | |
"lighteval|mmlu:nutrition|5": 0, | |
"lighteval|mmlu:philosophy|5": 0, | |
"lighteval|mmlu:prehistory|5": 0, | |
"lighteval|mmlu:professional_accounting|5": 0, | |
"lighteval|mmlu:professional_law|5": 0, | |
"lighteval|mmlu:professional_medicine|5": 0, | |
"lighteval|mmlu:professional_psychology|5": 0, | |
"lighteval|mmlu:public_relations|5": 0, | |
"lighteval|mmlu:security_studies|5": 0, | |
"lighteval|mmlu:sociology|5": 0, | |
"lighteval|mmlu:us_foreign_policy|5": 0, | |
"lighteval|mmlu:virology|5": 0, | |
"lighteval|mmlu:world_religions|5": 0 | |
}, | |
"config_tasks": { | |
"lighteval|mmlu:abstract_algebra": { | |
"name": "mmlu:abstract_algebra", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "abstract_algebra", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:anatomy": { | |
"name": "mmlu:anatomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "anatomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 135, | |
"effective_num_docs": 135 | |
}, | |
"lighteval|mmlu:astronomy": { | |
"name": "mmlu:astronomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "astronomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 152, | |
"effective_num_docs": 152 | |
}, | |
"lighteval|mmlu:business_ethics": { | |
"name": "mmlu:business_ethics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "business_ethics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:clinical_knowledge": { | |
"name": "mmlu:clinical_knowledge", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "clinical_knowledge", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 265, | |
"effective_num_docs": 265 | |
}, | |
"lighteval|mmlu:college_biology": { | |
"name": "mmlu:college_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 144, | |
"effective_num_docs": 144 | |
}, | |
"lighteval|mmlu:college_chemistry": { | |
"name": "mmlu:college_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_computer_science": { | |
"name": "mmlu:college_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_mathematics": { | |
"name": "mmlu:college_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_medicine": { | |
"name": "mmlu:college_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 173, | |
"effective_num_docs": 173 | |
}, | |
"lighteval|mmlu:college_physics": { | |
"name": "mmlu:college_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 102, | |
"effective_num_docs": 102 | |
}, | |
"lighteval|mmlu:computer_security": { | |
"name": "mmlu:computer_security", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "computer_security", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:conceptual_physics": { | |
"name": "mmlu:conceptual_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "conceptual_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 235, | |
"effective_num_docs": 235 | |
}, | |
"lighteval|mmlu:econometrics": { | |
"name": "mmlu:econometrics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "econometrics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 114, | |
"effective_num_docs": 114 | |
}, | |
"lighteval|mmlu:electrical_engineering": { | |
"name": "mmlu:electrical_engineering", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "electrical_engineering", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 145, | |
"effective_num_docs": 145 | |
}, | |
"lighteval|mmlu:elementary_mathematics": { | |
"name": "mmlu:elementary_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "elementary_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 378, | |
"effective_num_docs": 378 | |
}, | |
"lighteval|mmlu:formal_logic": { | |
"name": "mmlu:formal_logic", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "formal_logic", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 126, | |
"effective_num_docs": 126 | |
}, | |
"lighteval|mmlu:global_facts": { | |
"name": "mmlu:global_facts", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "global_facts", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:high_school_biology": { | |
"name": "mmlu:high_school_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 310, | |
"effective_num_docs": 310 | |
}, | |
"lighteval|mmlu:high_school_chemistry": { | |
"name": "mmlu:high_school_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 203, | |
"effective_num_docs": 203 | |
}, | |
"lighteval|mmlu:high_school_computer_science": { | |
"name": "mmlu:high_school_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:high_school_european_history": { | |
"name": "mmlu:high_school_european_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_european_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 165, | |
"effective_num_docs": 165 | |
}, | |
"lighteval|mmlu:high_school_geography": { | |
"name": "mmlu:high_school_geography", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_geography", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 198, | |
"effective_num_docs": 198 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics": { | |
"name": "mmlu:high_school_government_and_politics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_government_and_politics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 193, | |
"effective_num_docs": 193 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics": { | |
"name": "mmlu:high_school_macroeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_macroeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 390, | |
"effective_num_docs": 390 | |
}, | |
"lighteval|mmlu:high_school_mathematics": { | |
"name": "mmlu:high_school_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 270, | |
"effective_num_docs": 270 | |
}, | |
"lighteval|mmlu:high_school_microeconomics": { | |
"name": "mmlu:high_school_microeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_microeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 238, | |
"effective_num_docs": 238 | |
}, | |
"lighteval|mmlu:high_school_physics": { | |
"name": "mmlu:high_school_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 151, | |
"effective_num_docs": 151 | |
}, | |
"lighteval|mmlu:high_school_psychology": { | |
"name": "mmlu:high_school_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 545, | |
"effective_num_docs": 545 | |
}, | |
"lighteval|mmlu:high_school_statistics": { | |
"name": "mmlu:high_school_statistics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_statistics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 216, | |
"effective_num_docs": 216 | |
}, | |
"lighteval|mmlu:high_school_us_history": { | |
"name": "mmlu:high_school_us_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_us_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 204, | |
"effective_num_docs": 204 | |
}, | |
"lighteval|mmlu:high_school_world_history": { | |
"name": "mmlu:high_school_world_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_world_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 237, | |
"effective_num_docs": 237 | |
}, | |
"lighteval|mmlu:human_aging": { | |
"name": "mmlu:human_aging", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_aging", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 223, | |
"effective_num_docs": 223 | |
}, | |
"lighteval|mmlu:human_sexuality": { | |
"name": "mmlu:human_sexuality", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_sexuality", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 131, | |
"effective_num_docs": 131 | |
}, | |
"lighteval|mmlu:international_law": { | |
"name": "mmlu:international_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "international_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 121, | |
"effective_num_docs": 121 | |
}, | |
"lighteval|mmlu:jurisprudence": { | |
"name": "mmlu:jurisprudence", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "jurisprudence", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 108, | |
"effective_num_docs": 108 | |
}, | |
"lighteval|mmlu:logical_fallacies": { | |
"name": "mmlu:logical_fallacies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "logical_fallacies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 163, | |
"effective_num_docs": 163 | |
}, | |
"lighteval|mmlu:machine_learning": { | |
"name": "mmlu:machine_learning", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "machine_learning", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 112, | |
"effective_num_docs": 112 | |
}, | |
"lighteval|mmlu:management": { | |
"name": "mmlu:management", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "management", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 103, | |
"effective_num_docs": 103 | |
}, | |
"lighteval|mmlu:marketing": { | |
"name": "mmlu:marketing", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "marketing", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 234, | |
"effective_num_docs": 234 | |
}, | |
"lighteval|mmlu:medical_genetics": { | |
"name": "mmlu:medical_genetics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "medical_genetics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:miscellaneous": { | |
"name": "mmlu:miscellaneous", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "miscellaneous", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 783, | |
"effective_num_docs": 783 | |
}, | |
"lighteval|mmlu:moral_disputes": { | |
"name": "mmlu:moral_disputes", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_disputes", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 346, | |
"effective_num_docs": 346 | |
}, | |
"lighteval|mmlu:moral_scenarios": { | |
"name": "mmlu:moral_scenarios", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_scenarios", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 895, | |
"effective_num_docs": 895 | |
}, | |
"lighteval|mmlu:nutrition": { | |
"name": "mmlu:nutrition", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "nutrition", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 306, | |
"effective_num_docs": 306 | |
}, | |
"lighteval|mmlu:philosophy": { | |
"name": "mmlu:philosophy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "philosophy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 311, | |
"effective_num_docs": 311 | |
}, | |
"lighteval|mmlu:prehistory": { | |
"name": "mmlu:prehistory", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "prehistory", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 324, | |
"effective_num_docs": 324 | |
}, | |
"lighteval|mmlu:professional_accounting": { | |
"name": "mmlu:professional_accounting", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_accounting", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 282, | |
"effective_num_docs": 282 | |
}, | |
"lighteval|mmlu:professional_law": { | |
"name": "mmlu:professional_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 1534, | |
"effective_num_docs": 1534 | |
}, | |
"lighteval|mmlu:professional_medicine": { | |
"name": "mmlu:professional_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 272, | |
"effective_num_docs": 272 | |
}, | |
"lighteval|mmlu:professional_psychology": { | |
"name": "mmlu:professional_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 612, | |
"effective_num_docs": 612 | |
}, | |
"lighteval|mmlu:public_relations": { | |
"name": "mmlu:public_relations", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "public_relations", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 110, | |
"effective_num_docs": 110 | |
}, | |
"lighteval|mmlu:security_studies": { | |
"name": "mmlu:security_studies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "security_studies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 245, | |
"effective_num_docs": 245 | |
}, | |
"lighteval|mmlu:sociology": { | |
"name": "mmlu:sociology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "sociology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 201, | |
"effective_num_docs": 201 | |
}, | |
"lighteval|mmlu:us_foreign_policy": { | |
"name": "mmlu:us_foreign_policy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "us_foreign_policy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:virology": { | |
"name": "mmlu:virology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "virology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 166, | |
"effective_num_docs": 166 | |
}, | |
"lighteval|mmlu:world_religions": { | |
"name": "mmlu:world_religions", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "world_religions", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 171, | |
"effective_num_docs": 171 | |
} | |
}, | |
"summary_tasks": { | |
"lighteval|mmlu:abstract_algebra|5": { | |
"hashes": { | |
"hash_examples": "4c76229e00c9c0e9", | |
"hash_full_prompts": "a45d01c3409c889c", | |
"hash_input_tokens": "1945c492b526866d", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:anatomy|5": { | |
"hashes": { | |
"hash_examples": "6a1f8104dccbd33b", | |
"hash_full_prompts": "e245c6600e03cc32", | |
"hash_input_tokens": "d5999a9b22e7a381", | |
"hash_cont_tokens": "2d52999077293385" | |
}, | |
"truncated": 0, | |
"non_truncated": 135, | |
"padded": 540, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:astronomy|5": { | |
"hashes": { | |
"hash_examples": "1302effa3a76ce4c", | |
"hash_full_prompts": "390f9bddf857ad04", | |
"hash_input_tokens": "a90d69fd0eddac41", | |
"hash_cont_tokens": "3fd89b4e35168d48" | |
}, | |
"truncated": 0, | |
"non_truncated": 152, | |
"padded": 608, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:business_ethics|5": { | |
"hashes": { | |
"hash_examples": "03cb8bce5336419a", | |
"hash_full_prompts": "5504f893bc4f2fa1", | |
"hash_input_tokens": "e7a9ad62edba223f", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:clinical_knowledge|5": { | |
"hashes": { | |
"hash_examples": "ffbb9c7b2be257f9", | |
"hash_full_prompts": "106ad0bab4b90b78", | |
"hash_input_tokens": "763edde9c7c23212", | |
"hash_cont_tokens": "1c0bb7de54326eaf" | |
}, | |
"truncated": 0, | |
"non_truncated": 265, | |
"padded": 1037, | |
"non_padded": 23, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_biology|5": { | |
"hashes": { | |
"hash_examples": "3ee77f176f38eb8e", | |
"hash_full_prompts": "59f9bdf2695cb226", | |
"hash_input_tokens": "73aecd9276c2a74d", | |
"hash_cont_tokens": "947f385de600b02a" | |
}, | |
"truncated": 0, | |
"non_truncated": 144, | |
"padded": 574, | |
"non_padded": 2, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_chemistry|5": { | |
"hashes": { | |
"hash_examples": "ce61a69c46d47aeb", | |
"hash_full_prompts": "3cac9b759fcff7a0", | |
"hash_input_tokens": "b43125b896c4c922", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_computer_science|5": { | |
"hashes": { | |
"hash_examples": "32805b52d7d5daab", | |
"hash_full_prompts": "010b0cca35070130", | |
"hash_input_tokens": "95b70783fcf1373f", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_mathematics|5": { | |
"hashes": { | |
"hash_examples": "55da1a0a0bd33722", | |
"hash_full_prompts": "511422eb9eefc773", | |
"hash_input_tokens": "49c3595ef75fa9db", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_medicine|5": { | |
"hashes": { | |
"hash_examples": "c33e143163049176", | |
"hash_full_prompts": "c8cc1a82a51a046e", | |
"hash_input_tokens": "c9720d485ad65de8", | |
"hash_cont_tokens": "011ffdbf01ee74b7" | |
}, | |
"truncated": 0, | |
"non_truncated": 173, | |
"padded": 688, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_physics|5": { | |
"hashes": { | |
"hash_examples": "ebdab1cdb7e555df", | |
"hash_full_prompts": "e40721b5059c5818", | |
"hash_input_tokens": "4e32a735b7a516f4", | |
"hash_cont_tokens": "d84312ae75931ffc" | |
}, | |
"truncated": 0, | |
"non_truncated": 102, | |
"padded": 408, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:computer_security|5": { | |
"hashes": { | |
"hash_examples": "a24fd7d08a560921", | |
"hash_full_prompts": "946c9be5964ac44a", | |
"hash_input_tokens": "6fee63c2fbaeedd3", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:conceptual_physics|5": { | |
"hashes": { | |
"hash_examples": "8300977a79386993", | |
"hash_full_prompts": "506a4f6094cc40c9", | |
"hash_input_tokens": "d16a6654fba45bfa", | |
"hash_cont_tokens": "f1b24cc6b5fb0c71" | |
}, | |
"truncated": 0, | |
"non_truncated": 235, | |
"padded": 940, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:econometrics|5": { | |
"hashes": { | |
"hash_examples": "ddde36788a04a46f", | |
"hash_full_prompts": "4ed2703f27f1ed05", | |
"hash_input_tokens": "f1c2e0ce1ee9c28a", | |
"hash_cont_tokens": "d2c3ef177ec4e940" | |
}, | |
"truncated": 0, | |
"non_truncated": 114, | |
"padded": 456, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:electrical_engineering|5": { | |
"hashes": { | |
"hash_examples": "acbc5def98c19b3f", | |
"hash_full_prompts": "d8f4b3e11c23653c", | |
"hash_input_tokens": "2f62e24e97f12fd2", | |
"hash_cont_tokens": "8c1fb73e4ea149d0" | |
}, | |
"truncated": 0, | |
"non_truncated": 145, | |
"padded": 580, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:elementary_mathematics|5": { | |
"hashes": { | |
"hash_examples": "146e61d07497a9bd", | |
"hash_full_prompts": "256d111bd15647ff", | |
"hash_input_tokens": "5245d6d0d8508382", | |
"hash_cont_tokens": "e00048455700e000" | |
}, | |
"truncated": 0, | |
"non_truncated": 378, | |
"padded": 1492, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:formal_logic|5": { | |
"hashes": { | |
"hash_examples": "8635216e1909a03f", | |
"hash_full_prompts": "1171d04f3b1a11f5", | |
"hash_input_tokens": "faeb1049b8c59008", | |
"hash_cont_tokens": "e51151ca7f0bc830" | |
}, | |
"truncated": 0, | |
"non_truncated": 126, | |
"padded": 504, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:global_facts|5": { | |
"hashes": { | |
"hash_examples": "30b315aa6353ee47", | |
"hash_full_prompts": "a7e56dbc074c7529", | |
"hash_input_tokens": "d6b00d70bf1bd244", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_biology|5": { | |
"hashes": { | |
"hash_examples": "c9136373af2180de", | |
"hash_full_prompts": "ad6e859ed978e04a", | |
"hash_input_tokens": "32a9d38ed6695b72", | |
"hash_cont_tokens": "6f36e5837c95585d" | |
}, | |
"truncated": 0, | |
"non_truncated": 310, | |
"padded": 1228, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_chemistry|5": { | |
"hashes": { | |
"hash_examples": "b0661bfa1add6404", | |
"hash_full_prompts": "6eb9c04bcc8a8f2a", | |
"hash_input_tokens": "593b041619a73d50", | |
"hash_cont_tokens": "8cc3c718e7d51318" | |
}, | |
"truncated": 0, | |
"non_truncated": 203, | |
"padded": 808, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_computer_science|5": { | |
"hashes": { | |
"hash_examples": "80fc1d623a3d665f", | |
"hash_full_prompts": "8e51bc91c81cf8dd", | |
"hash_input_tokens": "34b406347368abcf", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_european_history|5": { | |
"hashes": { | |
"hash_examples": "854da6e5af0fe1a1", | |
"hash_full_prompts": "664a1f16c9f3195c", | |
"hash_input_tokens": "661a32a554c10b97", | |
"hash_cont_tokens": "007ea4467a6019c4" | |
}, | |
"truncated": 0, | |
"non_truncated": 165, | |
"padded": 656, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_geography|5": { | |
"hashes": { | |
"hash_examples": "7dc963c7acd19ad8", | |
"hash_full_prompts": "f3acf911f4023c8a", | |
"hash_input_tokens": "bb416820390b545d", | |
"hash_cont_tokens": "4f1fa1464eff651d" | |
}, | |
"truncated": 0, | |
"non_truncated": 198, | |
"padded": 784, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics|5": { | |
"hashes": { | |
"hash_examples": "1f675dcdebc9758f", | |
"hash_full_prompts": "066254feaa3158ae", | |
"hash_input_tokens": "ad7e9d3075dfe057", | |
"hash_cont_tokens": "43d162268d47b8b4" | |
}, | |
"truncated": 0, | |
"non_truncated": 193, | |
"padded": 772, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics|5": { | |
"hashes": { | |
"hash_examples": "2fb32cf2d80f0b35", | |
"hash_full_prompts": "19a7fa502aa85c95", | |
"hash_input_tokens": "c424a600fe403098", | |
"hash_cont_tokens": "a6bea83d9ce59980" | |
}, | |
"truncated": 0, | |
"non_truncated": 390, | |
"padded": 1560, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_mathematics|5": { | |
"hashes": { | |
"hash_examples": "fd6646fdb5d58a1f", | |
"hash_full_prompts": "4f704e369778b5b0", | |
"hash_input_tokens": "bb4755a7e81cb33b", | |
"hash_cont_tokens": "2fd6c3a9ad765f25" | |
}, | |
"truncated": 0, | |
"non_truncated": 270, | |
"padded": 1060, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_microeconomics|5": { | |
"hashes": { | |
"hash_examples": "2118f21f71d87d84", | |
"hash_full_prompts": "4350f9e2240f8010", | |
"hash_input_tokens": "9a4e27cced827307", | |
"hash_cont_tokens": "fd181377bfcfc472" | |
}, | |
"truncated": 0, | |
"non_truncated": 238, | |
"padded": 952, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_physics|5": { | |
"hashes": { | |
"hash_examples": "dc3ce06378548565", | |
"hash_full_prompts": "5dc0d6831b66188f", | |
"hash_input_tokens": "2fabf7629a526a8f", | |
"hash_cont_tokens": "26903a0b1dfcdc43" | |
}, | |
"truncated": 0, | |
"non_truncated": 151, | |
"padded": 600, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_psychology|5": { | |
"hashes": { | |
"hash_examples": "c8d1d98a40e11f2f", | |
"hash_full_prompts": "af2b097da6d50365", | |
"hash_input_tokens": "cd3db6878121ee65", | |
"hash_cont_tokens": "8aa690c61d503a2f" | |
}, | |
"truncated": 0, | |
"non_truncated": 545, | |
"padded": 2180, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_statistics|5": { | |
"hashes": { | |
"hash_examples": "666c8759b98ee4ff", | |
"hash_full_prompts": "c757694421d6d68d", | |
"hash_input_tokens": "f9f29d7b93155249", | |
"hash_cont_tokens": "6b4ee10d4a80d543" | |
}, | |
"truncated": 0, | |
"non_truncated": 216, | |
"padded": 864, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_us_history|5": { | |
"hashes": { | |
"hash_examples": "95fef1c4b7d3f81e", | |
"hash_full_prompts": "e34a028d0ddeec5e", | |
"hash_input_tokens": "6edd2936c78ba724", | |
"hash_cont_tokens": "ddd67f68971dba1b" | |
}, | |
"truncated": 0, | |
"non_truncated": 204, | |
"padded": 816, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_world_history|5": { | |
"hashes": { | |
"hash_examples": "7e5085b6184b0322", | |
"hash_full_prompts": "1fa3d51392765601", | |
"hash_input_tokens": "4596a58728db47dd", | |
"hash_cont_tokens": "7c38999d46e6eb66" | |
}, | |
"truncated": 0, | |
"non_truncated": 237, | |
"padded": 948, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:human_aging|5": { | |
"hashes": { | |
"hash_examples": "c17333e7c7c10797", | |
"hash_full_prompts": "cac900721f9a1a94", | |
"hash_input_tokens": "8de221eee408e93d", | |
"hash_cont_tokens": "2c726e3946356403" | |
}, | |
"truncated": 0, | |
"non_truncated": 223, | |
"padded": 892, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:human_sexuality|5": { | |
"hashes": { | |
"hash_examples": "4edd1e9045df5e3d", | |
"hash_full_prompts": "0d6567bafee0a13c", | |
"hash_input_tokens": "89d04121bcb9c324", | |
"hash_cont_tokens": "85bf16b838b7e234" | |
}, | |
"truncated": 0, | |
"non_truncated": 131, | |
"padded": 524, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:international_law|5": { | |
"hashes": { | |
"hash_examples": "db2fa00d771a062a", | |
"hash_full_prompts": "d018f9116479795e", | |
"hash_input_tokens": "771f76b9867d2e3c", | |
"hash_cont_tokens": "df1e130b3147678a" | |
}, | |
"truncated": 0, | |
"non_truncated": 121, | |
"padded": 484, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:jurisprudence|5": { | |
"hashes": { | |
"hash_examples": "e956f86b124076fe", | |
"hash_full_prompts": "1487e89a10ec58b7", | |
"hash_input_tokens": "d9f161d325f9184c", | |
"hash_cont_tokens": "2ef1ed2cbe4c3487" | |
}, | |
"truncated": 0, | |
"non_truncated": 108, | |
"padded": 432, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:logical_fallacies|5": { | |
"hashes": { | |
"hash_examples": "956e0e6365ab79f1", | |
"hash_full_prompts": "677785b2181f9243", | |
"hash_input_tokens": "6d07620c2b75bfc4", | |
"hash_cont_tokens": "1f573f46f46a469f" | |
}, | |
"truncated": 0, | |
"non_truncated": 163, | |
"padded": 636, | |
"non_padded": 16, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:machine_learning|5": { | |
"hashes": { | |
"hash_examples": "397997cc6f4d581e", | |
"hash_full_prompts": "769ee14a2aea49bb", | |
"hash_input_tokens": "7c1f54f4c43a387d", | |
"hash_cont_tokens": "2c76ddbc61b4f924" | |
}, | |
"truncated": 0, | |
"non_truncated": 112, | |
"padded": 448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:management|5": { | |
"hashes": { | |
"hash_examples": "2bcbe6f6ca63d740", | |
"hash_full_prompts": "cb1ff9dac9582144", | |
"hash_input_tokens": "2f1d93c4a7d9aa59", | |
"hash_cont_tokens": "eb5084b3fb4f1fb1" | |
}, | |
"truncated": 0, | |
"non_truncated": 103, | |
"padded": 412, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:marketing|5": { | |
"hashes": { | |
"hash_examples": "8ddb20d964a1b065", | |
"hash_full_prompts": "9fc2114a187ad9a2", | |
"hash_input_tokens": "88f3c1308c3582fd", | |
"hash_cont_tokens": "5260ce4e17dfe786" | |
}, | |
"truncated": 0, | |
"non_truncated": 234, | |
"padded": 900, | |
"non_padded": 36, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:medical_genetics|5": { | |
"hashes": { | |
"hash_examples": "182a71f4763d2cea", | |
"hash_full_prompts": "46a616fa51878959", | |
"hash_input_tokens": "77bf533ea67dfb52", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:miscellaneous|5": { | |
"hashes": { | |
"hash_examples": "4c404fdbb4ca57fc", | |
"hash_full_prompts": "0813e1be36dbaae1", | |
"hash_input_tokens": "0423c29227dbf4ab", | |
"hash_cont_tokens": "22a03da5efd7116b" | |
}, | |
"truncated": 0, | |
"non_truncated": 783, | |
"padded": 3128, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:moral_disputes|5": { | |
"hashes": { | |
"hash_examples": "60cbd2baa3fea5c9", | |
"hash_full_prompts": "1d14adebb9b62519", | |
"hash_input_tokens": "122143036a3e2dcf", | |
"hash_cont_tokens": "aeefd12afa9befb9" | |
}, | |
"truncated": 0, | |
"non_truncated": 346, | |
"padded": 1384, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:moral_scenarios|5": { | |
"hashes": { | |
"hash_examples": "fd8b0431fbdd75ef", | |
"hash_full_prompts": "b80d3d236165e3de", | |
"hash_input_tokens": "d6773181937dc84a", | |
"hash_cont_tokens": "9062afac72c8c450" | |
}, | |
"truncated": 0, | |
"non_truncated": 895, | |
"padded": 3551, | |
"non_padded": 29, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:nutrition|5": { | |
"hashes": { | |
"hash_examples": "71e55e2b829b6528", | |
"hash_full_prompts": "2bfb18e5fab8dea7", | |
"hash_input_tokens": "3260c93ef04fa67e", | |
"hash_cont_tokens": "d3acbaa8b457c949" | |
}, | |
"truncated": 0, | |
"non_truncated": 306, | |
"padded": 1224, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:philosophy|5": { | |
"hashes": { | |
"hash_examples": "a6d489a8d208fa4b", | |
"hash_full_prompts": "e8c0d5b6dae3ccc8", | |
"hash_input_tokens": "e04d36dedb00f4a4", | |
"hash_cont_tokens": "983582758322c94b" | |
}, | |
"truncated": 0, | |
"non_truncated": 311, | |
"padded": 1244, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:prehistory|5": { | |
"hashes": { | |
"hash_examples": "6cc50f032a19acaa", | |
"hash_full_prompts": "4a6a1d3ab1bf28e4", | |
"hash_input_tokens": "5a982a6881244049", | |
"hash_cont_tokens": "7448a562fc8c6568" | |
}, | |
"truncated": 0, | |
"non_truncated": 324, | |
"padded": 1288, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_accounting|5": { | |
"hashes": { | |
"hash_examples": "50f57ab32f5f6cea", | |
"hash_full_prompts": "e60129bd2d82ffc6", | |
"hash_input_tokens": "52e7ef264834cbea", | |
"hash_cont_tokens": "926bd9b39ef55d34" | |
}, | |
"truncated": 0, | |
"non_truncated": 282, | |
"padded": 1116, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_law|5": { | |
"hashes": { | |
"hash_examples": "a8fdc85c64f4b215", | |
"hash_full_prompts": "0dbb1d9b72dcea03", | |
"hash_input_tokens": "009d5c5f64f244c2", | |
"hash_cont_tokens": "8ad4cce3143876c1" | |
}, | |
"truncated": 0, | |
"non_truncated": 1534, | |
"padded": 6136, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_medicine|5": { | |
"hashes": { | |
"hash_examples": "c373a28a3050a73a", | |
"hash_full_prompts": "5e040f9ca68b089e", | |
"hash_input_tokens": "befc366e06312b47", | |
"hash_cont_tokens": "54911ddf393f3918" | |
}, | |
"truncated": 0, | |
"non_truncated": 272, | |
"padded": 1088, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_psychology|5": { | |
"hashes": { | |
"hash_examples": "bf5254fe818356af", | |
"hash_full_prompts": "b386ecda8b87150e", | |
"hash_input_tokens": "bcfd722643103684", | |
"hash_cont_tokens": "b0ce00ea933fba38" | |
}, | |
"truncated": 0, | |
"non_truncated": 612, | |
"padded": 2432, | |
"non_padded": 16, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:public_relations|5": { | |
"hashes": { | |
"hash_examples": "b66d52e28e7d14e0", | |
"hash_full_prompts": "fe43562263e25677", | |
"hash_input_tokens": "b812174d7faa89d8", | |
"hash_cont_tokens": "25884a4cae00ae73" | |
}, | |
"truncated": 0, | |
"non_truncated": 110, | |
"padded": 428, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:security_studies|5": { | |
"hashes": { | |
"hash_examples": "514c14feaf000ad9", | |
"hash_full_prompts": "27d4a2ac541ef4b9", | |
"hash_input_tokens": "979ce1eb305d45d7", | |
"hash_cont_tokens": "f2c262424d0d3445" | |
}, | |
"truncated": 0, | |
"non_truncated": 245, | |
"padded": 980, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:sociology|5": { | |
"hashes": { | |
"hash_examples": "f6c9bc9d18c80870", | |
"hash_full_prompts": "c072ea7d1a1524f2", | |
"hash_input_tokens": "248e0f576901b94f", | |
"hash_cont_tokens": "ea387c7647d9b2d6" | |
}, | |
"truncated": 0, | |
"non_truncated": 201, | |
"padded": 804, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:us_foreign_policy|5": { | |
"hashes": { | |
"hash_examples": "ed7b78629db6678f", | |
"hash_full_prompts": "341a97ca3e4d699d", | |
"hash_input_tokens": "905d7e3d3ea4a199", | |
"hash_cont_tokens": "1c8aa5f98bf1c43e" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 396, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:virology|5": { | |
"hashes": { | |
"hash_examples": "bc52ffdc3f9b994a", | |
"hash_full_prompts": "651d471e2eb8b5e9", | |
"hash_input_tokens": "0372a2766369f01f", | |
"hash_cont_tokens": "c0e4c149f2b07c69" | |
}, | |
"truncated": 0, | |
"non_truncated": 166, | |
"padded": 664, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:world_religions|5": { | |
"hashes": { | |
"hash_examples": "ecdb4a4f94f62930", | |
"hash_full_prompts": "3773f03542ce44a3", | |
"hash_input_tokens": "d5f028ed7c980a92", | |
"hash_cont_tokens": "e856ec7cfea826d9" | |
}, | |
"truncated": 0, | |
"non_truncated": 171, | |
"padded": 684, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "341a076d0beb7048", | |
"hash_full_prompts": "a5c8f2b7ff4f5ae2", | |
"hash_input_tokens": "e20361cf2ffb59e8", | |
"hash_cont_tokens": "df800c72427e779e" | |
}, | |
"truncated": 0, | |
"non_truncated": 14042, | |
"padded": 55930, | |
"non_padded": 238, | |
"num_truncated_few_shots": 0 | |
} | |
} |