open-r1-eval-leaderboard
/
eval_results
/meta-llama
/Llama-2-13b-chat-hf
/main
/mmlu
/results_2024-03-04T22-06-39.590736.json
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 6591066.869820222, | |
"end_time": 6592454.129919868, | |
"total_evaluation_time_secondes": "1387.260099645704", | |
"model_name": "meta-llama/Llama-2-13b-chat-hf", | |
"model_sha": "c2f3ec81aac798ae26dcc57799a994dfbf521496", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "24.32 GB", | |
"config": null | |
}, | |
"results": { | |
"lighteval|mmlu:abstract_algebra|5": { | |
"acc": 0.33, | |
"acc_stderr": 0.04725815626252605 | |
}, | |
"lighteval|mmlu:anatomy|5": { | |
"acc": 0.5037037037037037, | |
"acc_stderr": 0.04319223625811331 | |
}, | |
"lighteval|mmlu:astronomy|5": { | |
"acc": 0.4342105263157895, | |
"acc_stderr": 0.0403356566784832 | |
}, | |
"lighteval|mmlu:business_ethics|5": { | |
"acc": 0.56, | |
"acc_stderr": 0.04988876515698589 | |
}, | |
"lighteval|mmlu:clinical_knowledge|5": { | |
"acc": 0.5132075471698113, | |
"acc_stderr": 0.030762134874500476 | |
}, | |
"lighteval|mmlu:college_biology|5": { | |
"acc": 0.5486111111111112, | |
"acc_stderr": 0.041614023984032786 | |
}, | |
"lighteval|mmlu:college_chemistry|5": { | |
"acc": 0.32, | |
"acc_stderr": 0.046882617226215034 | |
}, | |
"lighteval|mmlu:college_computer_science|5": { | |
"acc": 0.26, | |
"acc_stderr": 0.04408440022768078 | |
}, | |
"lighteval|mmlu:college_mathematics|5": { | |
"acc": 0.26, | |
"acc_stderr": 0.0440844002276808 | |
}, | |
"lighteval|mmlu:college_medicine|5": { | |
"acc": 0.3988439306358382, | |
"acc_stderr": 0.037336266553835096 | |
}, | |
"lighteval|mmlu:college_physics|5": { | |
"acc": 0.2647058823529412, | |
"acc_stderr": 0.043898699568087785 | |
}, | |
"lighteval|mmlu:computer_security|5": { | |
"acc": 0.68, | |
"acc_stderr": 0.04688261722621505 | |
}, | |
"lighteval|mmlu:conceptual_physics|5": { | |
"acc": 0.4425531914893617, | |
"acc_stderr": 0.03246956919789958 | |
}, | |
"lighteval|mmlu:econometrics|5": { | |
"acc": 0.2894736842105263, | |
"acc_stderr": 0.04266339443159394 | |
}, | |
"lighteval|mmlu:electrical_engineering|5": { | |
"acc": 0.4896551724137931, | |
"acc_stderr": 0.04165774775728763 | |
}, | |
"lighteval|mmlu:elementary_mathematics|5": { | |
"acc": 0.291005291005291, | |
"acc_stderr": 0.023393826500484875 | |
}, | |
"lighteval|mmlu:formal_logic|5": { | |
"acc": 0.31746031746031744, | |
"acc_stderr": 0.04163453031302859 | |
}, | |
"lighteval|mmlu:global_facts|5": { | |
"acc": 0.4, | |
"acc_stderr": 0.049236596391733084 | |
}, | |
"lighteval|mmlu:high_school_biology|5": { | |
"acc": 0.5967741935483871, | |
"acc_stderr": 0.027906150826041146 | |
}, | |
"lighteval|mmlu:high_school_chemistry|5": { | |
"acc": 0.2857142857142857, | |
"acc_stderr": 0.03178529710642749 | |
}, | |
"lighteval|mmlu:high_school_computer_science|5": { | |
"acc": 0.48, | |
"acc_stderr": 0.050211673156867795 | |
}, | |
"lighteval|mmlu:high_school_european_history|5": { | |
"acc": 0.4303030303030303, | |
"acc_stderr": 0.03866225962879077 | |
}, | |
"lighteval|mmlu:high_school_geography|5": { | |
"acc": 0.6868686868686869, | |
"acc_stderr": 0.033042050878136525 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics|5": { | |
"acc": 0.6839378238341969, | |
"acc_stderr": 0.03355397369686173 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics|5": { | |
"acc": 0.5076923076923077, | |
"acc_stderr": 0.02534800603153477 | |
}, | |
"lighteval|mmlu:high_school_mathematics|5": { | |
"acc": 0.27037037037037037, | |
"acc_stderr": 0.027080372815145668 | |
}, | |
"lighteval|mmlu:high_school_microeconomics|5": { | |
"acc": 0.5, | |
"acc_stderr": 0.032478490123081544 | |
}, | |
"lighteval|mmlu:high_school_physics|5": { | |
"acc": 0.2847682119205298, | |
"acc_stderr": 0.03684881521389023 | |
}, | |
"lighteval|mmlu:high_school_psychology|5": { | |
"acc": 0.6954128440366972, | |
"acc_stderr": 0.019732299420354055 | |
}, | |
"lighteval|mmlu:high_school_statistics|5": { | |
"acc": 0.3425925925925926, | |
"acc_stderr": 0.032365852526021574 | |
}, | |
"lighteval|mmlu:high_school_us_history|5": { | |
"acc": 0.4068627450980392, | |
"acc_stderr": 0.03447891136353383 | |
}, | |
"lighteval|mmlu:high_school_world_history|5": { | |
"acc": 0.4810126582278481, | |
"acc_stderr": 0.03252375148090448 | |
}, | |
"lighteval|mmlu:human_aging|5": { | |
"acc": 0.6098654708520179, | |
"acc_stderr": 0.03273766725459156 | |
}, | |
"lighteval|mmlu:human_sexuality|5": { | |
"acc": 0.5725190839694656, | |
"acc_stderr": 0.04338920305792401 | |
}, | |
"lighteval|mmlu:international_law|5": { | |
"acc": 0.6611570247933884, | |
"acc_stderr": 0.043207678075366705 | |
}, | |
"lighteval|mmlu:jurisprudence|5": { | |
"acc": 0.5925925925925926, | |
"acc_stderr": 0.04750077341199984 | |
}, | |
"lighteval|mmlu:logical_fallacies|5": { | |
"acc": 0.6380368098159509, | |
"acc_stderr": 0.037757007291414416 | |
}, | |
"lighteval|mmlu:machine_learning|5": { | |
"acc": 0.36607142857142855, | |
"acc_stderr": 0.0457237235873743 | |
}, | |
"lighteval|mmlu:management|5": { | |
"acc": 0.6601941747572816, | |
"acc_stderr": 0.04689765937278135 | |
}, | |
"lighteval|mmlu:marketing|5": { | |
"acc": 0.8076923076923077, | |
"acc_stderr": 0.025819233256483724 | |
}, | |
"lighteval|mmlu:medical_genetics|5": { | |
"acc": 0.55, | |
"acc_stderr": 0.04999999999999998 | |
}, | |
"lighteval|mmlu:miscellaneous|5": { | |
"acc": 0.7586206896551724, | |
"acc_stderr": 0.015302380123542103 | |
}, | |
"lighteval|mmlu:moral_disputes|5": { | |
"acc": 0.5115606936416185, | |
"acc_stderr": 0.026911898686377927 | |
}, | |
"lighteval|mmlu:moral_scenarios|5": { | |
"acc": 0.4, | |
"acc_stderr": 0.016384638410380827 | |
}, | |
"lighteval|mmlu:nutrition|5": { | |
"acc": 0.5915032679738562, | |
"acc_stderr": 0.028146405993096358 | |
}, | |
"lighteval|mmlu:philosophy|5": { | |
"acc": 0.5594855305466238, | |
"acc_stderr": 0.028196400574197422 | |
}, | |
"lighteval|mmlu:prehistory|5": { | |
"acc": 0.5524691358024691, | |
"acc_stderr": 0.027667138569422708 | |
}, | |
"lighteval|mmlu:professional_accounting|5": { | |
"acc": 0.3191489361702128, | |
"acc_stderr": 0.027807990141320193 | |
}, | |
"lighteval|mmlu:professional_law|5": { | |
"acc": 0.303129074315515, | |
"acc_stderr": 0.011738669951254294 | |
}, | |
"lighteval|mmlu:professional_medicine|5": { | |
"acc": 0.40808823529411764, | |
"acc_stderr": 0.029855261393483924 | |
}, | |
"lighteval|mmlu:professional_psychology|5": { | |
"acc": 0.5, | |
"acc_stderr": 0.020227834851568375 | |
}, | |
"lighteval|mmlu:public_relations|5": { | |
"acc": 0.6181818181818182, | |
"acc_stderr": 0.046534298079135075 | |
}, | |
"lighteval|mmlu:security_studies|5": { | |
"acc": 0.5591836734693878, | |
"acc_stderr": 0.03178419114175363 | |
}, | |
"lighteval|mmlu:sociology|5": { | |
"acc": 0.6915422885572139, | |
"acc_stderr": 0.03265819588512699 | |
}, | |
"lighteval|mmlu:us_foreign_policy|5": { | |
"acc": 0.77, | |
"acc_stderr": 0.04229525846816507 | |
}, | |
"lighteval|mmlu:virology|5": { | |
"acc": 0.4578313253012048, | |
"acc_stderr": 0.038786267710023595 | |
}, | |
"lighteval|mmlu:world_religions|5": { | |
"acc": 0.7543859649122807, | |
"acc_stderr": 0.0330140594698725 | |
}, | |
"lighteval|mmlu:_average|5": { | |
"acc": 0.49419297605160306, | |
"acc_stderr": 0.035677848734397055 | |
} | |
}, | |
"versions": { | |
"lighteval|mmlu:abstract_algebra|5": 0, | |
"lighteval|mmlu:anatomy|5": 0, | |
"lighteval|mmlu:astronomy|5": 0, | |
"lighteval|mmlu:business_ethics|5": 0, | |
"lighteval|mmlu:clinical_knowledge|5": 0, | |
"lighteval|mmlu:college_biology|5": 0, | |
"lighteval|mmlu:college_chemistry|5": 0, | |
"lighteval|mmlu:college_computer_science|5": 0, | |
"lighteval|mmlu:college_mathematics|5": 0, | |
"lighteval|mmlu:college_medicine|5": 0, | |
"lighteval|mmlu:college_physics|5": 0, | |
"lighteval|mmlu:computer_security|5": 0, | |
"lighteval|mmlu:conceptual_physics|5": 0, | |
"lighteval|mmlu:econometrics|5": 0, | |
"lighteval|mmlu:electrical_engineering|5": 0, | |
"lighteval|mmlu:elementary_mathematics|5": 0, | |
"lighteval|mmlu:formal_logic|5": 0, | |
"lighteval|mmlu:global_facts|5": 0, | |
"lighteval|mmlu:high_school_biology|5": 0, | |
"lighteval|mmlu:high_school_chemistry|5": 0, | |
"lighteval|mmlu:high_school_computer_science|5": 0, | |
"lighteval|mmlu:high_school_european_history|5": 0, | |
"lighteval|mmlu:high_school_geography|5": 0, | |
"lighteval|mmlu:high_school_government_and_politics|5": 0, | |
"lighteval|mmlu:high_school_macroeconomics|5": 0, | |
"lighteval|mmlu:high_school_mathematics|5": 0, | |
"lighteval|mmlu:high_school_microeconomics|5": 0, | |
"lighteval|mmlu:high_school_physics|5": 0, | |
"lighteval|mmlu:high_school_psychology|5": 0, | |
"lighteval|mmlu:high_school_statistics|5": 0, | |
"lighteval|mmlu:high_school_us_history|5": 0, | |
"lighteval|mmlu:high_school_world_history|5": 0, | |
"lighteval|mmlu:human_aging|5": 0, | |
"lighteval|mmlu:human_sexuality|5": 0, | |
"lighteval|mmlu:international_law|5": 0, | |
"lighteval|mmlu:jurisprudence|5": 0, | |
"lighteval|mmlu:logical_fallacies|5": 0, | |
"lighteval|mmlu:machine_learning|5": 0, | |
"lighteval|mmlu:management|5": 0, | |
"lighteval|mmlu:marketing|5": 0, | |
"lighteval|mmlu:medical_genetics|5": 0, | |
"lighteval|mmlu:miscellaneous|5": 0, | |
"lighteval|mmlu:moral_disputes|5": 0, | |
"lighteval|mmlu:moral_scenarios|5": 0, | |
"lighteval|mmlu:nutrition|5": 0, | |
"lighteval|mmlu:philosophy|5": 0, | |
"lighteval|mmlu:prehistory|5": 0, | |
"lighteval|mmlu:professional_accounting|5": 0, | |
"lighteval|mmlu:professional_law|5": 0, | |
"lighteval|mmlu:professional_medicine|5": 0, | |
"lighteval|mmlu:professional_psychology|5": 0, | |
"lighteval|mmlu:public_relations|5": 0, | |
"lighteval|mmlu:security_studies|5": 0, | |
"lighteval|mmlu:sociology|5": 0, | |
"lighteval|mmlu:us_foreign_policy|5": 0, | |
"lighteval|mmlu:virology|5": 0, | |
"lighteval|mmlu:world_religions|5": 0 | |
}, | |
"config_tasks": { | |
"lighteval|mmlu:abstract_algebra": { | |
"name": "mmlu:abstract_algebra", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "abstract_algebra", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:anatomy": { | |
"name": "mmlu:anatomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "anatomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 135, | |
"effective_num_docs": 135 | |
}, | |
"lighteval|mmlu:astronomy": { | |
"name": "mmlu:astronomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "astronomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 152, | |
"effective_num_docs": 152 | |
}, | |
"lighteval|mmlu:business_ethics": { | |
"name": "mmlu:business_ethics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "business_ethics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:clinical_knowledge": { | |
"name": "mmlu:clinical_knowledge", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "clinical_knowledge", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 265, | |
"effective_num_docs": 265 | |
}, | |
"lighteval|mmlu:college_biology": { | |
"name": "mmlu:college_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 144, | |
"effective_num_docs": 144 | |
}, | |
"lighteval|mmlu:college_chemistry": { | |
"name": "mmlu:college_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_computer_science": { | |
"name": "mmlu:college_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_mathematics": { | |
"name": "mmlu:college_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:college_medicine": { | |
"name": "mmlu:college_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 173, | |
"effective_num_docs": 173 | |
}, | |
"lighteval|mmlu:college_physics": { | |
"name": "mmlu:college_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 102, | |
"effective_num_docs": 102 | |
}, | |
"lighteval|mmlu:computer_security": { | |
"name": "mmlu:computer_security", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "computer_security", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:conceptual_physics": { | |
"name": "mmlu:conceptual_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "conceptual_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 235, | |
"effective_num_docs": 235 | |
}, | |
"lighteval|mmlu:econometrics": { | |
"name": "mmlu:econometrics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "econometrics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 114, | |
"effective_num_docs": 114 | |
}, | |
"lighteval|mmlu:electrical_engineering": { | |
"name": "mmlu:electrical_engineering", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "electrical_engineering", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 145, | |
"effective_num_docs": 145 | |
}, | |
"lighteval|mmlu:elementary_mathematics": { | |
"name": "mmlu:elementary_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "elementary_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 378, | |
"effective_num_docs": 378 | |
}, | |
"lighteval|mmlu:formal_logic": { | |
"name": "mmlu:formal_logic", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "formal_logic", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 126, | |
"effective_num_docs": 126 | |
}, | |
"lighteval|mmlu:global_facts": { | |
"name": "mmlu:global_facts", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "global_facts", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:high_school_biology": { | |
"name": "mmlu:high_school_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 310, | |
"effective_num_docs": 310 | |
}, | |
"lighteval|mmlu:high_school_chemistry": { | |
"name": "mmlu:high_school_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 203, | |
"effective_num_docs": 203 | |
}, | |
"lighteval|mmlu:high_school_computer_science": { | |
"name": "mmlu:high_school_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:high_school_european_history": { | |
"name": "mmlu:high_school_european_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_european_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 165, | |
"effective_num_docs": 165 | |
}, | |
"lighteval|mmlu:high_school_geography": { | |
"name": "mmlu:high_school_geography", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_geography", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 198, | |
"effective_num_docs": 198 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics": { | |
"name": "mmlu:high_school_government_and_politics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_government_and_politics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 193, | |
"effective_num_docs": 193 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics": { | |
"name": "mmlu:high_school_macroeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_macroeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 390, | |
"effective_num_docs": 390 | |
}, | |
"lighteval|mmlu:high_school_mathematics": { | |
"name": "mmlu:high_school_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 270, | |
"effective_num_docs": 270 | |
}, | |
"lighteval|mmlu:high_school_microeconomics": { | |
"name": "mmlu:high_school_microeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_microeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 238, | |
"effective_num_docs": 238 | |
}, | |
"lighteval|mmlu:high_school_physics": { | |
"name": "mmlu:high_school_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 151, | |
"effective_num_docs": 151 | |
}, | |
"lighteval|mmlu:high_school_psychology": { | |
"name": "mmlu:high_school_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 545, | |
"effective_num_docs": 545 | |
}, | |
"lighteval|mmlu:high_school_statistics": { | |
"name": "mmlu:high_school_statistics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_statistics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 216, | |
"effective_num_docs": 216 | |
}, | |
"lighteval|mmlu:high_school_us_history": { | |
"name": "mmlu:high_school_us_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_us_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 204, | |
"effective_num_docs": 204 | |
}, | |
"lighteval|mmlu:high_school_world_history": { | |
"name": "mmlu:high_school_world_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_world_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 237, | |
"effective_num_docs": 237 | |
}, | |
"lighteval|mmlu:human_aging": { | |
"name": "mmlu:human_aging", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_aging", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 223, | |
"effective_num_docs": 223 | |
}, | |
"lighteval|mmlu:human_sexuality": { | |
"name": "mmlu:human_sexuality", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_sexuality", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 131, | |
"effective_num_docs": 131 | |
}, | |
"lighteval|mmlu:international_law": { | |
"name": "mmlu:international_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "international_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 121, | |
"effective_num_docs": 121 | |
}, | |
"lighteval|mmlu:jurisprudence": { | |
"name": "mmlu:jurisprudence", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "jurisprudence", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 108, | |
"effective_num_docs": 108 | |
}, | |
"lighteval|mmlu:logical_fallacies": { | |
"name": "mmlu:logical_fallacies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "logical_fallacies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 163, | |
"effective_num_docs": 163 | |
}, | |
"lighteval|mmlu:machine_learning": { | |
"name": "mmlu:machine_learning", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "machine_learning", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 112, | |
"effective_num_docs": 112 | |
}, | |
"lighteval|mmlu:management": { | |
"name": "mmlu:management", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "management", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 103, | |
"effective_num_docs": 103 | |
}, | |
"lighteval|mmlu:marketing": { | |
"name": "mmlu:marketing", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "marketing", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 234, | |
"effective_num_docs": 234 | |
}, | |
"lighteval|mmlu:medical_genetics": { | |
"name": "mmlu:medical_genetics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "medical_genetics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:miscellaneous": { | |
"name": "mmlu:miscellaneous", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "miscellaneous", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 783, | |
"effective_num_docs": 783 | |
}, | |
"lighteval|mmlu:moral_disputes": { | |
"name": "mmlu:moral_disputes", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_disputes", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 346, | |
"effective_num_docs": 346 | |
}, | |
"lighteval|mmlu:moral_scenarios": { | |
"name": "mmlu:moral_scenarios", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_scenarios", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 895, | |
"effective_num_docs": 895 | |
}, | |
"lighteval|mmlu:nutrition": { | |
"name": "mmlu:nutrition", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "nutrition", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 306, | |
"effective_num_docs": 306 | |
}, | |
"lighteval|mmlu:philosophy": { | |
"name": "mmlu:philosophy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "philosophy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 311, | |
"effective_num_docs": 311 | |
}, | |
"lighteval|mmlu:prehistory": { | |
"name": "mmlu:prehistory", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "prehistory", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 324, | |
"effective_num_docs": 324 | |
}, | |
"lighteval|mmlu:professional_accounting": { | |
"name": "mmlu:professional_accounting", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_accounting", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 282, | |
"effective_num_docs": 282 | |
}, | |
"lighteval|mmlu:professional_law": { | |
"name": "mmlu:professional_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 1534, | |
"effective_num_docs": 1534 | |
}, | |
"lighteval|mmlu:professional_medicine": { | |
"name": "mmlu:professional_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 272, | |
"effective_num_docs": 272 | |
}, | |
"lighteval|mmlu:professional_psychology": { | |
"name": "mmlu:professional_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 612, | |
"effective_num_docs": 612 | |
}, | |
"lighteval|mmlu:public_relations": { | |
"name": "mmlu:public_relations", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "public_relations", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 110, | |
"effective_num_docs": 110 | |
}, | |
"lighteval|mmlu:security_studies": { | |
"name": "mmlu:security_studies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "security_studies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 245, | |
"effective_num_docs": 245 | |
}, | |
"lighteval|mmlu:sociology": { | |
"name": "mmlu:sociology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "sociology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 201, | |
"effective_num_docs": 201 | |
}, | |
"lighteval|mmlu:us_foreign_policy": { | |
"name": "mmlu:us_foreign_policy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "us_foreign_policy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100 | |
}, | |
"lighteval|mmlu:virology": { | |
"name": "mmlu:virology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "virology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 166, | |
"effective_num_docs": 166 | |
}, | |
"lighteval|mmlu:world_religions": { | |
"name": "mmlu:world_religions", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "world_religions", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval", | |
"mmlu" | |
], | |
"original_num_docs": 171, | |
"effective_num_docs": 171 | |
} | |
}, | |
"summary_tasks": { | |
"lighteval|mmlu:abstract_algebra|5": { | |
"hashes": { | |
"hash_examples": "4c76229e00c9c0e9", | |
"hash_full_prompts": "83d6a55586dc2bba", | |
"hash_input_tokens": "6afa7b1220d16abf", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:anatomy|5": { | |
"hashes": { | |
"hash_examples": "6a1f8104dccbd33b", | |
"hash_full_prompts": "5c09993ae6ff6cde", | |
"hash_input_tokens": "4532e5ce7f1b7940", | |
"hash_cont_tokens": "f11971a765cb609f" | |
}, | |
"truncated": 0, | |
"non_truncated": 135, | |
"padded": 540, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:astronomy|5": { | |
"hashes": { | |
"hash_examples": "1302effa3a76ce4c", | |
"hash_full_prompts": "17cda215865f2b6f", | |
"hash_input_tokens": "ec87e1f8691f647b", | |
"hash_cont_tokens": "440a970fadecdc7b" | |
}, | |
"truncated": 0, | |
"non_truncated": 152, | |
"padded": 608, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:business_ethics|5": { | |
"hashes": { | |
"hash_examples": "03cb8bce5336419a", | |
"hash_full_prompts": "d8ebf97adfbf5cb7", | |
"hash_input_tokens": "15e450bd72359853", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:clinical_knowledge|5": { | |
"hashes": { | |
"hash_examples": "ffbb9c7b2be257f9", | |
"hash_full_prompts": "85b545797b42968a", | |
"hash_input_tokens": "f206d8e9f45b8c81", | |
"hash_cont_tokens": "7ecd60c25b9bfe5b" | |
}, | |
"truncated": 0, | |
"non_truncated": 265, | |
"padded": 1053, | |
"non_padded": 7, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_biology|5": { | |
"hashes": { | |
"hash_examples": "3ee77f176f38eb8e", | |
"hash_full_prompts": "4ff32339b148b0e1", | |
"hash_input_tokens": "6416b7b322c0e214", | |
"hash_cont_tokens": "875cde3af7a0ee14" | |
}, | |
"truncated": 0, | |
"non_truncated": 144, | |
"padded": 576, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_chemistry|5": { | |
"hashes": { | |
"hash_examples": "ce61a69c46d47aeb", | |
"hash_full_prompts": "2cd9f275bf2eea56", | |
"hash_input_tokens": "a531f340add05429", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_computer_science|5": { | |
"hashes": { | |
"hash_examples": "32805b52d7d5daab", | |
"hash_full_prompts": "94fedd93db562fc7", | |
"hash_input_tokens": "afd7ab0263226f70", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_mathematics|5": { | |
"hashes": { | |
"hash_examples": "55da1a0a0bd33722", | |
"hash_full_prompts": "a81c2b2be970af1f", | |
"hash_input_tokens": "431d21b244fea6d6", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_medicine|5": { | |
"hashes": { | |
"hash_examples": "c33e143163049176", | |
"hash_full_prompts": "2efdf60ad4ce3170", | |
"hash_input_tokens": "473af2553548595f", | |
"hash_cont_tokens": "702fb6d82ff0d6ac" | |
}, | |
"truncated": 0, | |
"non_truncated": 173, | |
"padded": 692, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:college_physics|5": { | |
"hashes": { | |
"hash_examples": "ebdab1cdb7e555df", | |
"hash_full_prompts": "40031d9e95399539", | |
"hash_input_tokens": "2d60278e5715b403", | |
"hash_cont_tokens": "f7b8097afc16a47c" | |
}, | |
"truncated": 0, | |
"non_truncated": 102, | |
"padded": 408, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:computer_security|5": { | |
"hashes": { | |
"hash_examples": "a24fd7d08a560921", | |
"hash_full_prompts": "5ac8304db2af6546", | |
"hash_input_tokens": "aac7df63338e13a1", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 392, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:conceptual_physics|5": { | |
"hashes": { | |
"hash_examples": "8300977a79386993", | |
"hash_full_prompts": "09ded89f95151bdc", | |
"hash_input_tokens": "c73191d47ecae882", | |
"hash_cont_tokens": "aa0e8bc655f2f641" | |
}, | |
"truncated": 0, | |
"non_truncated": 235, | |
"padded": 940, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:econometrics|5": { | |
"hashes": { | |
"hash_examples": "ddde36788a04a46f", | |
"hash_full_prompts": "ac6be342fcac5fa7", | |
"hash_input_tokens": "b122559fe7e19869", | |
"hash_cont_tokens": "b1cc6e7e9fcd3827" | |
}, | |
"truncated": 0, | |
"non_truncated": 114, | |
"padded": 456, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:electrical_engineering|5": { | |
"hashes": { | |
"hash_examples": "acbc5def98c19b3f", | |
"hash_full_prompts": "8b244e310aec21ed", | |
"hash_input_tokens": "7a0035dd5281c8eb", | |
"hash_cont_tokens": "2425a3f084a591ef" | |
}, | |
"truncated": 0, | |
"non_truncated": 145, | |
"padded": 580, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:elementary_mathematics|5": { | |
"hashes": { | |
"hash_examples": "146e61d07497a9bd", | |
"hash_full_prompts": "bb24af5e261ffad9", | |
"hash_input_tokens": "aec75c292917b828", | |
"hash_cont_tokens": "bd87bf0c060fd925" | |
}, | |
"truncated": 0, | |
"non_truncated": 378, | |
"padded": 1512, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:formal_logic|5": { | |
"hashes": { | |
"hash_examples": "8635216e1909a03f", | |
"hash_full_prompts": "6691157a2d8d6cd8", | |
"hash_input_tokens": "cf7e1ef025696ad1", | |
"hash_cont_tokens": "eb8932890e0605db" | |
}, | |
"truncated": 0, | |
"non_truncated": 126, | |
"padded": 504, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:global_facts|5": { | |
"hashes": { | |
"hash_examples": "30b315aa6353ee47", | |
"hash_full_prompts": "ede009cc2291b4ba", | |
"hash_input_tokens": "329005b065e08a91", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_biology|5": { | |
"hashes": { | |
"hash_examples": "c9136373af2180de", | |
"hash_full_prompts": "516069e5c683712f", | |
"hash_input_tokens": "3e26e677f0be2a81", | |
"hash_cont_tokens": "1ddcb86d28cde266" | |
}, | |
"truncated": 0, | |
"non_truncated": 310, | |
"padded": 1240, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_chemistry|5": { | |
"hashes": { | |
"hash_examples": "b0661bfa1add6404", | |
"hash_full_prompts": "6d7933decfae2e7c", | |
"hash_input_tokens": "c84abd67de8a1504", | |
"hash_cont_tokens": "176c8dcff38c5f8f" | |
}, | |
"truncated": 0, | |
"non_truncated": 203, | |
"padded": 812, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_computer_science|5": { | |
"hashes": { | |
"hash_examples": "80fc1d623a3d665f", | |
"hash_full_prompts": "225ba4f8d43f2d92", | |
"hash_input_tokens": "980fc609a0bc2bde", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_european_history|5": { | |
"hashes": { | |
"hash_examples": "854da6e5af0fe1a1", | |
"hash_full_prompts": "bda513652abc1615", | |
"hash_input_tokens": "e62fb6e6dbe635e3", | |
"hash_cont_tokens": "674fc454bdc5ac93" | |
}, | |
"truncated": 0, | |
"non_truncated": 165, | |
"padded": 656, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_geography|5": { | |
"hashes": { | |
"hash_examples": "7dc963c7acd19ad8", | |
"hash_full_prompts": "82a9b986bb3eace0", | |
"hash_input_tokens": "2d96d460c1438e4c", | |
"hash_cont_tokens": "03a5012b916274ea" | |
}, | |
"truncated": 0, | |
"non_truncated": 198, | |
"padded": 784, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_government_and_politics|5": { | |
"hashes": { | |
"hash_examples": "1f675dcdebc9758f", | |
"hash_full_prompts": "b2ee2e5792a426f0", | |
"hash_input_tokens": "baf2310c7c8d3c2c", | |
"hash_cont_tokens": "873d2aab226ba1d8" | |
}, | |
"truncated": 0, | |
"non_truncated": 193, | |
"padded": 772, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_macroeconomics|5": { | |
"hashes": { | |
"hash_examples": "2fb32cf2d80f0b35", | |
"hash_full_prompts": "699e18bfbabc98e1", | |
"hash_input_tokens": "f3bde1f14ba43ed5", | |
"hash_cont_tokens": "c583432ad27fcfe0" | |
}, | |
"truncated": 0, | |
"non_truncated": 390, | |
"padded": 1544, | |
"non_padded": 16, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_mathematics|5": { | |
"hashes": { | |
"hash_examples": "fd6646fdb5d58a1f", | |
"hash_full_prompts": "31abdccceebec1d8", | |
"hash_input_tokens": "21bf36b1e1a060ec", | |
"hash_cont_tokens": "d7907b61bcb8c123" | |
}, | |
"truncated": 0, | |
"non_truncated": 270, | |
"padded": 1074, | |
"non_padded": 6, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_microeconomics|5": { | |
"hashes": { | |
"hash_examples": "2118f21f71d87d84", | |
"hash_full_prompts": "c9de4b55818be79a", | |
"hash_input_tokens": "4d9737635bd72765", | |
"hash_cont_tokens": "f47f041de50333b9" | |
}, | |
"truncated": 0, | |
"non_truncated": 238, | |
"padded": 952, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_physics|5": { | |
"hashes": { | |
"hash_examples": "dc3ce06378548565", | |
"hash_full_prompts": "fffd4a8be164ffd4", | |
"hash_input_tokens": "9f6ab08c88ba571a", | |
"hash_cont_tokens": "0d56317b3e5eedb5" | |
}, | |
"truncated": 0, | |
"non_truncated": 151, | |
"padded": 592, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_psychology|5": { | |
"hashes": { | |
"hash_examples": "c8d1d98a40e11f2f", | |
"hash_full_prompts": "4d84120f57f50f02", | |
"hash_input_tokens": "07b78c4c2244353c", | |
"hash_cont_tokens": "09ba1243e7390c0f" | |
}, | |
"truncated": 0, | |
"non_truncated": 545, | |
"padded": 2172, | |
"non_padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_statistics|5": { | |
"hashes": { | |
"hash_examples": "666c8759b98ee4ff", | |
"hash_full_prompts": "387300220abf339e", | |
"hash_input_tokens": "5e77be1bbc4426f3", | |
"hash_cont_tokens": "9cc29889c3d3f77d" | |
}, | |
"truncated": 0, | |
"non_truncated": 216, | |
"padded": 864, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_us_history|5": { | |
"hashes": { | |
"hash_examples": "95fef1c4b7d3f81e", | |
"hash_full_prompts": "fdcab3be5102227c", | |
"hash_input_tokens": "affc3a4bd4a0c21f", | |
"hash_cont_tokens": "cdd0b3dc06d933e5" | |
}, | |
"truncated": 0, | |
"non_truncated": 204, | |
"padded": 816, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:high_school_world_history|5": { | |
"hashes": { | |
"hash_examples": "7e5085b6184b0322", | |
"hash_full_prompts": "39c029c50f1b0640", | |
"hash_input_tokens": "2a203cdb948df59b", | |
"hash_cont_tokens": "e02816433ff28daf" | |
}, | |
"truncated": 0, | |
"non_truncated": 237, | |
"padded": 948, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:human_aging|5": { | |
"hashes": { | |
"hash_examples": "c17333e7c7c10797", | |
"hash_full_prompts": "242bdd661b223ba4", | |
"hash_input_tokens": "21ba91007072ffa6", | |
"hash_cont_tokens": "142a4a8a1138a214" | |
}, | |
"truncated": 0, | |
"non_truncated": 223, | |
"padded": 892, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:human_sexuality|5": { | |
"hashes": { | |
"hash_examples": "4edd1e9045df5e3d", | |
"hash_full_prompts": "d869e8f8bb290349", | |
"hash_input_tokens": "ab6f67680a2fa263", | |
"hash_cont_tokens": "bc54813e809b796d" | |
}, | |
"truncated": 0, | |
"non_truncated": 131, | |
"padded": 524, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:international_law|5": { | |
"hashes": { | |
"hash_examples": "db2fa00d771a062a", | |
"hash_full_prompts": "a98416f299224c5b", | |
"hash_input_tokens": "7f319045bc383cb2", | |
"hash_cont_tokens": "8ea8c5ff76a15bca" | |
}, | |
"truncated": 0, | |
"non_truncated": 121, | |
"padded": 484, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:jurisprudence|5": { | |
"hashes": { | |
"hash_examples": "e956f86b124076fe", | |
"hash_full_prompts": "b1f32c5674b5a48d", | |
"hash_input_tokens": "5eb4454b5d8937c3", | |
"hash_cont_tokens": "e3a8cd951b6e3469" | |
}, | |
"truncated": 0, | |
"non_truncated": 108, | |
"padded": 420, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:logical_fallacies|5": { | |
"hashes": { | |
"hash_examples": "956e0e6365ab79f1", | |
"hash_full_prompts": "6497883381b3f4af", | |
"hash_input_tokens": "94ce2e2ab5fe038b", | |
"hash_cont_tokens": "3e9e0bdc248fd88a" | |
}, | |
"truncated": 0, | |
"non_truncated": 163, | |
"padded": 648, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:machine_learning|5": { | |
"hashes": { | |
"hash_examples": "397997cc6f4d581e", | |
"hash_full_prompts": "897f529d4362936b", | |
"hash_input_tokens": "f4eb773cc5cf7c95", | |
"hash_cont_tokens": "55b12fb138c6a064" | |
}, | |
"truncated": 0, | |
"non_truncated": 112, | |
"padded": 448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:management|5": { | |
"hashes": { | |
"hash_examples": "2bcbe6f6ca63d740", | |
"hash_full_prompts": "44fff80136a06711", | |
"hash_input_tokens": "7126719085e0248f", | |
"hash_cont_tokens": "a01d6d39a83c4597" | |
}, | |
"truncated": 0, | |
"non_truncated": 103, | |
"padded": 412, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:marketing|5": { | |
"hashes": { | |
"hash_examples": "8ddb20d964a1b065", | |
"hash_full_prompts": "e57f73aec1e72165", | |
"hash_input_tokens": "427e02b8387ff88e", | |
"hash_cont_tokens": "6aeaed4d823c98aa" | |
}, | |
"truncated": 0, | |
"non_truncated": 234, | |
"padded": 900, | |
"non_padded": 36, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:medical_genetics|5": { | |
"hashes": { | |
"hash_examples": "182a71f4763d2cea", | |
"hash_full_prompts": "317a5aa9f670aa48", | |
"hash_input_tokens": "e7bc234a302c57a0", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:miscellaneous|5": { | |
"hashes": { | |
"hash_examples": "4c404fdbb4ca57fc", | |
"hash_full_prompts": "04b9eae11b4bc60b", | |
"hash_input_tokens": "dc785f6379125830", | |
"hash_cont_tokens": "9b0ab02a64603081" | |
}, | |
"truncated": 0, | |
"non_truncated": 783, | |
"padded": 3132, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:moral_disputes|5": { | |
"hashes": { | |
"hash_examples": "60cbd2baa3fea5c9", | |
"hash_full_prompts": "e6b49906a31128e8", | |
"hash_input_tokens": "e0886a867e559770", | |
"hash_cont_tokens": "3b8bbe9108e55ce9" | |
}, | |
"truncated": 0, | |
"non_truncated": 346, | |
"padded": 1384, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:moral_scenarios|5": { | |
"hashes": { | |
"hash_examples": "fd8b0431fbdd75ef", | |
"hash_full_prompts": "c8f9ea0e674b4920", | |
"hash_input_tokens": "e5c4455b03b0b686", | |
"hash_cont_tokens": "3e9bfc0362e97330" | |
}, | |
"truncated": 0, | |
"non_truncated": 895, | |
"padded": 3559, | |
"non_padded": 21, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:nutrition|5": { | |
"hashes": { | |
"hash_examples": "71e55e2b829b6528", | |
"hash_full_prompts": "af0def9bd07d0f70", | |
"hash_input_tokens": "a1f49e2355dd50df", | |
"hash_cont_tokens": "23b2dc6ee2da4cfc" | |
}, | |
"truncated": 0, | |
"non_truncated": 306, | |
"padded": 1220, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:philosophy|5": { | |
"hashes": { | |
"hash_examples": "a6d489a8d208fa4b", | |
"hash_full_prompts": "e5fd038e997b0827", | |
"hash_input_tokens": "a885d646dece66ec", | |
"hash_cont_tokens": "9f6ff69d23a48783" | |
}, | |
"truncated": 0, | |
"non_truncated": 311, | |
"padded": 1244, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:prehistory|5": { | |
"hashes": { | |
"hash_examples": "6cc50f032a19acaa", | |
"hash_full_prompts": "e2e12e96ae88eb65", | |
"hash_input_tokens": "61280a9a1c37f3cb", | |
"hash_cont_tokens": "d6458d743d875837" | |
}, | |
"truncated": 0, | |
"non_truncated": 324, | |
"padded": 1276, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_accounting|5": { | |
"hashes": { | |
"hash_examples": "50f57ab32f5f6cea", | |
"hash_full_prompts": "709ac34683b906fe", | |
"hash_input_tokens": "5fae44b341e24a35", | |
"hash_cont_tokens": "922a195f53a35662" | |
}, | |
"truncated": 0, | |
"non_truncated": 282, | |
"padded": 1124, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_law|5": { | |
"hashes": { | |
"hash_examples": "a8fdc85c64f4b215", | |
"hash_full_prompts": "8fe93d8d5bfeb57a", | |
"hash_input_tokens": "1ed7802f29ab2b34", | |
"hash_cont_tokens": "2e590029ef41fbcd" | |
}, | |
"truncated": 0, | |
"non_truncated": 1534, | |
"padded": 6136, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_medicine|5": { | |
"hashes": { | |
"hash_examples": "c373a28a3050a73a", | |
"hash_full_prompts": "53933dcbf8afa5a5", | |
"hash_input_tokens": "a3388c9371d26293", | |
"hash_cont_tokens": "7cfee54dbddd5a98" | |
}, | |
"truncated": 0, | |
"non_truncated": 272, | |
"padded": 1088, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:professional_psychology|5": { | |
"hashes": { | |
"hash_examples": "bf5254fe818356af", | |
"hash_full_prompts": "f4a1dfab8b12fe95", | |
"hash_input_tokens": "f2d78354ca383e16", | |
"hash_cont_tokens": "a86677b2a45c20e1" | |
}, | |
"truncated": 0, | |
"non_truncated": 612, | |
"padded": 2444, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:public_relations|5": { | |
"hashes": { | |
"hash_examples": "b66d52e28e7d14e0", | |
"hash_full_prompts": "1e0c1e4d2c44ed22", | |
"hash_input_tokens": "14f65f3eeade13a6", | |
"hash_cont_tokens": "0d756ccaae031757" | |
}, | |
"truncated": 0, | |
"non_truncated": 110, | |
"padded": 428, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:security_studies|5": { | |
"hashes": { | |
"hash_examples": "514c14feaf000ad9", | |
"hash_full_prompts": "21134f4e5193373b", | |
"hash_input_tokens": "979bb1e8e4e4ba30", | |
"hash_cont_tokens": "b2229bc2cfbf594b" | |
}, | |
"truncated": 0, | |
"non_truncated": 245, | |
"padded": 980, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:sociology|5": { | |
"hashes": { | |
"hash_examples": "f6c9bc9d18c80870", | |
"hash_full_prompts": "cc1730ae6e80c625", | |
"hash_input_tokens": "f08f749f4d38b78d", | |
"hash_cont_tokens": "c3a3bdfd177eed5b" | |
}, | |
"truncated": 0, | |
"non_truncated": 201, | |
"padded": 800, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:us_foreign_policy|5": { | |
"hashes": { | |
"hash_examples": "ed7b78629db6678f", | |
"hash_full_prompts": "6bb5314ffd743cd1", | |
"hash_input_tokens": "83495b4c8ca473cb", | |
"hash_cont_tokens": "50421e30bef398f9" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 388, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:virology|5": { | |
"hashes": { | |
"hash_examples": "bc52ffdc3f9b994a", | |
"hash_full_prompts": "6980ff6fd7ad9628", | |
"hash_input_tokens": "a816d800d6dbefa2", | |
"hash_cont_tokens": "af8b3658088cb37f" | |
}, | |
"truncated": 0, | |
"non_truncated": 166, | |
"padded": 664, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|mmlu:world_religions|5": { | |
"hashes": { | |
"hash_examples": "ecdb4a4f94f62930", | |
"hash_full_prompts": "7c4159157e4b8e2c", | |
"hash_input_tokens": "10d1c4b9398d606e", | |
"hash_cont_tokens": "060118bef6de4e0a" | |
}, | |
"truncated": 0, | |
"non_truncated": 171, | |
"padded": 684, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "341a076d0beb7048", | |
"hash_full_prompts": "78ce04640f232f8c", | |
"hash_input_tokens": "fb81c762ca7825b1", | |
"hash_cont_tokens": "1841060297599b66" | |
}, | |
"truncated": 0, | |
"non_truncated": 14042, | |
"padded": 55966, | |
"non_padded": 202, | |
"num_truncated_few_shots": 0 | |
} | |
} |