{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 4, "max_samples": null, "job_id": "", "start_time": 19631.841670262, "end_time": 32395.446210489, "total_evaluation_time_secondes": "12763.604540227", "model_name": "abhishek/autotrain-llama3-70b-orpo-v1", "model_sha": "053236c6846cc561c1503ba05e2b28c94855a432", "model_dtype": "torch.float16", "model_size": "131.73 GB", "config": null }, "results": { "leaderboard|mmlu:abstract_algebra|5": { "acc": 0.49, "acc_stderr": 0.05024183937956913 }, "leaderboard|mmlu:anatomy|5": { "acc": 0.7703703703703704, "acc_stderr": 0.036333844140734636 }, "leaderboard|mmlu:astronomy|5": { "acc": 0.9210526315789473, "acc_stderr": 0.02194434281824793 }, "leaderboard|mmlu:business_ethics|5": { "acc": 0.85, "acc_stderr": 0.03588702812826371 }, "leaderboard|mmlu:clinical_knowledge|5": { "acc": 0.8528301886792453, "acc_stderr": 0.02180412613479738 }, "leaderboard|mmlu:college_biology|5": { "acc": 0.9236111111111112, "acc_stderr": 0.022212203938345918 }, "leaderboard|mmlu:college_chemistry|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836 }, "leaderboard|mmlu:college_computer_science|5": { "acc": 0.67, "acc_stderr": 0.04725815626252606 }, "leaderboard|mmlu:college_mathematics|5": { "acc": 0.55, "acc_stderr": 0.049999999999999996 }, "leaderboard|mmlu:college_medicine|5": { "acc": 0.7861271676300579, "acc_stderr": 0.03126511206173044 }, "leaderboard|mmlu:college_physics|5": { "acc": 0.5490196078431373, "acc_stderr": 0.049512182523962604 }, "leaderboard|mmlu:computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846 }, "leaderboard|mmlu:conceptual_physics|5": { "acc": 0.8127659574468085, "acc_stderr": 0.025501588341883596 }, "leaderboard|mmlu:econometrics|5": { "acc": 0.7192982456140351, "acc_stderr": 0.04227054451232199 }, "leaderboard|mmlu:electrical_engineering|5": { "acc": 0.7517241379310344, "acc_stderr": 0.03600105692727771 }, "leaderboard|mmlu:elementary_mathematics|5": { "acc": 0.6746031746031746, "acc_stderr": 0.024130158299762616 }, "leaderboard|mmlu:formal_logic|5": { "acc": 0.6428571428571429, "acc_stderr": 0.04285714285714281 }, "leaderboard|mmlu:global_facts|5": { "acc": 0.59, "acc_stderr": 0.04943110704237101 }, "leaderboard|mmlu:high_school_biology|5": { "acc": 0.9129032258064517, "acc_stderr": 0.01604110074169669 }, "leaderboard|mmlu:high_school_chemistry|5": { "acc": 0.6600985221674877, "acc_stderr": 0.033327690684107895 }, "leaderboard|mmlu:high_school_computer_science|5": { "acc": 0.9, "acc_stderr": 0.030151134457776348 }, "leaderboard|mmlu:high_school_european_history|5": { "acc": 0.8787878787878788, "acc_stderr": 0.025485498373343237 }, "leaderboard|mmlu:high_school_geography|5": { "acc": 0.9242424242424242, "acc_stderr": 0.018852670234993093 }, "leaderboard|mmlu:high_school_government_and_politics|5": { "acc": 0.9844559585492227, "acc_stderr": 0.00892749271508434 }, "leaderboard|mmlu:high_school_macroeconomics|5": { "acc": 0.8358974358974359, "acc_stderr": 0.01877843431342372 }, "leaderboard|mmlu:high_school_mathematics|5": { "acc": 0.5259259259259259, "acc_stderr": 0.03044452852881074 }, "leaderboard|mmlu:high_school_microeconomics|5": { "acc": 0.8949579831932774, "acc_stderr": 0.019916300758805225 }, "leaderboard|mmlu:high_school_physics|5": { "acc": 0.6158940397350994, "acc_stderr": 0.03971301814719197 }, "leaderboard|mmlu:high_school_psychology|5": { "acc": 0.9486238532110092, "acc_stderr": 0.009465168181022974 }, "leaderboard|mmlu:high_school_statistics|5": { "acc": 0.7361111111111112, "acc_stderr": 0.030058202704309846 }, "leaderboard|mmlu:high_school_us_history|5": { "acc": 0.9313725490196079, "acc_stderr": 0.017744453647073322 }, "leaderboard|mmlu:high_school_world_history|5": { "acc": 0.9240506329113924, "acc_stderr": 0.01724463325106569 }, "leaderboard|mmlu:human_aging|5": { "acc": 0.8161434977578476, "acc_stderr": 0.025998379092356517 }, "leaderboard|mmlu:human_sexuality|5": { "acc": 0.8625954198473282, "acc_stderr": 0.030194823996804475 }, "leaderboard|mmlu:international_law|5": { "acc": 0.9090909090909091, "acc_stderr": 0.026243194054073885 }, "leaderboard|mmlu:jurisprudence|5": { "acc": 0.8703703703703703, "acc_stderr": 0.03247224389917946 }, "leaderboard|mmlu:logical_fallacies|5": { "acc": 0.8466257668711656, "acc_stderr": 0.0283116014414386 }, "leaderboard|mmlu:machine_learning|5": { "acc": 0.7142857142857143, "acc_stderr": 0.042878587513404544 }, "leaderboard|mmlu:management|5": { "acc": 0.912621359223301, "acc_stderr": 0.027960689125970654 }, "leaderboard|mmlu:marketing|5": { "acc": 0.9358974358974359, "acc_stderr": 0.016046261631673137 }, "leaderboard|mmlu:medical_genetics|5": { "acc": 0.93, "acc_stderr": 0.0256432399976243 }, "leaderboard|mmlu:miscellaneous|5": { "acc": 0.9233716475095786, "acc_stderr": 0.009512170699323858 }, "leaderboard|mmlu:moral_disputes|5": { "acc": 0.8208092485549133, "acc_stderr": 0.020647590029679332 }, "leaderboard|mmlu:moral_scenarios|5": { "acc": 0.7463687150837989, "acc_stderr": 0.014551553659369918 }, "leaderboard|mmlu:nutrition|5": { "acc": 0.869281045751634, "acc_stderr": 0.019301873624215267 }, "leaderboard|mmlu:philosophy|5": { "acc": 0.797427652733119, "acc_stderr": 0.022827317491059693 }, "leaderboard|mmlu:prehistory|5": { "acc": 0.9012345679012346, "acc_stderr": 0.01660046080164534 }, "leaderboard|mmlu:professional_accounting|5": { "acc": 0.6595744680851063, "acc_stderr": 0.02826765748265014 }, "leaderboard|mmlu:professional_law|5": { "acc": 0.6323337679269883, "acc_stderr": 0.012314845910071712 }, "leaderboard|mmlu:professional_medicine|5": { "acc": 0.8933823529411765, "acc_stderr": 0.018747725509716835 }, "leaderboard|mmlu:professional_psychology|5": { "acc": 0.8611111111111112, "acc_stderr": 0.013990806277040208 }, "leaderboard|mmlu:public_relations|5": { "acc": 0.7454545454545455, "acc_stderr": 0.041723430387053825 }, "leaderboard|mmlu:security_studies|5": { "acc": 0.8081632653061225, "acc_stderr": 0.0252069631542254 }, "leaderboard|mmlu:sociology|5": { "acc": 0.9154228855721394, "acc_stderr": 0.019675343217199173 }, "leaderboard|mmlu:us_foreign_policy|5": { "acc": 0.92, "acc_stderr": 0.0272659924344291 }, "leaderboard|mmlu:virology|5": { "acc": 0.572289156626506, "acc_stderr": 0.03851597683718533 }, "leaderboard|mmlu:world_religions|5": { "acc": 0.8888888888888888, "acc_stderr": 0.024103384202072864 }, "leaderboard|mmlu:_average|5": { "acc": 0.7957951766493738, "acc_stderr": 0.0280984014309186 }, "all": { "acc": 0.7957951766493738, "acc_stderr": 0.0280984014309186 } }, "versions": { "leaderboard|mmlu:abstract_algebra|5": 0, "leaderboard|mmlu:anatomy|5": 0, "leaderboard|mmlu:astronomy|5": 0, "leaderboard|mmlu:business_ethics|5": 0, "leaderboard|mmlu:clinical_knowledge|5": 0, "leaderboard|mmlu:college_biology|5": 0, "leaderboard|mmlu:college_chemistry|5": 0, "leaderboard|mmlu:college_computer_science|5": 0, "leaderboard|mmlu:college_mathematics|5": 0, "leaderboard|mmlu:college_medicine|5": 0, "leaderboard|mmlu:college_physics|5": 0, "leaderboard|mmlu:computer_security|5": 0, "leaderboard|mmlu:conceptual_physics|5": 0, "leaderboard|mmlu:econometrics|5": 0, "leaderboard|mmlu:electrical_engineering|5": 0, "leaderboard|mmlu:elementary_mathematics|5": 0, "leaderboard|mmlu:formal_logic|5": 0, "leaderboard|mmlu:global_facts|5": 0, "leaderboard|mmlu:high_school_biology|5": 0, "leaderboard|mmlu:high_school_chemistry|5": 0, "leaderboard|mmlu:high_school_computer_science|5": 0, "leaderboard|mmlu:high_school_european_history|5": 0, "leaderboard|mmlu:high_school_geography|5": 0, "leaderboard|mmlu:high_school_government_and_politics|5": 0, "leaderboard|mmlu:high_school_macroeconomics|5": 0, "leaderboard|mmlu:high_school_mathematics|5": 0, "leaderboard|mmlu:high_school_microeconomics|5": 0, "leaderboard|mmlu:high_school_physics|5": 0, "leaderboard|mmlu:high_school_psychology|5": 0, "leaderboard|mmlu:high_school_statistics|5": 0, "leaderboard|mmlu:high_school_us_history|5": 0, "leaderboard|mmlu:high_school_world_history|5": 0, "leaderboard|mmlu:human_aging|5": 0, "leaderboard|mmlu:human_sexuality|5": 0, "leaderboard|mmlu:international_law|5": 0, "leaderboard|mmlu:jurisprudence|5": 0, "leaderboard|mmlu:logical_fallacies|5": 0, "leaderboard|mmlu:machine_learning|5": 0, "leaderboard|mmlu:management|5": 0, "leaderboard|mmlu:marketing|5": 0, "leaderboard|mmlu:medical_genetics|5": 0, "leaderboard|mmlu:miscellaneous|5": 0, "leaderboard|mmlu:moral_disputes|5": 0, "leaderboard|mmlu:moral_scenarios|5": 0, "leaderboard|mmlu:nutrition|5": 0, "leaderboard|mmlu:philosophy|5": 0, "leaderboard|mmlu:prehistory|5": 0, "leaderboard|mmlu:professional_accounting|5": 0, "leaderboard|mmlu:professional_law|5": 0, "leaderboard|mmlu:professional_medicine|5": 0, "leaderboard|mmlu:professional_psychology|5": 0, "leaderboard|mmlu:public_relations|5": 0, "leaderboard|mmlu:security_studies|5": 0, "leaderboard|mmlu:sociology|5": 0, "leaderboard|mmlu:us_foreign_policy|5": 0, "leaderboard|mmlu:virology|5": 0, "leaderboard|mmlu:world_religions|5": 0 }, "config_tasks": { "leaderboard|mmlu:abstract_algebra": { "name": "mmlu:abstract_algebra", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "abstract_algebra", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:anatomy": { "name": "mmlu:anatomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "anatomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 135, "effective_num_docs": 135, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:astronomy": { "name": "mmlu:astronomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "astronomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 152, "effective_num_docs": 152, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:business_ethics": { "name": "mmlu:business_ethics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "business_ethics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:clinical_knowledge": { "name": "mmlu:clinical_knowledge", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "clinical_knowledge", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 265, "effective_num_docs": 265, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_biology": { "name": "mmlu:college_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 144, "effective_num_docs": 144, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_chemistry": { "name": "mmlu:college_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_computer_science": { "name": "mmlu:college_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_mathematics": { "name": "mmlu:college_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_medicine": { "name": "mmlu:college_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 173, "effective_num_docs": 173, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:college_physics": { "name": "mmlu:college_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 102, "effective_num_docs": 102, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:computer_security": { "name": "mmlu:computer_security", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "computer_security", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:conceptual_physics": { "name": "mmlu:conceptual_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "conceptual_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 235, "effective_num_docs": 235, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:econometrics": { "name": "mmlu:econometrics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "econometrics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 114, "effective_num_docs": 114, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:electrical_engineering": { "name": "mmlu:electrical_engineering", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "electrical_engineering", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 145, "effective_num_docs": 145, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:elementary_mathematics": { "name": "mmlu:elementary_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "elementary_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 378, "effective_num_docs": 378, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:formal_logic": { "name": "mmlu:formal_logic", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "formal_logic", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 126, "effective_num_docs": 126, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:global_facts": { "name": "mmlu:global_facts", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "global_facts", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_biology": { "name": "mmlu:high_school_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 310, "effective_num_docs": 310, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_chemistry": { "name": "mmlu:high_school_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 203, "effective_num_docs": 203, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_computer_science": { "name": "mmlu:high_school_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_european_history": { "name": "mmlu:high_school_european_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_european_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 165, "effective_num_docs": 165, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_geography": { "name": "mmlu:high_school_geography", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_geography", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 198, "effective_num_docs": 198, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_government_and_politics": { "name": "mmlu:high_school_government_and_politics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_government_and_politics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 193, "effective_num_docs": 193, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_macroeconomics": { "name": "mmlu:high_school_macroeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_macroeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 390, "effective_num_docs": 390, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_mathematics": { "name": "mmlu:high_school_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 270, "effective_num_docs": 270, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_microeconomics": { "name": "mmlu:high_school_microeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_microeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 238, "effective_num_docs": 238, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_physics": { "name": "mmlu:high_school_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 151, "effective_num_docs": 151, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_psychology": { "name": "mmlu:high_school_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 545, "effective_num_docs": 545, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_statistics": { "name": "mmlu:high_school_statistics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_statistics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 216, "effective_num_docs": 216, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_us_history": { "name": "mmlu:high_school_us_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_us_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 204, "effective_num_docs": 204, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:high_school_world_history": { "name": "mmlu:high_school_world_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_world_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 237, "effective_num_docs": 237, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:human_aging": { "name": "mmlu:human_aging", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_aging", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 223, "effective_num_docs": 223, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:human_sexuality": { "name": "mmlu:human_sexuality", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_sexuality", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 131, "effective_num_docs": 131, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:international_law": { "name": "mmlu:international_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "international_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 121, "effective_num_docs": 121, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:jurisprudence": { "name": "mmlu:jurisprudence", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "jurisprudence", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 108, "effective_num_docs": 108, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:logical_fallacies": { "name": "mmlu:logical_fallacies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "logical_fallacies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 163, "effective_num_docs": 163, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:machine_learning": { "name": "mmlu:machine_learning", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "machine_learning", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 112, "effective_num_docs": 112, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:management": { "name": "mmlu:management", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "management", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 103, "effective_num_docs": 103, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:marketing": { "name": "mmlu:marketing", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "marketing", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 234, "effective_num_docs": 234, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:medical_genetics": { "name": "mmlu:medical_genetics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "medical_genetics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:miscellaneous": { "name": "mmlu:miscellaneous", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "miscellaneous", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 783, "effective_num_docs": 783, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:moral_disputes": { "name": "mmlu:moral_disputes", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_disputes", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 346, "effective_num_docs": 346, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:moral_scenarios": { "name": "mmlu:moral_scenarios", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_scenarios", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 895, "effective_num_docs": 895, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:nutrition": { "name": "mmlu:nutrition", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "nutrition", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 306, "effective_num_docs": 306, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:philosophy": { "name": "mmlu:philosophy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "philosophy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 311, "effective_num_docs": 311, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:prehistory": { "name": "mmlu:prehistory", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "prehistory", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 324, "effective_num_docs": 324, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_accounting": { "name": "mmlu:professional_accounting", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_accounting", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 282, "effective_num_docs": 282, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_law": { "name": "mmlu:professional_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 1534, "effective_num_docs": 1534, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_medicine": { "name": "mmlu:professional_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 272, "effective_num_docs": 272, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:professional_psychology": { "name": "mmlu:professional_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 612, "effective_num_docs": 612, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:public_relations": { "name": "mmlu:public_relations", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "public_relations", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 110, "effective_num_docs": 110, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:security_studies": { "name": "mmlu:security_studies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "security_studies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 245, "effective_num_docs": 245, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:sociology": { "name": "mmlu:sociology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "sociology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 201, "effective_num_docs": 201, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:us_foreign_policy": { "name": "mmlu:us_foreign_policy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "us_foreign_policy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:virology": { "name": "mmlu:virology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "virology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 166, "effective_num_docs": 166, "trust_dataset": true, "must_remove_duplicate_docs": null }, "leaderboard|mmlu:world_religions": { "name": "mmlu:world_religions", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "world_religions", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 171, "effective_num_docs": 171, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "leaderboard|mmlu:abstract_algebra|5": { "hashes": { "hash_examples": "4c76229e00c9c0e9", "hash_full_prompts": "a45d01c3409c889c", "hash_input_tokens": "c58f21ad388e41a4", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:anatomy|5": { "hashes": { "hash_examples": "6a1f8104dccbd33b", "hash_full_prompts": "e245c6600e03cc32", "hash_input_tokens": "664ad983d943ad07", "hash_cont_tokens": "eb0c9a1e487e77a6" }, "truncated": 0, "non_truncated": 135, "padded": 540, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:astronomy|5": { "hashes": { "hash_examples": "1302effa3a76ce4c", "hash_full_prompts": "390f9bddf857ad04", "hash_input_tokens": "6b8419ce1ca61ae8", "hash_cont_tokens": "2c8a49864c3d99c2" }, "truncated": 0, "non_truncated": 152, "padded": 608, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:business_ethics|5": { "hashes": { "hash_examples": "03cb8bce5336419a", "hash_full_prompts": "5504f893bc4f2fa1", "hash_input_tokens": "bf7a56022072a446", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:clinical_knowledge|5": { "hashes": { "hash_examples": "ffbb9c7b2be257f9", "hash_full_prompts": "106ad0bab4b90b78", "hash_input_tokens": "1d310792e0aaf29c", "hash_cont_tokens": "f2bfcea369926d68" }, "truncated": 0, "non_truncated": 265, "padded": 1060, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_biology|5": { "hashes": { "hash_examples": "3ee77f176f38eb8e", "hash_full_prompts": "59f9bdf2695cb226", "hash_input_tokens": "1027babc822bd1c5", "hash_cont_tokens": "061b1b91fc518400" }, "truncated": 0, "non_truncated": 144, "padded": 576, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_chemistry|5": { "hashes": { "hash_examples": "ce61a69c46d47aeb", "hash_full_prompts": "3cac9b759fcff7a0", "hash_input_tokens": "4fa05a1d43eaf942", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_computer_science|5": { "hashes": { "hash_examples": "32805b52d7d5daab", "hash_full_prompts": "010b0cca35070130", "hash_input_tokens": "c78e59615689a133", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_mathematics|5": { "hashes": { "hash_examples": "55da1a0a0bd33722", "hash_full_prompts": "511422eb9eefc773", "hash_input_tokens": "f62164431ea60a6d", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_medicine|5": { "hashes": { "hash_examples": "c33e143163049176", "hash_full_prompts": "c8cc1a82a51a046e", "hash_input_tokens": "f3ffc86e05b4abab", "hash_cont_tokens": "96f7e09eeaf3577a" }, "truncated": 0, "non_truncated": 173, "padded": 692, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_physics|5": { "hashes": { "hash_examples": "ebdab1cdb7e555df", "hash_full_prompts": "e40721b5059c5818", "hash_input_tokens": "bfec18d4c2cf7331", "hash_cont_tokens": "fb74f245268780f7" }, "truncated": 0, "non_truncated": 102, "padded": 408, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:computer_security|5": { "hashes": { "hash_examples": "a24fd7d08a560921", "hash_full_prompts": "946c9be5964ac44a", "hash_input_tokens": "acde342892ed7ff9", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:conceptual_physics|5": { "hashes": { "hash_examples": "8300977a79386993", "hash_full_prompts": "506a4f6094cc40c9", "hash_input_tokens": "524121d9ddd4bf6a", "hash_cont_tokens": "1f4e1e92f33812b5" }, "truncated": 0, "non_truncated": 235, "padded": 940, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:econometrics|5": { "hashes": { "hash_examples": "ddde36788a04a46f", "hash_full_prompts": "4ed2703f27f1ed05", "hash_input_tokens": "a1dedb7b847d6b19", "hash_cont_tokens": "4060f7b36b2f4140" }, "truncated": 0, "non_truncated": 114, "padded": 456, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:electrical_engineering|5": { "hashes": { "hash_examples": "acbc5def98c19b3f", "hash_full_prompts": "d8f4b3e11c23653c", "hash_input_tokens": "825d9d44e9ca39de", "hash_cont_tokens": "06e5a56f4fae638e" }, "truncated": 0, "non_truncated": 145, "padded": 580, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:elementary_mathematics|5": { "hashes": { "hash_examples": "146e61d07497a9bd", "hash_full_prompts": "256d111bd15647ff", "hash_input_tokens": "6ef65f0c30222cfa", "hash_cont_tokens": "c3385a2b3ed50305" }, "truncated": 0, "non_truncated": 378, "padded": 1506, "non_padded": 6, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:formal_logic|5": { "hashes": { "hash_examples": "8635216e1909a03f", "hash_full_prompts": "1171d04f3b1a11f5", "hash_input_tokens": "1c20d8fd0dcd2b9e", "hash_cont_tokens": "e259fd8e83f5eabe" }, "truncated": 0, "non_truncated": 126, "padded": 504, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:global_facts|5": { "hashes": { "hash_examples": "30b315aa6353ee47", "hash_full_prompts": "a7e56dbc074c7529", "hash_input_tokens": "4fe2c5925dbc174a", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_biology|5": { "hashes": { "hash_examples": "c9136373af2180de", "hash_full_prompts": "ad6e859ed978e04a", "hash_input_tokens": "ea52b862d9b7af2b", "hash_cont_tokens": "6251059b06be9d97" }, "truncated": 0, "non_truncated": 310, "padded": 1236, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_chemistry|5": { "hashes": { "hash_examples": "b0661bfa1add6404", "hash_full_prompts": "6eb9c04bcc8a8f2a", "hash_input_tokens": "057712c3fd4fe5dc", "hash_cont_tokens": "ce467bdc2825a0b2" }, "truncated": 0, "non_truncated": 203, "padded": 808, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_computer_science|5": { "hashes": { "hash_examples": "80fc1d623a3d665f", "hash_full_prompts": "8e51bc91c81cf8dd", "hash_input_tokens": "efbba49c91a4a950", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_european_history|5": { "hashes": { "hash_examples": "854da6e5af0fe1a1", "hash_full_prompts": "664a1f16c9f3195c", "hash_input_tokens": "26218866485baf4e", "hash_cont_tokens": "f02ad2401d7aa667" }, "truncated": 0, "non_truncated": 165, "padded": 656, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_geography|5": { "hashes": { "hash_examples": "7dc963c7acd19ad8", "hash_full_prompts": "f3acf911f4023c8a", "hash_input_tokens": "56cbe629d18ead5d", "hash_cont_tokens": "26d9256a0ab4eece" }, "truncated": 0, "non_truncated": 198, "padded": 785, "non_padded": 7, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_government_and_politics|5": { "hashes": { "hash_examples": "1f675dcdebc9758f", "hash_full_prompts": "066254feaa3158ae", "hash_input_tokens": "b86d166b67953159", "hash_cont_tokens": "990c9084748f34ab" }, "truncated": 0, "non_truncated": 193, "padded": 772, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_macroeconomics|5": { "hashes": { "hash_examples": "2fb32cf2d80f0b35", "hash_full_prompts": "19a7fa502aa85c95", "hash_input_tokens": "75b5836573d4418d", "hash_cont_tokens": "76312c2ea8fc4f71" }, "truncated": 0, "non_truncated": 390, "padded": 1552, "non_padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_mathematics|5": { "hashes": { "hash_examples": "fd6646fdb5d58a1f", "hash_full_prompts": "4f704e369778b5b0", "hash_input_tokens": "5bc4d612b64cb82b", "hash_cont_tokens": "369a1a933960fad5" }, "truncated": 0, "non_truncated": 270, "padded": 1048, "non_padded": 32, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_microeconomics|5": { "hashes": { "hash_examples": "2118f21f71d87d84", "hash_full_prompts": "4350f9e2240f8010", "hash_input_tokens": "f53ae91ff33dea98", "hash_cont_tokens": "ce39343f06b04c0c" }, "truncated": 0, "non_truncated": 238, "padded": 924, "non_padded": 28, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_physics|5": { "hashes": { "hash_examples": "dc3ce06378548565", "hash_full_prompts": "5dc0d6831b66188f", "hash_input_tokens": "23181b0f6dc1e876", "hash_cont_tokens": "34c4d04275713047" }, "truncated": 0, "non_truncated": 151, "padded": 604, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_psychology|5": { "hashes": { "hash_examples": "c8d1d98a40e11f2f", "hash_full_prompts": "af2b097da6d50365", "hash_input_tokens": "1c03a6aa2ccd7497", "hash_cont_tokens": "76367d535c896191" }, "truncated": 0, "non_truncated": 545, "padded": 2176, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_statistics|5": { "hashes": { "hash_examples": "666c8759b98ee4ff", "hash_full_prompts": "c757694421d6d68d", "hash_input_tokens": "6c3d4c89ebb17624", "hash_cont_tokens": "a076a9a5529c4701" }, "truncated": 0, "non_truncated": 216, "padded": 864, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_us_history|5": { "hashes": { "hash_examples": "95fef1c4b7d3f81e", "hash_full_prompts": "e34a028d0ddeec5e", "hash_input_tokens": "3789cc86ffa04ee6", "hash_cont_tokens": "ff9e65faaa6206d3" }, "truncated": 0, "non_truncated": 204, "padded": 816, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_world_history|5": { "hashes": { "hash_examples": "7e5085b6184b0322", "hash_full_prompts": "1fa3d51392765601", "hash_input_tokens": "e2ade10b727cc567", "hash_cont_tokens": "91d0b99f637c395d" }, "truncated": 0, "non_truncated": 237, "padded": 948, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:human_aging|5": { "hashes": { "hash_examples": "c17333e7c7c10797", "hash_full_prompts": "cac900721f9a1a94", "hash_input_tokens": "e73491e153435aef", "hash_cont_tokens": "503a59d0c8fd9fda" }, "truncated": 0, "non_truncated": 223, "padded": 892, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:human_sexuality|5": { "hashes": { "hash_examples": "4edd1e9045df5e3d", "hash_full_prompts": "0d6567bafee0a13c", "hash_input_tokens": "5aac17e145c73388", "hash_cont_tokens": "2ef8023fd099e328" }, "truncated": 0, "non_truncated": 131, "padded": 524, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:international_law|5": { "hashes": { "hash_examples": "db2fa00d771a062a", "hash_full_prompts": "d018f9116479795e", "hash_input_tokens": "f823a8f71decfd4e", "hash_cont_tokens": "1d135acf09cc77d7" }, "truncated": 0, "non_truncated": 121, "padded": 484, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:jurisprudence|5": { "hashes": { "hash_examples": "e956f86b124076fe", "hash_full_prompts": "1487e89a10ec58b7", "hash_input_tokens": "1296fb073d25fdbd", "hash_cont_tokens": "2bc5403ae73a42ee" }, "truncated": 0, "non_truncated": 108, "padded": 420, "non_padded": 12, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:logical_fallacies|5": { "hashes": { "hash_examples": "956e0e6365ab79f1", "hash_full_prompts": "677785b2181f9243", "hash_input_tokens": "5af73357ea4a33af", "hash_cont_tokens": "343532d46d0dd784" }, "truncated": 0, "non_truncated": 163, "padded": 648, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:machine_learning|5": { "hashes": { "hash_examples": "397997cc6f4d581e", "hash_full_prompts": "769ee14a2aea49bb", "hash_input_tokens": "36e7ee7692d6cb84", "hash_cont_tokens": "ffa678813759b3dc" }, "truncated": 0, "non_truncated": 112, "padded": 448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:management|5": { "hashes": { "hash_examples": "2bcbe6f6ca63d740", "hash_full_prompts": "cb1ff9dac9582144", "hash_input_tokens": "b6132d503a6b6fe1", "hash_cont_tokens": "c86a690085fd954f" }, "truncated": 0, "non_truncated": 103, "padded": 412, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:marketing|5": { "hashes": { "hash_examples": "8ddb20d964a1b065", "hash_full_prompts": "9fc2114a187ad9a2", "hash_input_tokens": "cea9c7023072b6de", "hash_cont_tokens": "a59b0611811f3b0d" }, "truncated": 0, "non_truncated": 234, "padded": 892, "non_padded": 44, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:medical_genetics|5": { "hashes": { "hash_examples": "182a71f4763d2cea", "hash_full_prompts": "46a616fa51878959", "hash_input_tokens": "26a5d3aa11ad9928", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:miscellaneous|5": { "hashes": { "hash_examples": "4c404fdbb4ca57fc", "hash_full_prompts": "0813e1be36dbaae1", "hash_input_tokens": "9368b6cccdca09c2", "hash_cont_tokens": "468b0141d91cbe14" }, "truncated": 0, "non_truncated": 783, "padded": 3132, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:moral_disputes|5": { "hashes": { "hash_examples": "60cbd2baa3fea5c9", "hash_full_prompts": "1d14adebb9b62519", "hash_input_tokens": "24ac7096ff952fb3", "hash_cont_tokens": "5513cb6f0e3d0039" }, "truncated": 0, "non_truncated": 346, "padded": 1380, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:moral_scenarios|5": { "hashes": { "hash_examples": "fd8b0431fbdd75ef", "hash_full_prompts": "b80d3d236165e3de", "hash_input_tokens": "c388cdbaa92ff35d", "hash_cont_tokens": "9fd58c2ac3e72795" }, "truncated": 0, "non_truncated": 895, "padded": 3447, "non_padded": 133, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:nutrition|5": { "hashes": { "hash_examples": "71e55e2b829b6528", "hash_full_prompts": "2bfb18e5fab8dea7", "hash_input_tokens": "61d681ff34503244", "hash_cont_tokens": "3294c626ca103ea9" }, "truncated": 0, "non_truncated": 306, "padded": 1224, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:philosophy|5": { "hashes": { "hash_examples": "a6d489a8d208fa4b", "hash_full_prompts": "e8c0d5b6dae3ccc8", "hash_input_tokens": "99dccba498573ce7", "hash_cont_tokens": "84fab74fb9dbe7cc" }, "truncated": 0, "non_truncated": 311, "padded": 1244, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:prehistory|5": { "hashes": { "hash_examples": "6cc50f032a19acaa", "hash_full_prompts": "4a6a1d3ab1bf28e4", "hash_input_tokens": "0b5b291e21537231", "hash_cont_tokens": "65ac3d5bc3a7107a" }, "truncated": 0, "non_truncated": 324, "padded": 1256, "non_padded": 40, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_accounting|5": { "hashes": { "hash_examples": "50f57ab32f5f6cea", "hash_full_prompts": "e60129bd2d82ffc6", "hash_input_tokens": "dfdbfa66c3879e04", "hash_cont_tokens": "3408ef87473c956e" }, "truncated": 0, "non_truncated": 282, "padded": 1108, "non_padded": 20, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_law|5": { "hashes": { "hash_examples": "a8fdc85c64f4b215", "hash_full_prompts": "0dbb1d9b72dcea03", "hash_input_tokens": "c5e40216c766fc5d", "hash_cont_tokens": "ee53bac4bdeb7c6f" }, "truncated": 0, "non_truncated": 1534, "padded": 6136, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_medicine|5": { "hashes": { "hash_examples": "c373a28a3050a73a", "hash_full_prompts": "5e040f9ca68b089e", "hash_input_tokens": "6bb2fb0a41e6e74a", "hash_cont_tokens": "5ba90d13b887dd10" }, "truncated": 0, "non_truncated": 272, "padded": 1088, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_psychology|5": { "hashes": { "hash_examples": "bf5254fe818356af", "hash_full_prompts": "b386ecda8b87150e", "hash_input_tokens": "d5882dcb2ba36239", "hash_cont_tokens": "81d224ca3a7cd1f8" }, "truncated": 0, "non_truncated": 612, "padded": 2448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:public_relations|5": { "hashes": { "hash_examples": "b66d52e28e7d14e0", "hash_full_prompts": "fe43562263e25677", "hash_input_tokens": "89e2f611b0e690d5", "hash_cont_tokens": "97fc092d5801cddc" }, "truncated": 0, "non_truncated": 110, "padded": 432, "non_padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:security_studies|5": { "hashes": { "hash_examples": "514c14feaf000ad9", "hash_full_prompts": "27d4a2ac541ef4b9", "hash_input_tokens": "da3c969306757935", "hash_cont_tokens": "084e0267d5a5a853" }, "truncated": 0, "non_truncated": 245, "padded": 980, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:sociology|5": { "hashes": { "hash_examples": "f6c9bc9d18c80870", "hash_full_prompts": "c072ea7d1a1524f2", "hash_input_tokens": "cba49ace6cf739f2", "hash_cont_tokens": "c654d898bec36354" }, "truncated": 0, "non_truncated": 201, "padded": 804, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:us_foreign_policy|5": { "hashes": { "hash_examples": "ed7b78629db6678f", "hash_full_prompts": "341a97ca3e4d699d", "hash_input_tokens": "3f5014206033d5c8", "hash_cont_tokens": "b057f3f1d84e35e5" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:virology|5": { "hashes": { "hash_examples": "bc52ffdc3f9b994a", "hash_full_prompts": "651d471e2eb8b5e9", "hash_input_tokens": "21ca3e01eff77c5a", "hash_cont_tokens": "648f351cffb42342" }, "truncated": 0, "non_truncated": 166, "padded": 664, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:world_religions|5": { "hashes": { "hash_examples": "ecdb4a4f94f62930", "hash_full_prompts": "3773f03542ce44a3", "hash_input_tokens": "7904d43bc25add76", "hash_cont_tokens": "6cee5406c3f21c2e" }, "truncated": 0, "non_truncated": 171, "padded": 684, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "341a076d0beb7048", "hash_full_prompts": "a5c8f2b7ff4f5ae2", "hash_input_tokens": "74ed818d997ed10b", "hash_cont_tokens": "edf1783519f209b9" }, "truncated": 0, "non_truncated": 14042, "padded": 55806, "non_padded": 362, "num_truncated_few_shots": 0 } }