{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "", "start_time": 6407207.96520301, "end_time": 6418340.110943477, "total_evaluation_time_secondes": "11132.145740467124", "model_name": "Qwen/Qwen1.5-72B-Chat", "model_sha": "1a6ccc1215278f962c794b1848c710c29ef4053d", "model_dtype": "torch.bfloat16", "model_size": "135.9 GB", "config": null }, "results": { "lighteval|mmlu:abstract_algebra|5": { "acc": 0.53, "acc_stderr": 0.050161355804659205 }, "lighteval|mmlu:anatomy|5": { "acc": 0.7333333333333333, "acc_stderr": 0.038201699145179055 }, "lighteval|mmlu:astronomy|5": { "acc": 0.8881578947368421, "acc_stderr": 0.025648341251693612 }, "lighteval|mmlu:business_ethics|5": { "acc": 0.82, "acc_stderr": 0.038612291966536955 }, "lighteval|mmlu:clinical_knowledge|5": { "acc": 0.7962264150943397, "acc_stderr": 0.024790784501775402 }, "lighteval|mmlu:college_biology|5": { "acc": 0.9097222222222222, "acc_stderr": 0.023964965777906935 }, "lighteval|mmlu:college_chemistry|5": { "acc": 0.6, "acc_stderr": 0.049236596391733084 }, "lighteval|mmlu:college_computer_science|5": { "acc": 0.65, "acc_stderr": 0.04793724854411019 }, "lighteval|mmlu:college_mathematics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795 }, "lighteval|mmlu:college_medicine|5": { "acc": 0.7514450867052023, "acc_stderr": 0.03295304696818318 }, "lighteval|mmlu:college_physics|5": { "acc": 0.5686274509803921, "acc_stderr": 0.04928099597287534 }, "lighteval|mmlu:computer_security|5": { "acc": 0.83, "acc_stderr": 0.03775251680686371 }, "lighteval|mmlu:conceptual_physics|5": { "acc": 0.8042553191489362, "acc_stderr": 0.025937853139977148 }, "lighteval|mmlu:econometrics|5": { "acc": 0.5789473684210527, "acc_stderr": 0.04644602091222317 }, "lighteval|mmlu:electrical_engineering|5": { "acc": 0.8137931034482758, "acc_stderr": 0.03243946159004616 }, "lighteval|mmlu:elementary_mathematics|5": { "acc": 0.6587301587301587, "acc_stderr": 0.02441923496681907 }, "lighteval|mmlu:formal_logic|5": { "acc": 0.6111111111111112, "acc_stderr": 0.04360314860077459 }, "lighteval|mmlu:global_facts|5": { "acc": 0.55, "acc_stderr": 0.05 }, "lighteval|mmlu:high_school_biology|5": { "acc": 0.8935483870967742, "acc_stderr": 0.01754510295165663 }, "lighteval|mmlu:high_school_chemistry|5": { "acc": 0.6798029556650246, "acc_stderr": 0.032826493853041504 }, "lighteval|mmlu:high_school_computer_science|5": { "acc": 0.79, "acc_stderr": 0.040936018074033256 }, "lighteval|mmlu:high_school_european_history|5": { "acc": 0.8484848484848485, "acc_stderr": 0.027998073798781675 }, "lighteval|mmlu:high_school_geography|5": { "acc": 0.9090909090909091, "acc_stderr": 0.020482086775424208 }, "lighteval|mmlu:high_school_government_and_politics|5": { "acc": 0.9896373056994818, "acc_stderr": 0.007308424386792208 }, "lighteval|mmlu:high_school_macroeconomics|5": { "acc": 0.8025641025641026, "acc_stderr": 0.02018264696867484 }, "lighteval|mmlu:high_school_mathematics|5": { "acc": 0.5185185185185185, "acc_stderr": 0.030464621718895322 }, "lighteval|mmlu:high_school_microeconomics|5": { "acc": 0.8403361344537815, "acc_stderr": 0.023793353997528802 }, "lighteval|mmlu:high_school_physics|5": { "acc": 0.543046357615894, "acc_stderr": 0.04067325174247443 }, "lighteval|mmlu:high_school_psychology|5": { "acc": 0.9229357798165138, "acc_stderr": 0.011434381698911096 }, "lighteval|mmlu:high_school_statistics|5": { "acc": 0.6666666666666666, "acc_stderr": 0.03214952147802749 }, "lighteval|mmlu:high_school_us_history|5": { "acc": 0.9117647058823529, "acc_stderr": 0.01990739979131694 }, "lighteval|mmlu:high_school_world_history|5": { "acc": 0.9113924050632911, "acc_stderr": 0.018498315206865384 }, "lighteval|mmlu:human_aging|5": { "acc": 0.8071748878923767, "acc_stderr": 0.026478240960489365 }, "lighteval|mmlu:human_sexuality|5": { "acc": 0.8778625954198473, "acc_stderr": 0.02871877688934232 }, "lighteval|mmlu:international_law|5": { "acc": 0.8429752066115702, "acc_stderr": 0.03321244842547128 }, "lighteval|mmlu:jurisprudence|5": { "acc": 0.8703703703703703, "acc_stderr": 0.032472243899179465 }, "lighteval|mmlu:logical_fallacies|5": { "acc": 0.8466257668711656, "acc_stderr": 0.0283116014414386 }, "lighteval|mmlu:machine_learning|5": { "acc": 0.5892857142857143, "acc_stderr": 0.04669510663875191 }, "lighteval|mmlu:management|5": { "acc": 0.8640776699029126, "acc_stderr": 0.03393295729761011 }, "lighteval|mmlu:marketing|5": { "acc": 0.9316239316239316, "acc_stderr": 0.01653462768431136 }, "lighteval|mmlu:medical_genetics|5": { "acc": 0.82, "acc_stderr": 0.038612291966536955 }, "lighteval|mmlu:miscellaneous|5": { "acc": 0.9118773946360154, "acc_stderr": 0.010136978203312642 }, "lighteval|mmlu:moral_disputes|5": { "acc": 0.8323699421965318, "acc_stderr": 0.02011057991973484 }, "lighteval|mmlu:moral_scenarios|5": { "acc": 0.735195530726257, "acc_stderr": 0.014756906483260659 }, "lighteval|mmlu:nutrition|5": { "acc": 0.8366013071895425, "acc_stderr": 0.0211706230112135 }, "lighteval|mmlu:philosophy|5": { "acc": 0.8102893890675241, "acc_stderr": 0.02226819625878323 }, "lighteval|mmlu:prehistory|5": { "acc": 0.8549382716049383, "acc_stderr": 0.01959487701972797 }, "lighteval|mmlu:professional_accounting|5": { "acc": 0.6099290780141844, "acc_stderr": 0.02909767559946393 }, "lighteval|mmlu:professional_law|5": { "acc": 0.6049543676662321, "acc_stderr": 0.012485727813251565 }, "lighteval|mmlu:professional_medicine|5": { "acc": 0.8308823529411765, "acc_stderr": 0.022770868010112987 }, "lighteval|mmlu:professional_psychology|5": { "acc": 0.8071895424836601, "acc_stderr": 0.015959983971206737 }, "lighteval|mmlu:public_relations|5": { "acc": 0.7454545454545455, "acc_stderr": 0.04172343038705383 }, "lighteval|mmlu:security_studies|5": { "acc": 0.7918367346938775, "acc_stderr": 0.025991117672813296 }, "lighteval|mmlu:sociology|5": { "acc": 0.8756218905472637, "acc_stderr": 0.023335401790166327 }, "lighteval|mmlu:us_foreign_policy|5": { "acc": 0.94, "acc_stderr": 0.023868325657594197 }, "lighteval|mmlu:virology|5": { "acc": 0.5963855421686747, "acc_stderr": 0.03819486140758398 }, "lighteval|mmlu:world_religions|5": { "acc": 0.8713450292397661, "acc_stderr": 0.02567934272327692 }, "lighteval|mmlu:_average|5": { "acc": 0.7710000982834756, "acc_stderr": 0.02961245822933924 } }, "versions": { "lighteval|mmlu:abstract_algebra|5": 0, "lighteval|mmlu:anatomy|5": 0, "lighteval|mmlu:astronomy|5": 0, "lighteval|mmlu:business_ethics|5": 0, "lighteval|mmlu:clinical_knowledge|5": 0, "lighteval|mmlu:college_biology|5": 0, "lighteval|mmlu:college_chemistry|5": 0, "lighteval|mmlu:college_computer_science|5": 0, "lighteval|mmlu:college_mathematics|5": 0, "lighteval|mmlu:college_medicine|5": 0, "lighteval|mmlu:college_physics|5": 0, "lighteval|mmlu:computer_security|5": 0, "lighteval|mmlu:conceptual_physics|5": 0, "lighteval|mmlu:econometrics|5": 0, "lighteval|mmlu:electrical_engineering|5": 0, "lighteval|mmlu:elementary_mathematics|5": 0, "lighteval|mmlu:formal_logic|5": 0, "lighteval|mmlu:global_facts|5": 0, "lighteval|mmlu:high_school_biology|5": 0, "lighteval|mmlu:high_school_chemistry|5": 0, "lighteval|mmlu:high_school_computer_science|5": 0, "lighteval|mmlu:high_school_european_history|5": 0, "lighteval|mmlu:high_school_geography|5": 0, "lighteval|mmlu:high_school_government_and_politics|5": 0, "lighteval|mmlu:high_school_macroeconomics|5": 0, "lighteval|mmlu:high_school_mathematics|5": 0, "lighteval|mmlu:high_school_microeconomics|5": 0, "lighteval|mmlu:high_school_physics|5": 0, "lighteval|mmlu:high_school_psychology|5": 0, "lighteval|mmlu:high_school_statistics|5": 0, "lighteval|mmlu:high_school_us_history|5": 0, "lighteval|mmlu:high_school_world_history|5": 0, "lighteval|mmlu:human_aging|5": 0, "lighteval|mmlu:human_sexuality|5": 0, "lighteval|mmlu:international_law|5": 0, "lighteval|mmlu:jurisprudence|5": 0, "lighteval|mmlu:logical_fallacies|5": 0, "lighteval|mmlu:machine_learning|5": 0, "lighteval|mmlu:management|5": 0, "lighteval|mmlu:marketing|5": 0, "lighteval|mmlu:medical_genetics|5": 0, "lighteval|mmlu:miscellaneous|5": 0, "lighteval|mmlu:moral_disputes|5": 0, "lighteval|mmlu:moral_scenarios|5": 0, "lighteval|mmlu:nutrition|5": 0, "lighteval|mmlu:philosophy|5": 0, "lighteval|mmlu:prehistory|5": 0, "lighteval|mmlu:professional_accounting|5": 0, "lighteval|mmlu:professional_law|5": 0, "lighteval|mmlu:professional_medicine|5": 0, "lighteval|mmlu:professional_psychology|5": 0, "lighteval|mmlu:public_relations|5": 0, "lighteval|mmlu:security_studies|5": 0, "lighteval|mmlu:sociology|5": 0, "lighteval|mmlu:us_foreign_policy|5": 0, "lighteval|mmlu:virology|5": 0, "lighteval|mmlu:world_religions|5": 0 }, "config_tasks": { "lighteval|mmlu:abstract_algebra": { "name": "mmlu:abstract_algebra", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "abstract_algebra", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:anatomy": { "name": "mmlu:anatomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "anatomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 135, "effective_num_docs": 135 }, "lighteval|mmlu:astronomy": { "name": "mmlu:astronomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "astronomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 152, "effective_num_docs": 152 }, "lighteval|mmlu:business_ethics": { "name": "mmlu:business_ethics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "business_ethics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:clinical_knowledge": { "name": "mmlu:clinical_knowledge", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "clinical_knowledge", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 265, "effective_num_docs": 265 }, "lighteval|mmlu:college_biology": { "name": "mmlu:college_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 144, "effective_num_docs": 144 }, "lighteval|mmlu:college_chemistry": { "name": "mmlu:college_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:college_computer_science": { "name": "mmlu:college_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:college_mathematics": { "name": "mmlu:college_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:college_medicine": { "name": "mmlu:college_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 173, "effective_num_docs": 173 }, "lighteval|mmlu:college_physics": { "name": "mmlu:college_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 102, "effective_num_docs": 102 }, "lighteval|mmlu:computer_security": { "name": "mmlu:computer_security", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "computer_security", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:conceptual_physics": { "name": "mmlu:conceptual_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "conceptual_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 235, "effective_num_docs": 235 }, "lighteval|mmlu:econometrics": { "name": "mmlu:econometrics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "econometrics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 114, "effective_num_docs": 114 }, "lighteval|mmlu:electrical_engineering": { "name": "mmlu:electrical_engineering", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "electrical_engineering", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 145, "effective_num_docs": 145 }, "lighteval|mmlu:elementary_mathematics": { "name": "mmlu:elementary_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "elementary_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 378, "effective_num_docs": 378 }, "lighteval|mmlu:formal_logic": { "name": "mmlu:formal_logic", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "formal_logic", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 126, "effective_num_docs": 126 }, "lighteval|mmlu:global_facts": { "name": "mmlu:global_facts", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "global_facts", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:high_school_biology": { "name": "mmlu:high_school_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 310, "effective_num_docs": 310 }, "lighteval|mmlu:high_school_chemistry": { "name": "mmlu:high_school_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 203, "effective_num_docs": 203 }, "lighteval|mmlu:high_school_computer_science": { "name": "mmlu:high_school_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:high_school_european_history": { "name": "mmlu:high_school_european_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_european_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 165, "effective_num_docs": 165 }, "lighteval|mmlu:high_school_geography": { "name": "mmlu:high_school_geography", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_geography", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 198, "effective_num_docs": 198 }, "lighteval|mmlu:high_school_government_and_politics": { "name": "mmlu:high_school_government_and_politics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_government_and_politics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 193, "effective_num_docs": 193 }, "lighteval|mmlu:high_school_macroeconomics": { "name": "mmlu:high_school_macroeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_macroeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 390, "effective_num_docs": 390 }, "lighteval|mmlu:high_school_mathematics": { "name": "mmlu:high_school_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 270, "effective_num_docs": 270 }, "lighteval|mmlu:high_school_microeconomics": { "name": "mmlu:high_school_microeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_microeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 238, "effective_num_docs": 238 }, "lighteval|mmlu:high_school_physics": { "name": "mmlu:high_school_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 151, "effective_num_docs": 151 }, "lighteval|mmlu:high_school_psychology": { "name": "mmlu:high_school_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 545, "effective_num_docs": 545 }, "lighteval|mmlu:high_school_statistics": { "name": "mmlu:high_school_statistics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_statistics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 216, "effective_num_docs": 216 }, "lighteval|mmlu:high_school_us_history": { "name": "mmlu:high_school_us_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_us_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 204, "effective_num_docs": 204 }, "lighteval|mmlu:high_school_world_history": { "name": "mmlu:high_school_world_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_world_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 237, "effective_num_docs": 237 }, "lighteval|mmlu:human_aging": { "name": "mmlu:human_aging", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_aging", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 223, "effective_num_docs": 223 }, "lighteval|mmlu:human_sexuality": { "name": "mmlu:human_sexuality", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_sexuality", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 131, "effective_num_docs": 131 }, "lighteval|mmlu:international_law": { "name": "mmlu:international_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "international_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 121, "effective_num_docs": 121 }, "lighteval|mmlu:jurisprudence": { "name": "mmlu:jurisprudence", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "jurisprudence", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 108, "effective_num_docs": 108 }, "lighteval|mmlu:logical_fallacies": { "name": "mmlu:logical_fallacies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "logical_fallacies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 163, "effective_num_docs": 163 }, "lighteval|mmlu:machine_learning": { "name": "mmlu:machine_learning", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "machine_learning", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 112, "effective_num_docs": 112 }, "lighteval|mmlu:management": { "name": "mmlu:management", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "management", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 103, "effective_num_docs": 103 }, "lighteval|mmlu:marketing": { "name": "mmlu:marketing", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "marketing", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 234, "effective_num_docs": 234 }, "lighteval|mmlu:medical_genetics": { "name": "mmlu:medical_genetics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "medical_genetics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:miscellaneous": { "name": "mmlu:miscellaneous", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "miscellaneous", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 783, "effective_num_docs": 783 }, "lighteval|mmlu:moral_disputes": { "name": "mmlu:moral_disputes", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_disputes", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 346, "effective_num_docs": 346 }, "lighteval|mmlu:moral_scenarios": { "name": "mmlu:moral_scenarios", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_scenarios", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 895, "effective_num_docs": 895 }, "lighteval|mmlu:nutrition": { "name": "mmlu:nutrition", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "nutrition", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 306, "effective_num_docs": 306 }, "lighteval|mmlu:philosophy": { "name": "mmlu:philosophy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "philosophy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 311, "effective_num_docs": 311 }, "lighteval|mmlu:prehistory": { "name": "mmlu:prehistory", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "prehistory", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 324, "effective_num_docs": 324 }, "lighteval|mmlu:professional_accounting": { "name": "mmlu:professional_accounting", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_accounting", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 282, "effective_num_docs": 282 }, "lighteval|mmlu:professional_law": { "name": "mmlu:professional_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 1534, "effective_num_docs": 1534 }, "lighteval|mmlu:professional_medicine": { "name": "mmlu:professional_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 272, "effective_num_docs": 272 }, "lighteval|mmlu:professional_psychology": { "name": "mmlu:professional_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 612, "effective_num_docs": 612 }, "lighteval|mmlu:public_relations": { "name": "mmlu:public_relations", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "public_relations", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 110, "effective_num_docs": 110 }, "lighteval|mmlu:security_studies": { "name": "mmlu:security_studies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "security_studies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 245, "effective_num_docs": 245 }, "lighteval|mmlu:sociology": { "name": "mmlu:sociology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "sociology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 201, "effective_num_docs": 201 }, "lighteval|mmlu:us_foreign_policy": { "name": "mmlu:us_foreign_policy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "us_foreign_policy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100 }, "lighteval|mmlu:virology": { "name": "mmlu:virology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "virology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 166, "effective_num_docs": 166 }, "lighteval|mmlu:world_religions": { "name": "mmlu:world_religions", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "world_religions", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "mmlu" ], "original_num_docs": 171, "effective_num_docs": 171 } }, "summary_tasks": { "lighteval|mmlu:abstract_algebra|5": { "hashes": { "hash_examples": "4c76229e00c9c0e9", "hash_full_prompts": "273278cb9fb5ac01", "hash_input_tokens": "caf9777ccf71eab5", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:anatomy|5": { "hashes": { "hash_examples": "6a1f8104dccbd33b", "hash_full_prompts": "e77b5ebe030aabba", "hash_input_tokens": "d192cd7584fda4dc", "hash_cont_tokens": "263324e6ce7f9b36" }, "truncated": 0, "non_truncated": 135, "padded": 540, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:astronomy|5": { "hashes": { "hash_examples": "1302effa3a76ce4c", "hash_full_prompts": "0ff37ef4519e63f9", "hash_input_tokens": "d241783f0bfdf860", "hash_cont_tokens": "18ba399c6801138e" }, "truncated": 0, "non_truncated": 152, "padded": 608, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:business_ethics|5": { "hashes": { "hash_examples": "03cb8bce5336419a", "hash_full_prompts": "7c4d312a23bdd669", "hash_input_tokens": "0aee5ed969278926", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:clinical_knowledge|5": { "hashes": { "hash_examples": "ffbb9c7b2be257f9", "hash_full_prompts": "472d93369b1a8382", "hash_input_tokens": "aa05960be77863d3", "hash_cont_tokens": "9d7500060e0dd995" }, "truncated": 0, "non_truncated": 265, "padded": 1060, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:college_biology|5": { "hashes": { "hash_examples": "3ee77f176f38eb8e", "hash_full_prompts": "6853bf027b349083", "hash_input_tokens": "3843b5375a04262c", "hash_cont_tokens": "78a731af5d2f6472" }, "truncated": 0, "non_truncated": 144, "padded": 576, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:college_chemistry|5": { "hashes": { "hash_examples": "ce61a69c46d47aeb", "hash_full_prompts": "e0f8624971f7af71", "hash_input_tokens": "2096d1652e232764", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:college_computer_science|5": { "hashes": { "hash_examples": "32805b52d7d5daab", "hash_full_prompts": "841e9d2ecfbb104d", "hash_input_tokens": "1e007ac047722e9b", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:college_mathematics|5": { "hashes": { "hash_examples": "55da1a0a0bd33722", "hash_full_prompts": "696c5f73522b8706", "hash_input_tokens": "c3061d57b5a4ad7e", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:college_medicine|5": { "hashes": { "hash_examples": "c33e143163049176", "hash_full_prompts": "7d2530816f672426", "hash_input_tokens": "4cddd091001776d7", "hash_cont_tokens": "699c8eb24e3e446b" }, "truncated": 0, "non_truncated": 173, "padded": 692, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:college_physics|5": { "hashes": { "hash_examples": "ebdab1cdb7e555df", "hash_full_prompts": "66b3a61507c4c92b", "hash_input_tokens": "821b169941167548", "hash_cont_tokens": "075997110cbe055e" }, "truncated": 0, "non_truncated": 102, "padded": 408, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:computer_security|5": { "hashes": { "hash_examples": "a24fd7d08a560921", "hash_full_prompts": "f1143da88158bf03", "hash_input_tokens": "02e64465d74344b4", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:conceptual_physics|5": { "hashes": { "hash_examples": "8300977a79386993", "hash_full_prompts": "d2b4c706b65a71d9", "hash_input_tokens": "5c7a2235529d2821", "hash_cont_tokens": "f22daa6d4818086f" }, "truncated": 0, "non_truncated": 235, "padded": 940, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:econometrics|5": { "hashes": { "hash_examples": "ddde36788a04a46f", "hash_full_prompts": "aa5255d923b0e3a3", "hash_input_tokens": "e0a79ea9e037599d", "hash_cont_tokens": "26791a0b1941b4c4" }, "truncated": 0, "non_truncated": 114, "padded": 456, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:electrical_engineering|5": { "hashes": { "hash_examples": "acbc5def98c19b3f", "hash_full_prompts": "c1f9a9087987d1d7", "hash_input_tokens": "e48ddb58b2efa8e3", "hash_cont_tokens": "3e336577994f6c0d" }, "truncated": 0, "non_truncated": 145, "padded": 580, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:elementary_mathematics|5": { "hashes": { "hash_examples": "146e61d07497a9bd", "hash_full_prompts": "57fb9ddf2f814bb5", "hash_input_tokens": "9e81373b5265da10", "hash_cont_tokens": "1d6bbfa8a67327c8" }, "truncated": 0, "non_truncated": 378, "padded": 1512, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:formal_logic|5": { "hashes": { "hash_examples": "8635216e1909a03f", "hash_full_prompts": "dc7e34e04346adfd", "hash_input_tokens": "0378ed1f1a9bb3f6", "hash_cont_tokens": "60508d85eb7693a4" }, "truncated": 0, "non_truncated": 126, "padded": 504, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:global_facts|5": { "hashes": { "hash_examples": "30b315aa6353ee47", "hash_full_prompts": "7dedb5baa45f3a38", "hash_input_tokens": "d20db9bd82fb76c1", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_biology|5": { "hashes": { "hash_examples": "c9136373af2180de", "hash_full_prompts": "15157813fc668acf", "hash_input_tokens": "c3c10eef8c477c93", "hash_cont_tokens": "d236ce982144e65f" }, "truncated": 0, "non_truncated": 310, "padded": 1240, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_chemistry|5": { "hashes": { "hash_examples": "b0661bfa1add6404", "hash_full_prompts": "f51dfd92a2d6fdba", "hash_input_tokens": "dc53c87961ef4ab5", "hash_cont_tokens": "59f93238ec5aead6" }, "truncated": 0, "non_truncated": 203, "padded": 812, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_computer_science|5": { "hashes": { "hash_examples": "80fc1d623a3d665f", "hash_full_prompts": "fe432a03fe8cc766", "hash_input_tokens": "61fa356c3ea98372", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_european_history|5": { "hashes": { "hash_examples": "854da6e5af0fe1a1", "hash_full_prompts": "09a62e1560fb1171", "hash_input_tokens": "272f8d31300ef0af", "hash_cont_tokens": "7b7414d6a5da3d91" }, "truncated": 0, "non_truncated": 165, "padded": 656, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_geography|5": { "hashes": { "hash_examples": "7dc963c7acd19ad8", "hash_full_prompts": "8284151c76cee4d8", "hash_input_tokens": "12624aed9bf6356b", "hash_cont_tokens": "1b66289e10988f84" }, "truncated": 0, "non_truncated": 198, "padded": 792, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_government_and_politics|5": { "hashes": { "hash_examples": "1f675dcdebc9758f", "hash_full_prompts": "083339a69a8bfafa", "hash_input_tokens": "32e30c43a4a5347e", "hash_cont_tokens": "5ab3c3415b1d3a55" }, "truncated": 0, "non_truncated": 193, "padded": 772, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_macroeconomics|5": { "hashes": { "hash_examples": "2fb32cf2d80f0b35", "hash_full_prompts": "ececedb0c4a4ffcd", "hash_input_tokens": "dc2cd6b398f5f86e", "hash_cont_tokens": "2f5457058d187374" }, "truncated": 0, "non_truncated": 390, "padded": 1557, "non_padded": 3, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_mathematics|5": { "hashes": { "hash_examples": "fd6646fdb5d58a1f", "hash_full_prompts": "d58a3ca5c8ed6780", "hash_input_tokens": "6f9c5ce6428dd87d", "hash_cont_tokens": "e35137cb972e1918" }, "truncated": 0, "non_truncated": 270, "padded": 1080, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_microeconomics|5": { "hashes": { "hash_examples": "2118f21f71d87d84", "hash_full_prompts": "bd49ce8a930e3e78", "hash_input_tokens": "44722cbe1d85e636", "hash_cont_tokens": "f756093278ebb83e" }, "truncated": 0, "non_truncated": 238, "padded": 908, "non_padded": 44, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_physics|5": { "hashes": { "hash_examples": "dc3ce06378548565", "hash_full_prompts": "3904af994b32b959", "hash_input_tokens": "2132f616c2587937", "hash_cont_tokens": "9cf883ebf1c82176" }, "truncated": 0, "non_truncated": 151, "padded": 604, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_psychology|5": { "hashes": { "hash_examples": "c8d1d98a40e11f2f", "hash_full_prompts": "d3a4d5dd3f3513f8", "hash_input_tokens": "6cc69cf1a89e4a88", "hash_cont_tokens": "bda0f77331ebb21a" }, "truncated": 0, "non_truncated": 545, "padded": 2178, "non_padded": 2, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_statistics|5": { "hashes": { "hash_examples": "666c8759b98ee4ff", "hash_full_prompts": "1b5599f9d4edc7de", "hash_input_tokens": "60af7a873b579818", "hash_cont_tokens": "4d04f014105a0bad" }, "truncated": 0, "non_truncated": 216, "padded": 864, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_us_history|5": { "hashes": { "hash_examples": "95fef1c4b7d3f81e", "hash_full_prompts": "001f7e7cc8185618", "hash_input_tokens": "8c2d01a0f291db69", "hash_cont_tokens": "f4590c58f12f2766" }, "truncated": 0, "non_truncated": 204, "padded": 816, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:high_school_world_history|5": { "hashes": { "hash_examples": "7e5085b6184b0322", "hash_full_prompts": "6a5c2a43cf7c6cb1", "hash_input_tokens": "612ed95e43bc21b5", "hash_cont_tokens": "db6bcddd891df5d9" }, "truncated": 0, "non_truncated": 237, "padded": 948, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:human_aging|5": { "hashes": { "hash_examples": "c17333e7c7c10797", "hash_full_prompts": "a3ad8e679fe07bef", "hash_input_tokens": "4c948b081b40ba31", "hash_cont_tokens": "25cec8d640319105" }, "truncated": 0, "non_truncated": 223, "padded": 892, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:human_sexuality|5": { "hashes": { "hash_examples": "4edd1e9045df5e3d", "hash_full_prompts": "3389ffb95929a661", "hash_input_tokens": "9e649cc80ef9f2fe", "hash_cont_tokens": "6778302b4a10b645" }, "truncated": 0, "non_truncated": 131, "padded": 524, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:international_law|5": { "hashes": { "hash_examples": "db2fa00d771a062a", "hash_full_prompts": "104f48c64f6f9622", "hash_input_tokens": "c51db1d4a2a87eed", "hash_cont_tokens": "9eb54e1a46032749" }, "truncated": 0, "non_truncated": 121, "padded": 484, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:jurisprudence|5": { "hashes": { "hash_examples": "e956f86b124076fe", "hash_full_prompts": "49295d36462ddc97", "hash_input_tokens": "a779a1b30bc13f30", "hash_cont_tokens": "f17d9a372cfd66b1" }, "truncated": 0, "non_truncated": 108, "padded": 420, "non_padded": 12, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:logical_fallacies|5": { "hashes": { "hash_examples": "956e0e6365ab79f1", "hash_full_prompts": "b64f452752d5cd23", "hash_input_tokens": "61d99e8d4d4d8652", "hash_cont_tokens": "cf44a68f5bca9a96" }, "truncated": 0, "non_truncated": 163, "padded": 648, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:machine_learning|5": { "hashes": { "hash_examples": "397997cc6f4d581e", "hash_full_prompts": "54da136ebd708042", "hash_input_tokens": "11e6731506fcf366", "hash_cont_tokens": "eace00d420f4f32c" }, "truncated": 0, "non_truncated": 112, "padded": 448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:management|5": { "hashes": { "hash_examples": "2bcbe6f6ca63d740", "hash_full_prompts": "a4b864ff27598ba3", "hash_input_tokens": "caffa6e4e80cbd5e", "hash_cont_tokens": "b7c51d0250c252d8" }, "truncated": 0, "non_truncated": 103, "padded": 412, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:marketing|5": { "hashes": { "hash_examples": "8ddb20d964a1b065", "hash_full_prompts": "c7183ac32f36104d", "hash_input_tokens": "5cd238ac5e8f19f4", "hash_cont_tokens": "086fb63f8b1d1339" }, "truncated": 0, "non_truncated": 234, "padded": 924, "non_padded": 12, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:medical_genetics|5": { "hashes": { "hash_examples": "182a71f4763d2cea", "hash_full_prompts": "c17b0a66e3027303", "hash_input_tokens": "46c0c8a573b43089", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:miscellaneous|5": { "hashes": { "hash_examples": "4c404fdbb4ca57fc", "hash_full_prompts": "bc5fa37ce20a2503", "hash_input_tokens": "5327cd4585062ac2", "hash_cont_tokens": "1827274fa6537077" }, "truncated": 0, "non_truncated": 783, "padded": 3132, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:moral_disputes|5": { "hashes": { "hash_examples": "60cbd2baa3fea5c9", "hash_full_prompts": "075742051236078f", "hash_input_tokens": "a2c9da202f686839", "hash_cont_tokens": "472c223f6f28cfc7" }, "truncated": 0, "non_truncated": 346, "padded": 1384, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:moral_scenarios|5": { "hashes": { "hash_examples": "fd8b0431fbdd75ef", "hash_full_prompts": "533c4700637599a2", "hash_input_tokens": "9a1a9f3900b372e6", "hash_cont_tokens": "e90dade00a092f9e" }, "truncated": 0, "non_truncated": 895, "padded": 3567, "non_padded": 13, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:nutrition|5": { "hashes": { "hash_examples": "71e55e2b829b6528", "hash_full_prompts": "02b6877dc5a603a6", "hash_input_tokens": "dd91fec063272e23", "hash_cont_tokens": "128e0ec97d96b165" }, "truncated": 0, "non_truncated": 306, "padded": 1224, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:philosophy|5": { "hashes": { "hash_examples": "a6d489a8d208fa4b", "hash_full_prompts": "0e65b5f40a9ceb20", "hash_input_tokens": "2255e15265a7d96a", "hash_cont_tokens": "cbfd7829a3e0f082" }, "truncated": 0, "non_truncated": 311, "padded": 1244, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:prehistory|5": { "hashes": { "hash_examples": "6cc50f032a19acaa", "hash_full_prompts": "e838e60749e4a598", "hash_input_tokens": "1b9b906efbcc97fd", "hash_cont_tokens": "9c0cf5a2f71afa7e" }, "truncated": 0, "non_truncated": 324, "padded": 1284, "non_padded": 12, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:professional_accounting|5": { "hashes": { "hash_examples": "50f57ab32f5f6cea", "hash_full_prompts": "9abf7319f68b7ba8", "hash_input_tokens": "d42c8275cd4e10e1", "hash_cont_tokens": "50f011c2453517ee" }, "truncated": 0, "non_truncated": 282, "padded": 1128, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:professional_law|5": { "hashes": { "hash_examples": "a8fdc85c64f4b215", "hash_full_prompts": "4074faf1eaedcfda", "hash_input_tokens": "215c854d27e741b8", "hash_cont_tokens": "73527e852c24186c" }, "truncated": 0, "non_truncated": 1534, "padded": 6136, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:professional_medicine|5": { "hashes": { "hash_examples": "c373a28a3050a73a", "hash_full_prompts": "e72202fc20fcab70", "hash_input_tokens": "5a6e9aaaaea83544", "hash_cont_tokens": "ceb7af5e2e789abc" }, "truncated": 0, "non_truncated": 272, "padded": 1088, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:professional_psychology|5": { "hashes": { "hash_examples": "bf5254fe818356af", "hash_full_prompts": "4dcb71c9ef602791", "hash_input_tokens": "316d0ba731b0de4f", "hash_cont_tokens": "8cfdced8a9667380" }, "truncated": 0, "non_truncated": 612, "padded": 2428, "non_padded": 20, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:public_relations|5": { "hashes": { "hash_examples": "b66d52e28e7d14e0", "hash_full_prompts": "c6050b1748185950", "hash_input_tokens": "2ba1d90c95e19dce", "hash_cont_tokens": "f8327461a9cc5123" }, "truncated": 0, "non_truncated": 110, "padded": 436, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:security_studies|5": { "hashes": { "hash_examples": "514c14feaf000ad9", "hash_full_prompts": "4c6786915b670d03", "hash_input_tokens": "b92f71eccf4f89bf", "hash_cont_tokens": "c30b0c4d52c2875d" }, "truncated": 0, "non_truncated": 245, "padded": 980, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:sociology|5": { "hashes": { "hash_examples": "f6c9bc9d18c80870", "hash_full_prompts": "a2e9a27e985a4e9b", "hash_input_tokens": "e821334ab55c0d44", "hash_cont_tokens": "eef4bd16d536fbd6" }, "truncated": 0, "non_truncated": 201, "padded": 804, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:us_foreign_policy|5": { "hashes": { "hash_examples": "ed7b78629db6678f", "hash_full_prompts": "46d0986398662d59", "hash_input_tokens": "9f6b40a7b6b8a3b2", "hash_cont_tokens": "00520b0ec06da34f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:virology|5": { "hashes": { "hash_examples": "bc52ffdc3f9b994a", "hash_full_prompts": "6b591e3983159283", "hash_input_tokens": "d7c6d39e149defc9", "hash_cont_tokens": "f5fc195e049353c0" }, "truncated": 0, "non_truncated": 166, "padded": 664, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "lighteval|mmlu:world_religions|5": { "hashes": { "hash_examples": "ecdb4a4f94f62930", "hash_full_prompts": "8c2e37a02519af15", "hash_input_tokens": "80b87b6e634441d6", "hash_cont_tokens": "ada548665e87b1e0" }, "truncated": 0, "non_truncated": 171, "padded": 684, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "341a076d0beb7048", "hash_full_prompts": "7c1eeddf962b8fc9", "hash_input_tokens": "98bef9715b6ebf74", "hash_cont_tokens": "3672212ca582e2d0" }, "truncated": 0, "non_truncated": 14042, "padded": 56038, "non_padded": 130, "num_truncated_few_shots": 0 } }