{ "results": { "mmlu": { "acc,none": 0.6249821962683378, "acc_stderr,none": 0.0038618499647845327, "alias": "mmlu" }, "mmlu_humanities": { "alias": " - humanities", "acc,none": 0.5819341126461212, "acc_stderr,none": 0.0067830807124966905 }, "mmlu_formal_logic": { "alias": " - formal_logic", "acc,none": 0.42857142857142855, "acc_stderr,none": 0.04426266681379909 }, "mmlu_high_school_european_history": { "alias": " - high_school_european_history", "acc,none": 0.7515151515151515, "acc_stderr,none": 0.03374402644139405 }, "mmlu_high_school_us_history": { "alias": " - high_school_us_history", "acc,none": 0.7990196078431373, "acc_stderr,none": 0.028125972265654373 }, "mmlu_high_school_world_history": { "alias": " - high_school_world_history", "acc,none": 0.7848101265822784, "acc_stderr,none": 0.02675082699467618 }, "mmlu_international_law": { "alias": " - international_law", "acc,none": 0.768595041322314, "acc_stderr,none": 0.03849856098794087 }, "mmlu_jurisprudence": { "alias": " - jurisprudence", "acc,none": 0.8148148148148148, "acc_stderr,none": 0.03755265865037182 }, "mmlu_logical_fallacies": { "alias": " - logical_fallacies", "acc,none": 0.7914110429447853, "acc_stderr,none": 0.031921934489347235 }, "mmlu_moral_disputes": { "alias": " - moral_disputes", "acc,none": 0.7254335260115607, "acc_stderr,none": 0.02402774515526501 }, "mmlu_moral_scenarios": { "alias": " - moral_scenarios", "acc,none": 0.38324022346368714, "acc_stderr,none": 0.016260159604429128 }, "mmlu_philosophy": { "alias": " - philosophy", "acc,none": 0.7138263665594855, "acc_stderr,none": 0.025670259242188936 }, "mmlu_prehistory": { "alias": " - prehistory", "acc,none": 0.7222222222222222, "acc_stderr,none": 0.02492200116888634 }, "mmlu_professional_law": { "alias": " - professional_law", "acc,none": 0.4641460234680574, "acc_stderr,none": 0.01273736131873058 }, "mmlu_world_religions": { "alias": " - world_religions", "acc,none": 0.8128654970760234, "acc_stderr,none": 0.029913127232368036 }, "mmlu_other": { "alias": " - other", "acc,none": 0.6887672996459607, "acc_stderr,none": 0.00795163954721013 }, "mmlu_business_ethics": { "alias": " - business_ethics", "acc,none": 0.57, "acc_stderr,none": 0.04975698519562428 }, "mmlu_clinical_knowledge": { "alias": " - clinical_knowledge", "acc,none": 0.6981132075471698, "acc_stderr,none": 0.02825420034443866 }, "mmlu_college_medicine": { "alias": " - college_medicine", "acc,none": 0.6473988439306358, "acc_stderr,none": 0.036430371689585496 }, "mmlu_global_facts": { "alias": " - global_facts", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316 }, "mmlu_human_aging": { "alias": " - human_aging", "acc,none": 0.6547085201793722, "acc_stderr,none": 0.03191100192835794 }, "mmlu_management": { "alias": " - management", "acc,none": 0.7864077669902912, "acc_stderr,none": 0.04058042015646034 }, "mmlu_marketing": { "alias": " - marketing", "acc,none": 0.8717948717948718, "acc_stderr,none": 0.021901905115073318 }, "mmlu_medical_genetics": { "alias": " - medical_genetics", "acc,none": 0.71, "acc_stderr,none": 0.04560480215720684 }, "mmlu_miscellaneous": { "alias": " - miscellaneous", "acc,none": 0.8007662835249042, "acc_stderr,none": 0.014283378044296418 }, "mmlu_nutrition": { "alias": " - nutrition", "acc,none": 0.7647058823529411, "acc_stderr,none": 0.024288619466046105 }, "mmlu_professional_accounting": { "alias": " - professional_accounting", "acc,none": 0.4432624113475177, "acc_stderr,none": 0.029634838473766006 }, "mmlu_professional_medicine": { "alias": " - professional_medicine", 
"acc,none": 0.6580882352941176, "acc_stderr,none": 0.02881472242225418 }, "mmlu_virology": { "alias": " - virology", "acc,none": 0.5301204819277109, "acc_stderr,none": 0.03885425420866766 }, "mmlu_social_sciences": { "alias": " - social_sciences", "acc,none": 0.7253818654533637, "acc_stderr,none": 0.007876193174297675 }, "mmlu_econometrics": { "alias": " - econometrics", "acc,none": 0.4649122807017544, "acc_stderr,none": 0.04692008381368911 }, "mmlu_high_school_geography": { "alias": " - high_school_geography", "acc,none": 0.7878787878787878, "acc_stderr,none": 0.029126522834586815 }, "mmlu_high_school_government_and_politics": { "alias": " - high_school_government_and_politics", "acc,none": 0.8601036269430051, "acc_stderr,none": 0.025033870583015184 }, "mmlu_high_school_macroeconomics": { "alias": " - high_school_macroeconomics", "acc,none": 0.6512820512820513, "acc_stderr,none": 0.02416278028401772 }, "mmlu_high_school_microeconomics": { "alias": " - high_school_microeconomics", "acc,none": 0.6386554621848739, "acc_stderr,none": 0.031204691225150016 }, "mmlu_high_school_psychology": { "alias": " - high_school_psychology", "acc,none": 0.8110091743119267, "acc_stderr,none": 0.016785481159203638 }, "mmlu_human_sexuality": { "alias": " - human_sexuality", "acc,none": 0.7786259541984732, "acc_stderr,none": 0.0364129708131373 }, "mmlu_professional_psychology": { "alias": " - professional_psychology", "acc,none": 0.6781045751633987, "acc_stderr,none": 0.018901015322093092 }, "mmlu_public_relations": { "alias": " - public_relations", "acc,none": 0.6363636363636364, "acc_stderr,none": 0.046075820907199756 }, "mmlu_security_studies": { "alias": " - security_studies", "acc,none": 0.6816326530612244, "acc_stderr,none": 0.02982253379398208 }, "mmlu_sociology": { "alias": " - sociology", "acc,none": 0.8208955223880597, "acc_stderr,none": 0.027113286753111837 }, "mmlu_us_foreign_policy": { "alias": " - us_foreign_policy", "acc,none": 0.9, "acc_stderr,none": 0.030151134457776334 }, "mmlu_stem": { "alias": " - stem", "acc,none": 0.5283856644465589, "acc_stderr,none": 0.00853646388304497 }, "mmlu_abstract_algebra": { "alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542128 }, "mmlu_anatomy": { "alias": " - anatomy", "acc,none": 0.6370370370370371, "acc_stderr,none": 0.041539484047424 }, "mmlu_astronomy": { "alias": " - astronomy", "acc,none": 0.6644736842105263, "acc_stderr,none": 0.03842498559395268 }, "mmlu_college_biology": { "alias": " - college_biology", "acc,none": 0.7430555555555556, "acc_stderr,none": 0.03653946969442099 }, "mmlu_college_chemistry": { "alias": " - college_chemistry", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956914 }, "mmlu_college_computer_science": { "alias": " - college_computer_science", "acc,none": 0.51, "acc_stderr,none": 0.05024183937956912 }, "mmlu_college_mathematics": { "alias": " - college_mathematics", "acc,none": 0.36, "acc_stderr,none": 0.04824181513244218 }, "mmlu_college_physics": { "alias": " - college_physics", "acc,none": 0.4411764705882353, "acc_stderr,none": 0.04940635630605659 }, "mmlu_computer_security": { "alias": " - computer_security", "acc,none": 0.77, "acc_stderr,none": 0.04229525846816506 }, "mmlu_conceptual_physics": { "alias": " - conceptual_physics", "acc,none": 0.574468085106383, "acc_stderr,none": 0.032321469162244695 }, "mmlu_electrical_engineering": { "alias": " - electrical_engineering", "acc,none": 0.5793103448275863, "acc_stderr,none": 0.0411391498118926 }, "mmlu_elementary_mathematics": { "alias": " - 
elementary_mathematics", "acc,none": 0.40476190476190477, "acc_stderr,none": 0.0252798503974049 }, "mmlu_high_school_biology": { "alias": " - high_school_biology", "acc,none": 0.7612903225806451, "acc_stderr,none": 0.024251071262208837 }, "mmlu_high_school_chemistry": { "alias": " - high_school_chemistry", "acc,none": 0.5123152709359606, "acc_stderr,none": 0.035169204442208966 }, "mmlu_high_school_computer_science": { "alias": " - high_school_computer_science", "acc,none": 0.66, "acc_stderr,none": 0.04760952285695237 }, "mmlu_high_school_mathematics": { "alias": " - high_school_mathematics", "acc,none": 0.3592592592592593, "acc_stderr,none": 0.029252905927251976 }, "mmlu_high_school_physics": { "alias": " - high_school_physics", "acc,none": 0.3443708609271523, "acc_stderr,none": 0.038796870240733264 }, "mmlu_high_school_statistics": { "alias": " - high_school_statistics", "acc,none": 0.47685185185185186, "acc_stderr,none": 0.03406315360711507 }, "mmlu_machine_learning": { "alias": " - machine_learning", "acc,none": 0.48214285714285715, "acc_stderr,none": 0.047427623612430116 } }, "groups": { "mmlu": { "acc,none": 0.6249821962683378, "acc_stderr,none": 0.0038618499647845327, "alias": "mmlu" }, "mmlu_humanities": { "alias": " - humanities", "acc,none": 0.5819341126461212, "acc_stderr,none": 0.0067830807124966905 }, "mmlu_other": { "alias": " - other", "acc,none": 0.6887672996459607, "acc_stderr,none": 0.00795163954721013 }, "mmlu_social_sciences": { "alias": " - social_sciences", "acc,none": 0.7253818654533637, "acc_stderr,none": 0.007876193174297675 }, "mmlu_stem": { "alias": " - stem", "acc,none": 0.5283856644465589, "acc_stderr,none": 0.00853646388304497 } }, "configs": { "mmlu_abstract_algebra": { "task": "mmlu_abstract_algebra", "task_alias": "abstract_algebra", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "abstract_algebra", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_anatomy": { "task": "mmlu_anatomy", "task_alias": "anatomy", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "anatomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_astronomy": { "task": "mmlu_astronomy", "task_alias": "astronomy", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "astronomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_business_ethics": { "task": "mmlu_business_ethics", "task_alias": "business_ethics", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "business_ethics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_clinical_knowledge": { "task": "mmlu_clinical_knowledge", "task_alias": "clinical_knowledge", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "clinical_knowledge", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_biology": { "task": "mmlu_college_biology", "task_alias": "college_biology", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_chemistry": { "task": "mmlu_college_chemistry", "task_alias": "college_chemistry", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_computer_science": { "task": "mmlu_college_computer_science", "task_alias": "college_computer_science", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_mathematics": { "task": "mmlu_college_mathematics", "task_alias": "college_mathematics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_medicine": { "task": "mmlu_college_medicine", "task_alias": "college_medicine", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. 
{{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_physics": { "task": "mmlu_college_physics", "task_alias": "college_physics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_computer_security": { "task": "mmlu_computer_security", "task_alias": "computer_security", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "computer_security", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about computer security.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_conceptual_physics": { "task": "mmlu_conceptual_physics", "task_alias": "conceptual_physics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "conceptual_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_econometrics": { "task": "mmlu_econometrics", "task_alias": "econometrics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "econometrics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_electrical_engineering": { "task": "mmlu_electrical_engineering", "task_alias": "electrical_engineering", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "electrical_engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_elementary_mathematics": { "task": "mmlu_elementary_mathematics", "task_alias": "elementary_mathematics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "elementary_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_formal_logic": { "task": "mmlu_formal_logic", "task_alias": "formal_logic", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "formal_logic", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_global_facts": { "task": "mmlu_global_facts", "task_alias": "global_facts", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "global_facts", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about global facts.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_biology": { "task": "mmlu_high_school_biology", "task_alias": "high_school_biology", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_chemistry": { "task": "mmlu_high_school_chemistry", "task_alias": "high_school_chemistry", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_computer_science": { "task": "mmlu_high_school_computer_science", "task_alias": "high_school_computer_science", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_european_history": { "task": "mmlu_high_school_european_history", "task_alias": "high_school_european_history", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_european_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_geography": { "task": "mmlu_high_school_geography", "task_alias": "high_school_geography", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_geography", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_government_and_politics": { "task": "mmlu_high_school_government_and_politics", "task_alias": "high_school_government_and_politics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_government_and_politics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_macroeconomics": { "task": "mmlu_high_school_macroeconomics", "task_alias": "high_school_macroeconomics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_macroeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_mathematics": { "task": "mmlu_high_school_mathematics", "task_alias": "high_school_mathematics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_microeconomics": { "task": "mmlu_high_school_microeconomics", "task_alias": "high_school_microeconomics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_microeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_physics": { "task": "mmlu_high_school_physics", "task_alias": "high_school_physics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_psychology": { "task": "mmlu_high_school_psychology", "task_alias": "high_school_psychology", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_statistics": { "task": "mmlu_high_school_statistics", "task_alias": "high_school_statistics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_statistics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_us_history": { "task": "mmlu_high_school_us_history", "task_alias": "high_school_us_history", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_us_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_world_history": { "task": "mmlu_high_school_world_history", "task_alias": "high_school_world_history", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_world_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_human_aging": { "task": "mmlu_human_aging", "task_alias": "human_aging", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "human_aging", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human aging.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_human_sexuality": { "task": "mmlu_human_sexuality", "task_alias": "human_sexuality", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "human_sexuality", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. 
{{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_international_law": { "task": "mmlu_international_law", "task_alias": "international_law", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "international_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about international law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_jurisprudence": { "task": "mmlu_jurisprudence", "task_alias": "jurisprudence", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "jurisprudence", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_logical_fallacies": { "task": "mmlu_logical_fallacies", "task_alias": "logical_fallacies", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "logical_fallacies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_machine_learning": { "task": "mmlu_machine_learning", "task_alias": "machine_learning", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "machine_learning", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_management": { "task": "mmlu_management", "task_alias": "management", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "management", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about management.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_marketing": { "task": "mmlu_marketing", "task_alias": "marketing", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "marketing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about marketing.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_medical_genetics": { "task": "mmlu_medical_genetics", "task_alias": "medical_genetics", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "medical_genetics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_miscellaneous": { "task": "mmlu_miscellaneous", "task_alias": "miscellaneous", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "miscellaneous", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_moral_disputes": { "task": "mmlu_moral_disputes", "task_alias": "moral_disputes", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "moral_disputes", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_moral_scenarios": { "task": "mmlu_moral_scenarios", "task_alias": "moral_scenarios", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "moral_scenarios", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_nutrition": { "task": "mmlu_nutrition", "task_alias": "nutrition", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "nutrition", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_philosophy": { "task": "mmlu_philosophy", "task_alias": "philosophy", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "philosophy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_prehistory": { "task": "mmlu_prehistory", "task_alias": "prehistory", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "prehistory", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_accounting": { "task": "mmlu_professional_accounting", "task_alias": "professional_accounting", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_accounting", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_law": { "task": "mmlu_professional_law", "task_alias": "professional_law", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_medicine": { "task": "mmlu_professional_medicine", "task_alias": "professional_medicine", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_psychology": { "task": "mmlu_professional_psychology", "task_alias": "professional_psychology", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_public_relations": { "task": "mmlu_public_relations", "task_alias": "public_relations", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "public_relations", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about public relations.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_security_studies": { "task": "mmlu_security_studies", "task_alias": "security_studies", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "security_studies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about security studies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_sociology": { "task": "mmlu_sociology", "task_alias": "sociology", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "sociology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. 
{{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about sociology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_us_foreign_policy": { "task": "mmlu_us_foreign_policy", "task_alias": "us_foreign_policy", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "us_foreign_policy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_virology": { "task": "mmlu_virology", "task_alias": "virology", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "virology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about virology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_world_religions": { "task": "mmlu_world_religions", "task_alias": "world_religions", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "world_religions", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about world religions.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 5, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } } }, "versions": { "mmlu": "N/A", "mmlu_abstract_algebra": 0.0, "mmlu_anatomy": 0.0, "mmlu_astronomy": 0.0, "mmlu_business_ethics": 0.0, "mmlu_clinical_knowledge": 0.0, "mmlu_college_biology": 0.0, "mmlu_college_chemistry": 0.0, "mmlu_college_computer_science": 0.0, "mmlu_college_mathematics": 0.0, "mmlu_college_medicine": 0.0, "mmlu_college_physics": 0.0, "mmlu_computer_security": 0.0, "mmlu_conceptual_physics": 0.0, "mmlu_econometrics": 0.0, "mmlu_electrical_engineering": 0.0, "mmlu_elementary_mathematics": 0.0, "mmlu_formal_logic": 0.0, "mmlu_global_facts": 0.0, "mmlu_high_school_biology": 0.0, "mmlu_high_school_chemistry": 0.0, "mmlu_high_school_computer_science": 0.0, "mmlu_high_school_european_history": 0.0, "mmlu_high_school_geography": 0.0, "mmlu_high_school_government_and_politics": 0.0, "mmlu_high_school_macroeconomics": 0.0, "mmlu_high_school_mathematics": 0.0, "mmlu_high_school_microeconomics": 0.0, "mmlu_high_school_physics": 0.0, "mmlu_high_school_psychology": 0.0, "mmlu_high_school_statistics": 0.0, "mmlu_high_school_us_history": 0.0, "mmlu_high_school_world_history": 0.0, "mmlu_human_aging": 0.0, "mmlu_human_sexuality": 0.0, "mmlu_humanities": "N/A", "mmlu_international_law": 0.0, "mmlu_jurisprudence": 0.0, "mmlu_logical_fallacies": 0.0, "mmlu_machine_learning": 0.0, "mmlu_management": 0.0, "mmlu_marketing": 0.0, "mmlu_medical_genetics": 0.0, "mmlu_miscellaneous": 0.0, "mmlu_moral_disputes": 0.0, "mmlu_moral_scenarios": 0.0, "mmlu_nutrition": 0.0, "mmlu_other": "N/A", "mmlu_philosophy": 0.0, "mmlu_prehistory": 0.0, "mmlu_professional_accounting": 0.0, "mmlu_professional_law": 0.0, "mmlu_professional_medicine": 0.0, "mmlu_professional_psychology": 0.0, "mmlu_public_relations": 0.0, "mmlu_security_studies": 0.0, "mmlu_social_sciences": "N/A", "mmlu_sociology": 0.0, "mmlu_stem": "N/A", "mmlu_us_foreign_policy": 0.0, "mmlu_virology": 0.0, "mmlu_world_religions": 0.0 }, "n-shot": { "mmlu": 0, "mmlu_abstract_algebra": 5, "mmlu_anatomy": 5, "mmlu_astronomy": 5, "mmlu_business_ethics": 5, "mmlu_clinical_knowledge": 5, "mmlu_college_biology": 5, "mmlu_college_chemistry": 5, "mmlu_college_computer_science": 5, "mmlu_college_mathematics": 5, "mmlu_college_medicine": 5, "mmlu_college_physics": 5, "mmlu_computer_security": 5, "mmlu_conceptual_physics": 5, "mmlu_econometrics": 5, "mmlu_electrical_engineering": 5, "mmlu_elementary_mathematics": 5, "mmlu_formal_logic": 5, "mmlu_global_facts": 5, "mmlu_high_school_biology": 5, "mmlu_high_school_chemistry": 5, "mmlu_high_school_computer_science": 5, "mmlu_high_school_european_history": 5, "mmlu_high_school_geography": 5, "mmlu_high_school_government_and_politics": 5, "mmlu_high_school_macroeconomics": 5, "mmlu_high_school_mathematics": 5, "mmlu_high_school_microeconomics": 5, "mmlu_high_school_physics": 5, "mmlu_high_school_psychology": 5, "mmlu_high_school_statistics": 5, "mmlu_high_school_us_history": 5, "mmlu_high_school_world_history": 5, "mmlu_human_aging": 5, "mmlu_human_sexuality": 5, "mmlu_humanities": 5, "mmlu_international_law": 
5, "mmlu_jurisprudence": 5, "mmlu_logical_fallacies": 5, "mmlu_machine_learning": 5, "mmlu_management": 5, "mmlu_marketing": 5, "mmlu_medical_genetics": 5, "mmlu_miscellaneous": 5, "mmlu_moral_disputes": 5, "mmlu_moral_scenarios": 5, "mmlu_nutrition": 5, "mmlu_other": 5, "mmlu_philosophy": 5, "mmlu_prehistory": 5, "mmlu_professional_accounting": 5, "mmlu_professional_law": 5, "mmlu_professional_medicine": 5, "mmlu_professional_psychology": 5, "mmlu_public_relations": 5, "mmlu_security_studies": 5, "mmlu_social_sciences": 5, "mmlu_sociology": 5, "mmlu_stem": 5, "mmlu_us_foreign_policy": 5, "mmlu_virology": 5, "mmlu_world_religions": 5 }, "config": { "model": "hf", "model_args": "pretrained=HuggingFaceH4/mistral-7b-ift,revision=v31.0,dtype=bfloat16", "batch_size": "auto", "batch_sizes": [ 16 ], "device": null, "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "42f4bc9" }