lewtun's picture
lewtun HF staff
Upload eval_results/Qwen/Qwen1.5-0.5B-Chat/main/eval_mmlu.json with huggingface_hub
bacab84 verified
raw
history blame
80.2 kB
{
"results": {
"mmlu": {
"acc,none": 0.3245976356644353,
"acc_stderr,none": 0.003912389812489731,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.32986184909670563,
"acc_stderr,none": 0.006773943268013009
},
"mmlu_formal_logic": {
"alias": " - formal_logic",
"acc,none": 0.23015873015873015,
"acc_stderr,none": 0.03764950879790606
},
"mmlu_high_school_european_history": {
"alias": " - high_school_european_history",
"acc,none": 0.48484848484848486,
"acc_stderr,none": 0.03902551007374449
},
"mmlu_high_school_us_history": {
"alias": " - high_school_us_history",
"acc,none": 0.4117647058823529,
"acc_stderr,none": 0.034542365853806094
},
"mmlu_high_school_world_history": {
"alias": " - high_school_world_history",
"acc,none": 0.45569620253164556,
"acc_stderr,none": 0.03241920684693334
},
"mmlu_international_law": {
"alias": " - international_law",
"acc,none": 0.47107438016528924,
"acc_stderr,none": 0.04556710331269498
},
"mmlu_jurisprudence": {
"alias": " - jurisprudence",
"acc,none": 0.4166666666666667,
"acc_stderr,none": 0.04766075165356462
},
"mmlu_logical_fallacies": {
"alias": " - logical_fallacies",
"acc,none": 0.31901840490797545,
"acc_stderr,none": 0.03661997551073836
},
"mmlu_moral_disputes": {
"alias": " - moral_disputes",
"acc,none": 0.4161849710982659,
"acc_stderr,none": 0.026538189104705488
},
"mmlu_moral_scenarios": {
"alias": " - moral_scenarios",
"acc,none": 0.23798882681564246,
"acc_stderr,none": 0.014242630070574892
},
"mmlu_philosophy": {
"alias": " - philosophy",
"acc,none": 0.40192926045016075,
"acc_stderr,none": 0.027846476005930477
},
"mmlu_prehistory": {
"alias": " - prehistory",
"acc,none": 0.33024691358024694,
"acc_stderr,none": 0.026168298456732846
},
"mmlu_professional_law": {
"alias": " - professional_law",
"acc,none": 0.29726205997392435,
"acc_stderr,none": 0.011673346173086036
},
"mmlu_world_religions": {
"alias": " - world_religions",
"acc,none": 0.30409356725146197,
"acc_stderr,none": 0.03528211258245231
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.3405214032829096,
"acc_stderr,none": 0.008449694012074094
},
"mmlu_business_ethics": {
"alias": " - business_ethics",
"acc,none": 0.42,
"acc_stderr,none": 0.049604496374885836
},
"mmlu_clinical_knowledge": {
"alias": " - clinical_knowledge",
"acc,none": 0.3471698113207547,
"acc_stderr,none": 0.029300101705549652
},
"mmlu_college_medicine": {
"alias": " - college_medicine",
"acc,none": 0.3063583815028902,
"acc_stderr,none": 0.03514942551267437
},
"mmlu_global_facts": {
"alias": " - global_facts",
"acc,none": 0.31,
"acc_stderr,none": 0.04648231987117316
},
"mmlu_human_aging": {
"alias": " - human_aging",
"acc,none": 0.2600896860986547,
"acc_stderr,none": 0.029442495585857497
},
"mmlu_management": {
"alias": " - management",
"acc,none": 0.46601941747572817,
"acc_stderr,none": 0.0493929144727348
},
"mmlu_marketing": {
"alias": " - marketing",
"acc,none": 0.45726495726495725,
"acc_stderr,none": 0.03263622596380688
},
"mmlu_medical_genetics": {
"alias": " - medical_genetics",
"acc,none": 0.36,
"acc_stderr,none": 0.04824181513244218
},
"mmlu_miscellaneous": {
"alias": " - miscellaneous",
"acc,none": 0.34610472541507026,
"acc_stderr,none": 0.01701196526641207
},
"mmlu_nutrition": {
"alias": " - nutrition",
"acc,none": 0.3790849673202614,
"acc_stderr,none": 0.02778014120702335
},
"mmlu_professional_accounting": {
"alias": " - professional_accounting",
"acc,none": 0.25886524822695034,
"acc_stderr,none": 0.026129572527180848
},
"mmlu_professional_medicine": {
"alias": " - professional_medicine",
"acc,none": 0.2610294117647059,
"acc_stderr,none": 0.026679252270103124
},
"mmlu_virology": {
"alias": " - virology",
"acc,none": 0.3614457831325301,
"acc_stderr,none": 0.0374005938202932
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.3350666233344166,
"acc_stderr,none": 0.00840831530400127
},
"mmlu_econometrics": {
"alias": " - econometrics",
"acc,none": 0.22807017543859648,
"acc_stderr,none": 0.03947152782669415
},
"mmlu_high_school_geography": {
"alias": " - high_school_geography",
"acc,none": 0.31313131313131315,
"acc_stderr,none": 0.03304205087813652
},
"mmlu_high_school_government_and_politics": {
"alias": " - high_school_government_and_politics",
"acc,none": 0.32124352331606215,
"acc_stderr,none": 0.033699508685490674
},
"mmlu_high_school_macroeconomics": {
"alias": " - high_school_macroeconomics",
"acc,none": 0.2512820512820513,
"acc_stderr,none": 0.021992016662370547
},
"mmlu_high_school_microeconomics": {
"alias": " - high_school_microeconomics",
"acc,none": 0.31512605042016806,
"acc_stderr,none": 0.03017680828897434
},
"mmlu_high_school_psychology": {
"alias": " - high_school_psychology",
"acc,none": 0.3908256880733945,
"acc_stderr,none": 0.02092005834611107
},
"mmlu_human_sexuality": {
"alias": " - human_sexuality",
"acc,none": 0.37404580152671757,
"acc_stderr,none": 0.042438692422305246
},
"mmlu_professional_psychology": {
"alias": " - professional_psychology",
"acc,none": 0.3235294117647059,
"acc_stderr,none": 0.018926082916083397
},
"mmlu_public_relations": {
"alias": " - public_relations",
"acc,none": 0.4,
"acc_stderr,none": 0.0469237132203465
},
"mmlu_security_studies": {
"alias": " - security_studies",
"acc,none": 0.2163265306122449,
"acc_stderr,none": 0.02635891633490404
},
"mmlu_sociology": {
"alias": " - sociology",
"acc,none": 0.5223880597014925,
"acc_stderr,none": 0.035319879302087305
},
"mmlu_us_foreign_policy": {
"alias": " - us_foreign_policy",
"acc,none": 0.46,
"acc_stderr,none": 0.05009082659620333
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.29083412622898824,
"acc_stderr,none": 0.008047086659488232
},
"mmlu_abstract_algebra": {
"alias": " - abstract_algebra",
"acc,none": 0.31,
"acc_stderr,none": 0.04648231987117316
},
"mmlu_anatomy": {
"alias": " - anatomy",
"acc,none": 0.2814814814814815,
"acc_stderr,none": 0.03885004245800255
},
"mmlu_astronomy": {
"alias": " - astronomy",
"acc,none": 0.26973684210526316,
"acc_stderr,none": 0.03611780560284898
},
"mmlu_college_biology": {
"alias": " - college_biology",
"acc,none": 0.2777777777777778,
"acc_stderr,none": 0.037455547914624576
},
"mmlu_college_chemistry": {
"alias": " - college_chemistry",
"acc,none": 0.34,
"acc_stderr,none": 0.04760952285695235
},
"mmlu_college_computer_science": {
"alias": " - college_computer_science",
"acc,none": 0.34,
"acc_stderr,none": 0.04760952285695235
},
"mmlu_college_mathematics": {
"alias": " - college_mathematics",
"acc,none": 0.32,
"acc_stderr,none": 0.046882617226215034
},
"mmlu_college_physics": {
"alias": " - college_physics",
"acc,none": 0.2647058823529412,
"acc_stderr,none": 0.043898699568087785
},
"mmlu_computer_security": {
"alias": " - computer_security",
"acc,none": 0.39,
"acc_stderr,none": 0.04902071300001975
},
"mmlu_conceptual_physics": {
"alias": " - conceptual_physics",
"acc,none": 0.24680851063829787,
"acc_stderr,none": 0.028185441301234116
},
"mmlu_electrical_engineering": {
"alias": " - electrical_engineering",
"acc,none": 0.45517241379310347,
"acc_stderr,none": 0.04149886942192117
},
"mmlu_elementary_mathematics": {
"alias": " - elementary_mathematics",
"acc,none": 0.24338624338624337,
"acc_stderr,none": 0.022101128787415436
},
"mmlu_high_school_biology": {
"alias": " - high_school_biology",
"acc,none": 0.2870967741935484,
"acc_stderr,none": 0.025736542745594525
},
"mmlu_high_school_chemistry": {
"alias": " - high_school_chemistry",
"acc,none": 0.270935960591133,
"acc_stderr,none": 0.031270907132976984
},
"mmlu_high_school_computer_science": {
"alias": " - high_school_computer_science",
"acc,none": 0.37,
"acc_stderr,none": 0.048523658709391
},
"mmlu_high_school_mathematics": {
"alias": " - high_school_mathematics",
"acc,none": 0.24444444444444444,
"acc_stderr,none": 0.02620276653465215
},
"mmlu_high_school_physics": {
"alias": " - high_school_physics",
"acc,none": 0.2847682119205298,
"acc_stderr,none": 0.03684881521389023
},
"mmlu_high_school_statistics": {
"alias": " - high_school_statistics",
"acc,none": 0.23148148148148148,
"acc_stderr,none": 0.02876511171804696
},
"mmlu_machine_learning": {
"alias": " - machine_learning",
"acc,none": 0.4017857142857143,
"acc_stderr,none": 0.04653333146973646
}
},
"groups": {
"mmlu": {
"acc,none": 0.3245976356644353,
"acc_stderr,none": 0.003912389812489731,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.32986184909670563,
"acc_stderr,none": 0.006773943268013009
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.3405214032829096,
"acc_stderr,none": 0.008449694012074094
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.3350666233344166,
"acc_stderr,none": 0.00840831530400127
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.29083412622898824,
"acc_stderr,none": 0.008047086659488232
}
},
"configs": {
"mmlu_abstract_algebra": {
"task": "mmlu_abstract_algebra",
"task_alias": "abstract_algebra",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "abstract_algebra",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_anatomy": {
"task": "mmlu_anatomy",
"task_alias": "anatomy",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "anatomy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_astronomy": {
"task": "mmlu_astronomy",
"task_alias": "astronomy",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "astronomy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_business_ethics": {
"task": "mmlu_business_ethics",
"task_alias": "business_ethics",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "business_ethics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_clinical_knowledge": {
"task": "mmlu_clinical_knowledge",
"task_alias": "clinical_knowledge",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "clinical_knowledge",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_biology": {
"task": "mmlu_college_biology",
"task_alias": "college_biology",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "college_biology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college biology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_chemistry": {
"task": "mmlu_college_chemistry",
"task_alias": "college_chemistry",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "college_chemistry",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_computer_science": {
"task": "mmlu_college_computer_science",
"task_alias": "college_computer_science",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "college_computer_science",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_mathematics": {
"task": "mmlu_college_mathematics",
"task_alias": "college_mathematics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "college_mathematics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_medicine": {
"task": "mmlu_college_medicine",
"task_alias": "college_medicine",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "college_medicine",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_physics": {
"task": "mmlu_college_physics",
"task_alias": "college_physics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "college_physics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college physics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_computer_security": {
"task": "mmlu_computer_security",
"task_alias": "computer_security",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "computer_security",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about computer security.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_conceptual_physics": {
"task": "mmlu_conceptual_physics",
"task_alias": "conceptual_physics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "conceptual_physics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_econometrics": {
"task": "mmlu_econometrics",
"task_alias": "econometrics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "econometrics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_electrical_engineering": {
"task": "mmlu_electrical_engineering",
"task_alias": "electrical_engineering",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "electrical_engineering",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_elementary_mathematics": {
"task": "mmlu_elementary_mathematics",
"task_alias": "elementary_mathematics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "elementary_mathematics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_formal_logic": {
"task": "mmlu_formal_logic",
"task_alias": "formal_logic",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "formal_logic",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_global_facts": {
"task": "mmlu_global_facts",
"task_alias": "global_facts",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "global_facts",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about global facts.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_biology": {
"task": "mmlu_high_school_biology",
"task_alias": "high_school_biology",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_biology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_chemistry": {
"task": "mmlu_high_school_chemistry",
"task_alias": "high_school_chemistry",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_chemistry",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_computer_science": {
"task": "mmlu_high_school_computer_science",
"task_alias": "high_school_computer_science",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_computer_science",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_european_history": {
"task": "mmlu_high_school_european_history",
"task_alias": "high_school_european_history",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_european_history",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_geography": {
"task": "mmlu_high_school_geography",
"task_alias": "high_school_geography",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_geography",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_government_and_politics": {
"task": "mmlu_high_school_government_and_politics",
"task_alias": "high_school_government_and_politics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_government_and_politics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_macroeconomics": {
"task": "mmlu_high_school_macroeconomics",
"task_alias": "high_school_macroeconomics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_macroeconomics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_mathematics": {
"task": "mmlu_high_school_mathematics",
"task_alias": "high_school_mathematics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_mathematics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_microeconomics": {
"task": "mmlu_high_school_microeconomics",
"task_alias": "high_school_microeconomics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_microeconomics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_physics": {
"task": "mmlu_high_school_physics",
"task_alias": "high_school_physics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_physics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_psychology": {
"task": "mmlu_high_school_psychology",
"task_alias": "high_school_psychology",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_psychology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_statistics": {
"task": "mmlu_high_school_statistics",
"task_alias": "high_school_statistics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_statistics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_us_history": {
"task": "mmlu_high_school_us_history",
"task_alias": "high_school_us_history",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_us_history",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_world_history": {
"task": "mmlu_high_school_world_history",
"task_alias": "high_school_world_history",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "high_school_world_history",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_human_aging": {
"task": "mmlu_human_aging",
"task_alias": "human_aging",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "human_aging",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about human aging.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_human_sexuality": {
"task": "mmlu_human_sexuality",
"task_alias": "human_sexuality",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "human_sexuality",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_international_law": {
"task": "mmlu_international_law",
"task_alias": "international_law",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "international_law",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about international law.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_jurisprudence": {
"task": "mmlu_jurisprudence",
"task_alias": "jurisprudence",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "jurisprudence",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_logical_fallacies": {
"task": "mmlu_logical_fallacies",
"task_alias": "logical_fallacies",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "logical_fallacies",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_machine_learning": {
"task": "mmlu_machine_learning",
"task_alias": "machine_learning",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "machine_learning",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_management": {
"task": "mmlu_management",
"task_alias": "management",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "management",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about management.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_marketing": {
"task": "mmlu_marketing",
"task_alias": "marketing",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "marketing",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about marketing.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_medical_genetics": {
"task": "mmlu_medical_genetics",
"task_alias": "medical_genetics",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "medical_genetics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_miscellaneous": {
"task": "mmlu_miscellaneous",
"task_alias": "miscellaneous",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "miscellaneous",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_moral_disputes": {
"task": "mmlu_moral_disputes",
"task_alias": "moral_disputes",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "moral_disputes",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_moral_scenarios": {
"task": "mmlu_moral_scenarios",
"task_alias": "moral_scenarios",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "moral_scenarios",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_nutrition": {
"task": "mmlu_nutrition",
"task_alias": "nutrition",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "nutrition",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_philosophy": {
"task": "mmlu_philosophy",
"task_alias": "philosophy",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "philosophy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_prehistory": {
"task": "mmlu_prehistory",
"task_alias": "prehistory",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "prehistory",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_accounting": {
"task": "mmlu_professional_accounting",
"task_alias": "professional_accounting",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "professional_accounting",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_law": {
"task": "mmlu_professional_law",
"task_alias": "professional_law",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "professional_law",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional law.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_medicine": {
"task": "mmlu_professional_medicine",
"task_alias": "professional_medicine",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "professional_medicine",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_psychology": {
"task": "mmlu_professional_psychology",
"task_alias": "professional_psychology",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "professional_psychology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_public_relations": {
"task": "mmlu_public_relations",
"task_alias": "public_relations",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "public_relations",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about public relations.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_security_studies": {
"task": "mmlu_security_studies",
"task_alias": "security_studies",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "security_studies",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about security studies.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_sociology": {
"task": "mmlu_sociology",
"task_alias": "sociology",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "sociology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about sociology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_us_foreign_policy": {
"task": "mmlu_us_foreign_policy",
"task_alias": "us_foreign_policy",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "us_foreign_policy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_virology": {
"task": "mmlu_virology",
"task_alias": "virology",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "virology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about virology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_world_religions": {
"task": "mmlu_world_religions",
"task_alias": "world_religions",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "hails/mmlu_no_train",
"dataset_name": "world_religions",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about world religions.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
}
},
"versions": {
"mmlu": "N/A",
"mmlu_abstract_algebra": 0.0,
"mmlu_anatomy": 0.0,
"mmlu_astronomy": 0.0,
"mmlu_business_ethics": 0.0,
"mmlu_clinical_knowledge": 0.0,
"mmlu_college_biology": 0.0,
"mmlu_college_chemistry": 0.0,
"mmlu_college_computer_science": 0.0,
"mmlu_college_mathematics": 0.0,
"mmlu_college_medicine": 0.0,
"mmlu_college_physics": 0.0,
"mmlu_computer_security": 0.0,
"mmlu_conceptual_physics": 0.0,
"mmlu_econometrics": 0.0,
"mmlu_electrical_engineering": 0.0,
"mmlu_elementary_mathematics": 0.0,
"mmlu_formal_logic": 0.0,
"mmlu_global_facts": 0.0,
"mmlu_high_school_biology": 0.0,
"mmlu_high_school_chemistry": 0.0,
"mmlu_high_school_computer_science": 0.0,
"mmlu_high_school_european_history": 0.0,
"mmlu_high_school_geography": 0.0,
"mmlu_high_school_government_and_politics": 0.0,
"mmlu_high_school_macroeconomics": 0.0,
"mmlu_high_school_mathematics": 0.0,
"mmlu_high_school_microeconomics": 0.0,
"mmlu_high_school_physics": 0.0,
"mmlu_high_school_psychology": 0.0,
"mmlu_high_school_statistics": 0.0,
"mmlu_high_school_us_history": 0.0,
"mmlu_high_school_world_history": 0.0,
"mmlu_human_aging": 0.0,
"mmlu_human_sexuality": 0.0,
"mmlu_humanities": "N/A",
"mmlu_international_law": 0.0,
"mmlu_jurisprudence": 0.0,
"mmlu_logical_fallacies": 0.0,
"mmlu_machine_learning": 0.0,
"mmlu_management": 0.0,
"mmlu_marketing": 0.0,
"mmlu_medical_genetics": 0.0,
"mmlu_miscellaneous": 0.0,
"mmlu_moral_disputes": 0.0,
"mmlu_moral_scenarios": 0.0,
"mmlu_nutrition": 0.0,
"mmlu_other": "N/A",
"mmlu_philosophy": 0.0,
"mmlu_prehistory": 0.0,
"mmlu_professional_accounting": 0.0,
"mmlu_professional_law": 0.0,
"mmlu_professional_medicine": 0.0,
"mmlu_professional_psychology": 0.0,
"mmlu_public_relations": 0.0,
"mmlu_security_studies": 0.0,
"mmlu_social_sciences": "N/A",
"mmlu_sociology": 0.0,
"mmlu_stem": "N/A",
"mmlu_us_foreign_policy": 0.0,
"mmlu_virology": 0.0,
"mmlu_world_religions": 0.0
},
"n-shot": {
"mmlu": 0,
"mmlu_abstract_algebra": 5,
"mmlu_anatomy": 5,
"mmlu_astronomy": 5,
"mmlu_business_ethics": 5,
"mmlu_clinical_knowledge": 5,
"mmlu_college_biology": 5,
"mmlu_college_chemistry": 5,
"mmlu_college_computer_science": 5,
"mmlu_college_mathematics": 5,
"mmlu_college_medicine": 5,
"mmlu_college_physics": 5,
"mmlu_computer_security": 5,
"mmlu_conceptual_physics": 5,
"mmlu_econometrics": 5,
"mmlu_electrical_engineering": 5,
"mmlu_elementary_mathematics": 5,
"mmlu_formal_logic": 5,
"mmlu_global_facts": 5,
"mmlu_high_school_biology": 5,
"mmlu_high_school_chemistry": 5,
"mmlu_high_school_computer_science": 5,
"mmlu_high_school_european_history": 5,
"mmlu_high_school_geography": 5,
"mmlu_high_school_government_and_politics": 5,
"mmlu_high_school_macroeconomics": 5,
"mmlu_high_school_mathematics": 5,
"mmlu_high_school_microeconomics": 5,
"mmlu_high_school_physics": 5,
"mmlu_high_school_psychology": 5,
"mmlu_high_school_statistics": 5,
"mmlu_high_school_us_history": 5,
"mmlu_high_school_world_history": 5,
"mmlu_human_aging": 5,
"mmlu_human_sexuality": 5,
"mmlu_humanities": 5,
"mmlu_international_law": 5,
"mmlu_jurisprudence": 5,
"mmlu_logical_fallacies": 5,
"mmlu_machine_learning": 5,
"mmlu_management": 5,
"mmlu_marketing": 5,
"mmlu_medical_genetics": 5,
"mmlu_miscellaneous": 5,
"mmlu_moral_disputes": 5,
"mmlu_moral_scenarios": 5,
"mmlu_nutrition": 5,
"mmlu_other": 5,
"mmlu_philosophy": 5,
"mmlu_prehistory": 5,
"mmlu_professional_accounting": 5,
"mmlu_professional_law": 5,
"mmlu_professional_medicine": 5,
"mmlu_professional_psychology": 5,
"mmlu_public_relations": 5,
"mmlu_security_studies": 5,
"mmlu_social_sciences": 5,
"mmlu_sociology": 5,
"mmlu_stem": 5,
"mmlu_us_foreign_policy": 5,
"mmlu_virology": 5,
"mmlu_world_religions": 5
},
"config": {
"model": "hf",
"model_args": "pretrained=Qwen/Qwen1.5-0.5B-Chat,revision=main,dtype=bfloat16",
"batch_size": "auto",
"batch_sizes": [
8
],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null
},
"git_hash": "2d2e67f"
}