DeepSeek-R1-Distill-Qwen-32B-gptq-4bit / lm-eval-DeepSeek-R1-Distill-Qwen-32B-gptq-4bit.json

Upload lm-eval-DeepSeek-R1-Distill-Qwen-32B-gptq-4bit.json

300726d verified 7 days ago

108 kB

	{
	"results": {
	"arc_challenge": {
	"alias": "arc_challenge",
	"acc,none": 0.4104095563139932,
	"acc_stderr,none": 0.01437492219264266,
	"acc_norm,none": 0.3984641638225256,
	"acc_norm_stderr,none": 0.014306946052735565
	},
	"mmlu": {
	"acc,none": 0.2973935336846603,
	"acc_stderr,none": 0.0038396438372097657,
	"alias": "mmlu"
	},
	"mmlu_humanities": {
	"acc,none": 0.32051009564293303,
	"acc_stderr,none": 0.006759141773447958,
	"alias": " - humanities"
	},
	"mmlu_formal_logic": {
	"alias": " - formal_logic",
	"acc,none": 0.36507936507936506,
	"acc_stderr,none": 0.043062412591271554
	},
	"mmlu_high_school_european_history": {
	"alias": " - high_school_european_history",
	"acc,none": 0.3090909090909091,
	"acc_stderr,none": 0.036085410115739666
	},
	"mmlu_high_school_us_history": {
	"alias": " - high_school_us_history",
	"acc,none": 0.3235294117647059,
	"acc_stderr,none": 0.03283472056108567
	},
	"mmlu_high_school_world_history": {
	"alias": " - high_school_world_history",
	"acc,none": 0.45569620253164556,
	"acc_stderr,none": 0.03241920684693334
	},
	"mmlu_international_law": {
	"alias": " - international_law",
	"acc,none": 0.256198347107438,
	"acc_stderr,none": 0.03984979653302871
	},
	"mmlu_jurisprudence": {
	"alias": " - jurisprudence",
	"acc,none": 0.28703703703703703,
	"acc_stderr,none": 0.043733130409147614
	},
	"mmlu_logical_fallacies": {
	"alias": " - logical_fallacies",
	"acc,none": 0.3558282208588957,
	"acc_stderr,none": 0.03761521380046734
	},
	"mmlu_moral_disputes": {
	"alias": " - moral_disputes",
	"acc,none": 0.407514450867052,
	"acc_stderr,none": 0.0264545781469315
	},
	"mmlu_moral_scenarios": {
	"alias": " - moral_scenarios",
	"acc,none": 0.3642458100558659,
	"acc_stderr,none": 0.016094338768474596
	},
	"mmlu_philosophy": {
	"alias": " - philosophy",
	"acc,none": 0.3247588424437299,
	"acc_stderr,none": 0.026596782287697046
	},
	"mmlu_prehistory": {
	"alias": " - prehistory",
	"acc,none": 0.2777777777777778,
	"acc_stderr,none": 0.024922001168886335
	},
	"mmlu_professional_law": {
	"alias": " - professional_law",
	"acc,none": 0.26010430247718386,
	"acc_stderr,none": 0.011204382887823838
	},
	"mmlu_world_religions": {
	"alias": " - world_religions",
	"acc,none": 0.3508771929824561,
	"acc_stderr,none": 0.03660298834049164
	},
	"mmlu_other": {
	"acc,none": 0.2793691663984551,
	"acc_stderr,none": 0.008038833508038407,
	"alias": " - other"
	},
	"mmlu_business_ethics": {
	"alias": " - business_ethics",
	"acc,none": 0.35,
	"acc_stderr,none": 0.04793724854411019
	},
	"mmlu_clinical_knowledge": {
	"alias": " - clinical_knowledge",
	"acc,none": 0.26037735849056604,
	"acc_stderr,none": 0.02700876609070809
	},
	"mmlu_college_medicine": {
	"alias": " - college_medicine",
	"acc,none": 0.2543352601156069,
	"acc_stderr,none": 0.0332055644308557
	},
	"mmlu_global_facts": {
	"alias": " - global_facts",
	"acc,none": 0.19,
	"acc_stderr,none": 0.039427724440366234
	},
	"mmlu_human_aging": {
	"alias": " - human_aging",
	"acc,none": 0.32286995515695066,
	"acc_stderr,none": 0.03138147637575498
	},
	"mmlu_management": {
	"alias": " - management",
	"acc,none": 0.1941747572815534,
	"acc_stderr,none": 0.03916667762822584
	},
	"mmlu_marketing": {
	"alias": " - marketing",
	"acc,none": 0.34615384615384615,
	"acc_stderr,none": 0.031166957367235903
	},
	"mmlu_medical_genetics": {
	"alias": " - medical_genetics",
	"acc,none": 0.29,
	"acc_stderr,none": 0.04560480215720683
	},
	"mmlu_miscellaneous": {
	"alias": " - miscellaneous",
	"acc,none": 0.28991060025542786,
	"acc_stderr,none": 0.01622501794477097
	},
	"mmlu_nutrition": {
	"alias": " - nutrition",
	"acc,none": 0.26143790849673204,
	"acc_stderr,none": 0.025160998214292456
	},
	"mmlu_professional_accounting": {
	"alias": " - professional_accounting",
	"acc,none": 0.28368794326241137,
	"acc_stderr,none": 0.02689170942834396
	},
	"mmlu_professional_medicine": {
	"alias": " - professional_medicine",
	"acc,none": 0.2426470588235294,
	"acc_stderr,none": 0.02604066247420127
	},
	"mmlu_virology": {
	"alias": " - virology",
	"acc,none": 0.27710843373493976,
	"acc_stderr,none": 0.03484331592680587
	},
	"mmlu_social_sciences": {
	"acc,none": 0.30321741956451087,
	"acc_stderr,none": 0.008259266274407914,
	"alias": " - social sciences"
	},
	"mmlu_econometrics": {
	"alias": " - econometrics",
	"acc,none": 0.2807017543859649,
	"acc_stderr,none": 0.042270544512321984
	},
	"mmlu_high_school_geography": {
	"alias": " - high_school_geography",
	"acc,none": 0.20202020202020202,
	"acc_stderr,none": 0.02860620428922987
	},
	"mmlu_high_school_government_and_politics": {
	"alias": " - high_school_government_and_politics",
	"acc,none": 0.21761658031088082,
	"acc_stderr,none": 0.029778663037752964
	},
	"mmlu_high_school_macroeconomics": {
	"alias": " - high_school_macroeconomics",
	"acc,none": 0.28974358974358977,
	"acc_stderr,none": 0.023000628243687978
	},
	"mmlu_high_school_microeconomics": {
	"alias": " - high_school_microeconomics",
	"acc,none": 0.31932773109243695,
	"acc_stderr,none": 0.030283995525884396
	},
	"mmlu_high_school_psychology": {
	"alias": " - high_school_psychology",
	"acc,none": 0.3504587155963303,
	"acc_stderr,none": 0.020456077599824457
	},
	"mmlu_human_sexuality": {
	"alias": " - human_sexuality",
	"acc,none": 0.33587786259541985,
	"acc_stderr,none": 0.04142313771996665
	},
	"mmlu_professional_psychology": {
	"alias": " - professional_psychology",
	"acc,none": 0.3464052287581699,
	"acc_stderr,none": 0.019249785691717217
	},
	"mmlu_public_relations": {
	"alias": " - public_relations",
	"acc,none": 0.2636363636363636,
	"acc_stderr,none": 0.04220224692971987
	},
	"mmlu_security_studies": {
	"alias": " - security_studies",
	"acc,none": 0.2612244897959184,
	"acc_stderr,none": 0.028123429335142797
	},
	"mmlu_sociology": {
	"alias": " - sociology",
	"acc,none": 0.2935323383084577,
	"acc_stderr,none": 0.03220024104534204
	},
	"mmlu_us_foreign_policy": {
	"alias": " - us_foreign_policy",
	"acc,none": 0.31,
	"acc_stderr,none": 0.04648231987117316
	},
	"mmlu_stem": {
	"acc,none": 0.27497621313035203,
	"acc_stderr,none": 0.007934123391489032,
	"alias": " - stem"
	},
	"mmlu_abstract_algebra": {
	"alias": " - abstract_algebra",
	"acc,none": 0.22,
	"acc_stderr,none": 0.04163331998932269
	},
	"mmlu_anatomy": {
	"alias": " - anatomy",
	"acc,none": 0.2518518518518518,
	"acc_stderr,none": 0.03749850709174021
	},
	"mmlu_astronomy": {
	"alias": " - astronomy",
	"acc,none": 0.24342105263157895,
	"acc_stderr,none": 0.034923496688842384
	},
	"mmlu_college_biology": {
	"alias": " - college_biology",
	"acc,none": 0.3680555555555556,
	"acc_stderr,none": 0.040329990539607195
	},
	"mmlu_college_chemistry": {
	"alias": " - college_chemistry",
	"acc,none": 0.26,
	"acc_stderr,none": 0.04408440022768077
	},
	"mmlu_college_computer_science": {
	"alias": " - college_computer_science",
	"acc,none": 0.29,
	"acc_stderr,none": 0.045604802157206845
	},
	"mmlu_college_mathematics": {
	"alias": " - college_mathematics",
	"acc,none": 0.21,
	"acc_stderr,none": 0.040936018074033256
	},
	"mmlu_college_physics": {
	"alias": " - college_physics",
	"acc,none": 0.29411764705882354,
	"acc_stderr,none": 0.04533838195929776
	},
	"mmlu_computer_security": {
	"alias": " - computer_security",
	"acc,none": 0.3,
	"acc_stderr,none": 0.046056618647183814
	},
	"mmlu_conceptual_physics": {
	"alias": " - conceptual_physics",
	"acc,none": 0.3276595744680851,
	"acc_stderr,none": 0.030683020843231008
	},
	"mmlu_electrical_engineering": {
	"alias": " - electrical_engineering",
	"acc,none": 0.2896551724137931,
	"acc_stderr,none": 0.03780019230438014
	},
	"mmlu_elementary_mathematics": {
	"alias": " - elementary_mathematics",
	"acc,none": 0.2962962962962963,
	"acc_stderr,none": 0.023517294335963286
	},
	"mmlu_high_school_biology": {
	"alias": " - high_school_biology",
	"acc,none": 0.2967741935483871,
	"acc_stderr,none": 0.025988500792411884
	},
	"mmlu_high_school_chemistry": {
	"alias": " - high_school_chemistry",
	"acc,none": 0.21182266009852216,
	"acc_stderr,none": 0.028748983689941072
	},
	"mmlu_high_school_computer_science": {
	"alias": " - high_school_computer_science",
	"acc,none": 0.35,
	"acc_stderr,none": 0.04793724854411018
	},
	"mmlu_high_school_mathematics": {
	"alias": " - high_school_mathematics",
	"acc,none": 0.21481481481481482,
	"acc_stderr,none": 0.025040443877000683
	},
	"mmlu_high_school_physics": {
	"alias": " - high_school_physics",
	"acc,none": 0.26490066225165565,
	"acc_stderr,none": 0.03603038545360384
	},
	"mmlu_high_school_statistics": {
	"alias": " - high_school_statistics",
	"acc,none": 0.22685185185185186,
	"acc_stderr,none": 0.028561650102422256
	},
	"mmlu_machine_learning": {
	"alias": " - machine_learning",
	"acc,none": 0.33035714285714285,
	"acc_stderr,none": 0.04464285714285713
	}
	},
	"groups": {
	"mmlu": {
	"acc,none": 0.2973935336846603,
	"acc_stderr,none": 0.0038396438372097657,
	"alias": "mmlu"
	},
	"mmlu_humanities": {
	"acc,none": 0.32051009564293303,
	"acc_stderr,none": 0.006759141773447958,
	"alias": " - humanities"
	},
	"mmlu_other": {
	"acc,none": 0.2793691663984551,
	"acc_stderr,none": 0.008038833508038407,
	"alias": " - other"
	},
	"mmlu_social_sciences": {
	"acc,none": 0.30321741956451087,
	"acc_stderr,none": 0.008259266274407914,
	"alias": " - social sciences"
	},
	"mmlu_stem": {
	"acc,none": 0.27497621313035203,
	"acc_stderr,none": 0.007934123391489032,
	"alias": " - stem"
	}
	},
	"group_subtasks": {
	"arc_challenge": [],
	"mmlu_humanities": [
	"mmlu_formal_logic",
	"mmlu_prehistory",
	"mmlu_world_religions",
	"mmlu_philosophy",
	"mmlu_high_school_world_history",
	"mmlu_professional_law",
	"mmlu_high_school_us_history",
	"mmlu_logical_fallacies",
	"mmlu_international_law",
	"mmlu_high_school_european_history",
	"mmlu_moral_disputes",
	"mmlu_moral_scenarios",
	"mmlu_jurisprudence"
	],
	"mmlu_social_sciences": [
	"mmlu_public_relations",
	"mmlu_sociology",
	"mmlu_security_studies",
	"mmlu_high_school_government_and_politics",
	"mmlu_high_school_psychology",
	"mmlu_human_sexuality",
	"mmlu_us_foreign_policy",
	"mmlu_high_school_microeconomics",
	"mmlu_econometrics",
	"mmlu_high_school_macroeconomics",
	"mmlu_high_school_geography",
	"mmlu_professional_psychology"
	],
	"mmlu_other": [
	"mmlu_medical_genetics",
	"mmlu_global_facts",
	"mmlu_marketing",
	"mmlu_college_medicine",
	"mmlu_human_aging",
	"mmlu_virology",
	"mmlu_business_ethics",
	"mmlu_clinical_knowledge",
	"mmlu_professional_medicine",
	"mmlu_nutrition",
	"mmlu_miscellaneous",
	"mmlu_professional_accounting",
	"mmlu_management"
	],
	"mmlu_stem": [
	"mmlu_conceptual_physics",
	"mmlu_high_school_chemistry",
	"mmlu_college_biology",
	"mmlu_college_chemistry",
	"mmlu_machine_learning",
	"mmlu_elementary_mathematics",
	"mmlu_abstract_algebra",
	"mmlu_astronomy",
	"mmlu_high_school_statistics",
	"mmlu_anatomy",
	"mmlu_college_mathematics",
	"mmlu_computer_security",
	"mmlu_college_computer_science",
	"mmlu_electrical_engineering",
	"mmlu_college_physics",
	"mmlu_high_school_computer_science",
	"mmlu_high_school_physics",
	"mmlu_high_school_biology",
	"mmlu_high_school_mathematics"
	],
	"mmlu": [
	"mmlu_stem",
	"mmlu_other",
	"mmlu_social_sciences",
	"mmlu_humanities"
	]
	},
	"configs": {
	"arc_challenge": {
	"task": "arc_challenge",
	"tag": [
	"ai2_arc"
	],
	"dataset_path": "allenai/ai2_arc",
	"dataset_name": "ARC-Challenge",
	"training_split": "train",
	"validation_split": "validation",
	"test_split": "test",
	"doc_to_text": "Question: {{question}}\nAnswer:",
	"doc_to_target": "{{choices.label.index(answerKey)}}",
	"doc_to_choice": "{{choices.text}}",
	"description": "",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	},
	{
	"metric": "acc_norm",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": true,
	"doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_abstract_algebra": {
	"task": "mmlu_abstract_algebra",
	"task_alias": "abstract_algebra",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "abstract_algebra",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_anatomy": {
	"task": "mmlu_anatomy",
	"task_alias": "anatomy",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "anatomy",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_astronomy": {
	"task": "mmlu_astronomy",
	"task_alias": "astronomy",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "astronomy",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_business_ethics": {
	"task": "mmlu_business_ethics",
	"task_alias": "business_ethics",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "business_ethics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_clinical_knowledge": {
	"task": "mmlu_clinical_knowledge",
	"task_alias": "clinical_knowledge",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "clinical_knowledge",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_college_biology": {
	"task": "mmlu_college_biology",
	"task_alias": "college_biology",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "college_biology",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about college biology.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_college_chemistry": {
	"task": "mmlu_college_chemistry",
	"task_alias": "college_chemistry",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "college_chemistry",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_college_computer_science": {
	"task": "mmlu_college_computer_science",
	"task_alias": "college_computer_science",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "college_computer_science",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_college_mathematics": {
	"task": "mmlu_college_mathematics",
	"task_alias": "college_mathematics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "college_mathematics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_college_medicine": {
	"task": "mmlu_college_medicine",
	"task_alias": "college_medicine",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "college_medicine",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_college_physics": {
	"task": "mmlu_college_physics",
	"task_alias": "college_physics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "college_physics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about college physics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_computer_security": {
	"task": "mmlu_computer_security",
	"task_alias": "computer_security",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "computer_security",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about computer security.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_conceptual_physics": {
	"task": "mmlu_conceptual_physics",
	"task_alias": "conceptual_physics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "conceptual_physics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_econometrics": {
	"task": "mmlu_econometrics",
	"task_alias": "econometrics",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "econometrics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_electrical_engineering": {
	"task": "mmlu_electrical_engineering",
	"task_alias": "electrical_engineering",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "electrical_engineering",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_elementary_mathematics": {
	"task": "mmlu_elementary_mathematics",
	"task_alias": "elementary_mathematics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "elementary_mathematics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_formal_logic": {
	"task": "mmlu_formal_logic",
	"task_alias": "formal_logic",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "formal_logic",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_global_facts": {
	"task": "mmlu_global_facts",
	"task_alias": "global_facts",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "global_facts",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about global facts.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_biology": {
	"task": "mmlu_high_school_biology",
	"task_alias": "high_school_biology",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_biology",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_chemistry": {
	"task": "mmlu_high_school_chemistry",
	"task_alias": "high_school_chemistry",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_chemistry",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_computer_science": {
	"task": "mmlu_high_school_computer_science",
	"task_alias": "high_school_computer_science",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_computer_science",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_european_history": {
	"task": "mmlu_high_school_european_history",
	"task_alias": "high_school_european_history",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_european_history",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_geography": {
	"task": "mmlu_high_school_geography",
	"task_alias": "high_school_geography",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_geography",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_government_and_politics": {
	"task": "mmlu_high_school_government_and_politics",
	"task_alias": "high_school_government_and_politics",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_government_and_politics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_macroeconomics": {
	"task": "mmlu_high_school_macroeconomics",
	"task_alias": "high_school_macroeconomics",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_macroeconomics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_mathematics": {
	"task": "mmlu_high_school_mathematics",
	"task_alias": "high_school_mathematics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_mathematics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_microeconomics": {
	"task": "mmlu_high_school_microeconomics",
	"task_alias": "high_school_microeconomics",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_microeconomics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_physics": {
	"task": "mmlu_high_school_physics",
	"task_alias": "high_school_physics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_physics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_psychology": {
	"task": "mmlu_high_school_psychology",
	"task_alias": "high_school_psychology",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_psychology",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_statistics": {
	"task": "mmlu_high_school_statistics",
	"task_alias": "high_school_statistics",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_statistics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_us_history": {
	"task": "mmlu_high_school_us_history",
	"task_alias": "high_school_us_history",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_us_history",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_high_school_world_history": {
	"task": "mmlu_high_school_world_history",
	"task_alias": "high_school_world_history",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "high_school_world_history",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_human_aging": {
	"task": "mmlu_human_aging",
	"task_alias": "human_aging",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "human_aging",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about human aging.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_human_sexuality": {
	"task": "mmlu_human_sexuality",
	"task_alias": "human_sexuality",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "human_sexuality",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_international_law": {
	"task": "mmlu_international_law",
	"task_alias": "international_law",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "international_law",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about international law.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_jurisprudence": {
	"task": "mmlu_jurisprudence",
	"task_alias": "jurisprudence",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "jurisprudence",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_logical_fallacies": {
	"task": "mmlu_logical_fallacies",
	"task_alias": "logical_fallacies",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "logical_fallacies",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_machine_learning": {
	"task": "mmlu_machine_learning",
	"task_alias": "machine_learning",
	"tag": "mmlu_stem_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "machine_learning",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_management": {
	"task": "mmlu_management",
	"task_alias": "management",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "management",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about management.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_marketing": {
	"task": "mmlu_marketing",
	"task_alias": "marketing",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "marketing",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about marketing.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_medical_genetics": {
	"task": "mmlu_medical_genetics",
	"task_alias": "medical_genetics",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "medical_genetics",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_miscellaneous": {
	"task": "mmlu_miscellaneous",
	"task_alias": "miscellaneous",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "miscellaneous",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_moral_disputes": {
	"task": "mmlu_moral_disputes",
	"task_alias": "moral_disputes",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "moral_disputes",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_moral_scenarios": {
	"task": "mmlu_moral_scenarios",
	"task_alias": "moral_scenarios",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "moral_scenarios",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_nutrition": {
	"task": "mmlu_nutrition",
	"task_alias": "nutrition",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "nutrition",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_philosophy": {
	"task": "mmlu_philosophy",
	"task_alias": "philosophy",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "philosophy",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_prehistory": {
	"task": "mmlu_prehistory",
	"task_alias": "prehistory",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "prehistory",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_professional_accounting": {
	"task": "mmlu_professional_accounting",
	"task_alias": "professional_accounting",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "professional_accounting",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_professional_law": {
	"task": "mmlu_professional_law",
	"task_alias": "professional_law",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "professional_law",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about professional law.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_professional_medicine": {
	"task": "mmlu_professional_medicine",
	"task_alias": "professional_medicine",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "professional_medicine",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_professional_psychology": {
	"task": "mmlu_professional_psychology",
	"task_alias": "professional_psychology",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "professional_psychology",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_public_relations": {
	"task": "mmlu_public_relations",
	"task_alias": "public_relations",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "public_relations",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about public relations.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_security_studies": {
	"task": "mmlu_security_studies",
	"task_alias": "security_studies",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "security_studies",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about security studies.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_sociology": {
	"task": "mmlu_sociology",
	"task_alias": "sociology",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "sociology",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about sociology.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_us_foreign_policy": {
	"task": "mmlu_us_foreign_policy",
	"task_alias": "us_foreign_policy",
	"tag": "mmlu_social_sciences_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "us_foreign_policy",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_virology": {
	"task": "mmlu_virology",
	"task_alias": "virology",
	"tag": "mmlu_other_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "virology",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about virology.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"mmlu_world_religions": {
	"task": "mmlu_world_religions",
	"task_alias": "world_religions",
	"tag": "mmlu_humanities_tasks",
	"dataset_path": "hails/mmlu_no_train",
	"dataset_name": "world_religions",
	"dataset_kwargs": {
	"trust_remote_code": true
	},
	"test_split": "test",
	"fewshot_split": "dev",
	"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
	"doc_to_target": "answer",
	"doc_to_choice": [
	"A",
	"B",
	"C",
	"D"
	],
	"description": "The following are multiple choice questions (with answers) about world religions.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "mean",
	"higher_is_better": true
	}
	],
	"output_type": "multiple_choice",
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	}
	},
	"versions": {
	"arc_challenge": 1.0,
	"mmlu": 2,
	"mmlu_abstract_algebra": 1.0,
	"mmlu_anatomy": 1.0,
	"mmlu_astronomy": 1.0,
	"mmlu_business_ethics": 1.0,
	"mmlu_clinical_knowledge": 1.0,
	"mmlu_college_biology": 1.0,
	"mmlu_college_chemistry": 1.0,
	"mmlu_college_computer_science": 1.0,
	"mmlu_college_mathematics": 1.0,
	"mmlu_college_medicine": 1.0,
	"mmlu_college_physics": 1.0,
	"mmlu_computer_security": 1.0,
	"mmlu_conceptual_physics": 1.0,
	"mmlu_econometrics": 1.0,
	"mmlu_electrical_engineering": 1.0,
	"mmlu_elementary_mathematics": 1.0,
	"mmlu_formal_logic": 1.0,
	"mmlu_global_facts": 1.0,
	"mmlu_high_school_biology": 1.0,
	"mmlu_high_school_chemistry": 1.0,
	"mmlu_high_school_computer_science": 1.0,
	"mmlu_high_school_european_history": 1.0,
	"mmlu_high_school_geography": 1.0,
	"mmlu_high_school_government_and_politics": 1.0,
	"mmlu_high_school_macroeconomics": 1.0,
	"mmlu_high_school_mathematics": 1.0,
	"mmlu_high_school_microeconomics": 1.0,
	"mmlu_high_school_physics": 1.0,
	"mmlu_high_school_psychology": 1.0,
	"mmlu_high_school_statistics": 1.0,
	"mmlu_high_school_us_history": 1.0,
	"mmlu_high_school_world_history": 1.0,
	"mmlu_human_aging": 1.0,
	"mmlu_human_sexuality": 1.0,
	"mmlu_humanities": 2,
	"mmlu_international_law": 1.0,
	"mmlu_jurisprudence": 1.0,
	"mmlu_logical_fallacies": 1.0,
	"mmlu_machine_learning": 1.0,
	"mmlu_management": 1.0,
	"mmlu_marketing": 1.0,
	"mmlu_medical_genetics": 1.0,
	"mmlu_miscellaneous": 1.0,
	"mmlu_moral_disputes": 1.0,
	"mmlu_moral_scenarios": 1.0,
	"mmlu_nutrition": 1.0,
	"mmlu_other": 2,
	"mmlu_philosophy": 1.0,
	"mmlu_prehistory": 1.0,
	"mmlu_professional_accounting": 1.0,
	"mmlu_professional_law": 1.0,
	"mmlu_professional_medicine": 1.0,
	"mmlu_professional_psychology": 1.0,
	"mmlu_public_relations": 1.0,
	"mmlu_security_studies": 1.0,
	"mmlu_social_sciences": 2,
	"mmlu_sociology": 1.0,
	"mmlu_stem": 2,
	"mmlu_us_foreign_policy": 1.0,
	"mmlu_virology": 1.0,
	"mmlu_world_religions": 1.0
	},
	"n-shot": {
	"arc_challenge": 0,
	"mmlu_abstract_algebra": 0,
	"mmlu_anatomy": 0,
	"mmlu_astronomy": 0,
	"mmlu_business_ethics": 0,
	"mmlu_clinical_knowledge": 0,
	"mmlu_college_biology": 0,
	"mmlu_college_chemistry": 0,
	"mmlu_college_computer_science": 0,
	"mmlu_college_mathematics": 0,
	"mmlu_college_medicine": 0,
	"mmlu_college_physics": 0,
	"mmlu_computer_security": 0,
	"mmlu_conceptual_physics": 0,
	"mmlu_econometrics": 0,
	"mmlu_electrical_engineering": 0,
	"mmlu_elementary_mathematics": 0,
	"mmlu_formal_logic": 0,
	"mmlu_global_facts": 0,
	"mmlu_high_school_biology": 0,
	"mmlu_high_school_chemistry": 0,
	"mmlu_high_school_computer_science": 0,
	"mmlu_high_school_european_history": 0,
	"mmlu_high_school_geography": 0,
	"mmlu_high_school_government_and_politics": 0,
	"mmlu_high_school_macroeconomics": 0,
	"mmlu_high_school_mathematics": 0,
	"mmlu_high_school_microeconomics": 0,
	"mmlu_high_school_physics": 0,
	"mmlu_high_school_psychology": 0,
	"mmlu_high_school_statistics": 0,
	"mmlu_high_school_us_history": 0,
	"mmlu_high_school_world_history": 0,
	"mmlu_human_aging": 0,
	"mmlu_human_sexuality": 0,
	"mmlu_international_law": 0,
	"mmlu_jurisprudence": 0,
	"mmlu_logical_fallacies": 0,
	"mmlu_machine_learning": 0,
	"mmlu_management": 0,
	"mmlu_marketing": 0,
	"mmlu_medical_genetics": 0,
	"mmlu_miscellaneous": 0,
	"mmlu_moral_disputes": 0,
	"mmlu_moral_scenarios": 0,
	"mmlu_nutrition": 0,
	"mmlu_philosophy": 0,
	"mmlu_prehistory": 0,
	"mmlu_professional_accounting": 0,
	"mmlu_professional_law": 0,
	"mmlu_professional_medicine": 0,
	"mmlu_professional_psychology": 0,
	"mmlu_public_relations": 0,
	"mmlu_security_studies": 0,
	"mmlu_sociology": 0,
	"mmlu_us_foreign_policy": 0,
	"mmlu_virology": 0,
	"mmlu_world_religions": 0
	},
	"higher_is_better": {
	"arc_challenge": {
	"acc": true,
	"acc_norm": true
	},
	"mmlu": {
	"acc": true
	},
	"mmlu_abstract_algebra": {
	"acc": true
	},
	"mmlu_anatomy": {
	"acc": true
	},
	"mmlu_astronomy": {
	"acc": true
	},
	"mmlu_business_ethics": {
	"acc": true
	},
	"mmlu_clinical_knowledge": {
	"acc": true
	},
	"mmlu_college_biology": {
	"acc": true
	},
	"mmlu_college_chemistry": {
	"acc": true
	},
	"mmlu_college_computer_science": {
	"acc": true
	},
	"mmlu_college_mathematics": {
	"acc": true
	},
	"mmlu_college_medicine": {
	"acc": true
	},
	"mmlu_college_physics": {
	"acc": true
	},
	"mmlu_computer_security": {
	"acc": true
	},
	"mmlu_conceptual_physics": {
	"acc": true
	},
	"mmlu_econometrics": {
	"acc": true
	},
	"mmlu_electrical_engineering": {
	"acc": true
	},
	"mmlu_elementary_mathematics": {
	"acc": true
	},
	"mmlu_formal_logic": {
	"acc": true
	},
	"mmlu_global_facts": {
	"acc": true
	},
	"mmlu_high_school_biology": {
	"acc": true
	},
	"mmlu_high_school_chemistry": {
	"acc": true
	},
	"mmlu_high_school_computer_science": {
	"acc": true
	},
	"mmlu_high_school_european_history": {
	"acc": true
	},
	"mmlu_high_school_geography": {
	"acc": true
	},
	"mmlu_high_school_government_and_politics": {
	"acc": true
	},
	"mmlu_high_school_macroeconomics": {
	"acc": true
	},
	"mmlu_high_school_mathematics": {
	"acc": true
	},
	"mmlu_high_school_microeconomics": {
	"acc": true
	},
	"mmlu_high_school_physics": {
	"acc": true
	},
	"mmlu_high_school_psychology": {
	"acc": true
	},
	"mmlu_high_school_statistics": {
	"acc": true
	},
	"mmlu_high_school_us_history": {
	"acc": true
	},
	"mmlu_high_school_world_history": {
	"acc": true
	},
	"mmlu_human_aging": {
	"acc": true
	},
	"mmlu_human_sexuality": {
	"acc": true
	},
	"mmlu_humanities": {
	"acc": true
	},
	"mmlu_international_law": {
	"acc": true
	},
	"mmlu_jurisprudence": {
	"acc": true
	},
	"mmlu_logical_fallacies": {
	"acc": true
	},
	"mmlu_machine_learning": {
	"acc": true
	},
	"mmlu_management": {
	"acc": true
	},
	"mmlu_marketing": {
	"acc": true
	},
	"mmlu_medical_genetics": {
	"acc": true
	},
	"mmlu_miscellaneous": {
	"acc": true
	},
	"mmlu_moral_disputes": {
	"acc": true
	},
	"mmlu_moral_scenarios": {
	"acc": true
	},
	"mmlu_nutrition": {
	"acc": true
	},
	"mmlu_other": {
	"acc": true
	},
	"mmlu_philosophy": {
	"acc": true
	},
	"mmlu_prehistory": {
	"acc": true
	},
	"mmlu_professional_accounting": {
	"acc": true
	},
	"mmlu_professional_law": {
	"acc": true
	},
	"mmlu_professional_medicine": {
	"acc": true
	},
	"mmlu_professional_psychology": {
	"acc": true
	},
	"mmlu_public_relations": {
	"acc": true
	},
	"mmlu_security_studies": {
	"acc": true
	},
	"mmlu_social_sciences": {
	"acc": true
	},
	"mmlu_sociology": {
	"acc": true
	},
	"mmlu_stem": {
	"acc": true
	},
	"mmlu_us_foreign_policy": {
	"acc": true
	},
	"mmlu_virology": {
	"acc": true
	},
	"mmlu_world_religions": {
	"acc": true
	}
	},
	"n-samples": {
	"mmlu_conceptual_physics": {
	"original": 235,
	"effective": 235
	},
	"mmlu_high_school_chemistry": {
	"original": 203,
	"effective": 203
	},
	"mmlu_college_biology": {
	"original": 144,
	"effective": 144
	},
	"mmlu_college_chemistry": {
	"original": 100,
	"effective": 100
	},
	"mmlu_machine_learning": {
	"original": 112,
	"effective": 112
	},
	"mmlu_elementary_mathematics": {
	"original": 378,
	"effective": 378
	},
	"mmlu_abstract_algebra": {
	"original": 100,
	"effective": 100
	},
	"mmlu_astronomy": {
	"original": 152,
	"effective": 152
	},
	"mmlu_high_school_statistics": {
	"original": 216,
	"effective": 216
	},
	"mmlu_anatomy": {
	"original": 135,
	"effective": 135
	},
	"mmlu_college_mathematics": {
	"original": 100,
	"effective": 100
	},
	"mmlu_computer_security": {
	"original": 100,
	"effective": 100
	},
	"mmlu_college_computer_science": {
	"original": 100,
	"effective": 100
	},
	"mmlu_electrical_engineering": {
	"original": 145,
	"effective": 145
	},
	"mmlu_college_physics": {
	"original": 102,
	"effective": 102
	},
	"mmlu_high_school_computer_science": {
	"original": 100,
	"effective": 100
	},
	"mmlu_high_school_physics": {
	"original": 151,
	"effective": 151
	},
	"mmlu_high_school_biology": {
	"original": 310,
	"effective": 310
	},
	"mmlu_high_school_mathematics": {
	"original": 270,
	"effective": 270
	},
	"mmlu_medical_genetics": {
	"original": 100,
	"effective": 100
	},
	"mmlu_global_facts": {
	"original": 100,
	"effective": 100
	},
	"mmlu_marketing": {
	"original": 234,
	"effective": 234
	},
	"mmlu_college_medicine": {
	"original": 173,
	"effective": 173
	},
	"mmlu_human_aging": {
	"original": 223,
	"effective": 223
	},
	"mmlu_virology": {
	"original": 166,
	"effective": 166
	},
	"mmlu_business_ethics": {
	"original": 100,
	"effective": 100
	},
	"mmlu_clinical_knowledge": {
	"original": 265,
	"effective": 265
	},
	"mmlu_professional_medicine": {
	"original": 272,
	"effective": 272
	},
	"mmlu_nutrition": {
	"original": 306,
	"effective": 306
	},
	"mmlu_miscellaneous": {
	"original": 783,
	"effective": 783
	},
	"mmlu_professional_accounting": {
	"original": 282,
	"effective": 282
	},
	"mmlu_management": {
	"original": 103,
	"effective": 103
	},
	"mmlu_public_relations": {
	"original": 110,
	"effective": 110
	},
	"mmlu_sociology": {
	"original": 201,
	"effective": 201
	},
	"mmlu_security_studies": {
	"original": 245,
	"effective": 245
	},
	"mmlu_high_school_government_and_politics": {
	"original": 193,
	"effective": 193
	},
	"mmlu_high_school_psychology": {
	"original": 545,
	"effective": 545
	},
	"mmlu_human_sexuality": {
	"original": 131,
	"effective": 131
	},
	"mmlu_us_foreign_policy": {
	"original": 100,
	"effective": 100
	},
	"mmlu_high_school_microeconomics": {
	"original": 238,
	"effective": 238
	},
	"mmlu_econometrics": {
	"original": 114,
	"effective": 114
	},
	"mmlu_high_school_macroeconomics": {
	"original": 390,
	"effective": 390
	},
	"mmlu_high_school_geography": {
	"original": 198,
	"effective": 198
	},
	"mmlu_professional_psychology": {
	"original": 612,
	"effective": 612
	},
	"mmlu_formal_logic": {
	"original": 126,
	"effective": 126
	},
	"mmlu_prehistory": {
	"original": 324,
	"effective": 324
	},
	"mmlu_world_religions": {
	"original": 171,
	"effective": 171
	},
	"mmlu_philosophy": {
	"original": 311,
	"effective": 311
	},
	"mmlu_high_school_world_history": {
	"original": 237,
	"effective": 237
	},
	"mmlu_professional_law": {
	"original": 1534,
	"effective": 1534
	},
	"mmlu_high_school_us_history": {
	"original": 204,
	"effective": 204
	},
	"mmlu_logical_fallacies": {
	"original": 163,
	"effective": 163
	},
	"mmlu_international_law": {
	"original": 121,
	"effective": 121
	},
	"mmlu_high_school_european_history": {
	"original": 165,
	"effective": 165
	},
	"mmlu_moral_disputes": {
	"original": 346,
	"effective": 346
	},
	"mmlu_moral_scenarios": {
	"original": 895,
	"effective": 895
	},
	"mmlu_jurisprudence": {
	"original": 108,
	"effective": 108
	},
	"arc_challenge": {
	"original": 1172,
	"effective": 1172
	}
	},
	"config": {
	"model": "hf",
	"model_args": "pretrained=DeepSeek-R1-Distill-Qwen-32B-gptq-4bit,gptqmodel=True",
	"model_num_parameters": 5736502272,
	"model_dtype": "torch.float16",
	"model_revision": "main",
	"model_sha": "",
	"batch_size": 1,
	"batch_sizes": [],
	"device": null,
	"use_cache": null,
	"limit": null,
	"bootstrap_iters": 100000,
	"gen_kwargs": null,
	"random_seed": 0,
	"numpy_seed": 1234,
	"torch_seed": 1234,
	"fewshot_seed": 1234
	},
	"git_hash": "b1f7295",
	"date": 1739700827.6612206,
	"pretty_env_info": "PyTorch version: 2.5.0+cu124\nIs debug build: False\nCUDA used to build PyTorch: 12.4\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 24.04.1 LTS (x86_64)\nGCC version: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.39\n\nPython version: 3.10.16 \| packaged by conda-forge \| (main, Dec 5 2024, 14:16:10) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-6.8.0-1021-azure-x86_64-with-glibc2.39\nIs CUDA available: True\nCUDA runtime version: 12.0.140\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100 80GB PCIe\nNvidia driver version: 550.54.14\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 24\nOn-line CPU(s) list: 0-23\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 24\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves user_shstk clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 768 KiB (24 instances)\nL1i cache: 768 KiB (24 instances)\nL2 cache: 12 MiB (24 instances)\nL3 cache: 96 MiB (3 instances)\nNUMA node(s): 1\nNUMA node0 CPU(s): 0-23\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Vulnerable: Safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] gptqmodel==1.7.4+cu124torch2.5\n[pip3] numpy==1.26.4\n[pip3] torch==2.5.0\n[pip3] torchaudio==2.5.0+cu124\n[pip3] torchvision==0.20.0\n[pip3] triton==3.1.0\n[conda] cudatoolkit 11.8.0 h4ba93d1_13 conda-forge\n[conda] gptqmodel 1.7.4+cu124torch2.5 pypi_0 pypi\n[conda] libtorch 2.5.1 cpu_generic_h90be84d_11 conda-forge\n[conda] nomkl 1.0 h5ca1d4c_0 conda-forge\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.5.0 pypi_0 pypi\n[conda] torchaudio 2.5.0+cu124 pypi_0 pypi\n[conda] torchvision 0.20.0 pypi_0 pypi\n[conda] triton 3.1.0 pypi_0 pypi",
	"transformers_version": "4.48.2",
	"upper_git_hash": null,
	"tokenizer_pad_token": [
	"<unk>",
	"128244"
	],
	"tokenizer_eos_token": [
	"<｜end▁of▁sentence｜>",
	"151643"
	],
	"tokenizer_bos_token": [
	"<｜begin▁of▁sentence｜>",
	"151646"
	],
	"eot_token_id": 151643,
	"max_length": 131072,
	"task_hashes": {
	"mmlu_conceptual_physics": "e44e160e9de59b66b27f9845286798fa84b1f978d47fab83916947fdce9d558a",
	"mmlu_high_school_chemistry": "7b0aa278f3d736a17f8272ffebfb731297b2ef07665fa775a01b48a98584e63d",
	"mmlu_college_biology": "67badf9a8a9eadf4793aed211c6114416aac791dfa1e2b3b1a2b2d0418fe83d9",
	"mmlu_college_chemistry": "7cf7bbde59ef7d3ac36ad6a1f09823c10151aaa37313c53909a9249820ac42a7",
	"mmlu_machine_learning": "5d44d6f9356eea8613bccda3f7c0de4abe0652a7087e3cc8ed5ad0a54601b4aa",
	"mmlu_elementary_mathematics": "e61d9026fa8be2b227195ed796ff07c3a609225ef081bcb98ae6c5e4dd85dc9c",
	"mmlu_abstract_algebra": "9af9d070b9cf616053d53513d6b61870f758da004979b1d2a5f5237fda624bc2",
	"mmlu_astronomy": "d1158c2b8a534c85e29a0ef6dbf97f76d80d2a2b8ed6dfd4ea46f97bd7650652",
	"mmlu_high_school_statistics": "251d2d800b0c4b41d4cd68ca9b7b7e1003f02a56da68113d1bf6777da34a15c3",
	"mmlu_anatomy": "bcff5e29b7522dd3a9e38abad9ed23138347ffbdb5c56d4d0a9a0e7f8d1b781c",
	"mmlu_college_mathematics": "7de9fea1cc7d1df17e55653035838e0e9251e16df8d23fe9905feb3f52c2d295",
	"mmlu_computer_security": "bcf86d055731822d43b9ea71e361a44b2a1e884f74790e189256e69e1154437b",
	"mmlu_college_computer_science": "7e6c0169773451f5a3e224fc9da2c21e0f218283d8176a75d9a44f1ec37fbce1",
	"mmlu_electrical_engineering": "d64db9707012d81e0924fa3a785ff419b53faf7e9a13781be3596287aa7f6f28",
	"mmlu_college_physics": "7ceeb19eb9785b4372c4facb61eb51e4d9a199a249c6576796bd7e77147f96d4",
	"mmlu_high_school_computer_science": "feafcc88c4efe7b7d2d4058672c4895b985bf0947f983055a771da137827f467",
	"mmlu_high_school_physics": "fd4e84b9f983621e61e4f4efd6833659bad4c1dd4ec5e11669dc620cd11124a7",
	"mmlu_high_school_biology": "788524572d0bad43c502d0377893abeba647f95d7bbfda03ac0c170f3c2ebe3d",
	"mmlu_high_school_mathematics": "3498bee9814d09ea3c4785a8ff7e059122902d599fe669f20b548832eee5a210",
	"mmlu_medical_genetics": "38c379b96a9b5dad162fc3ca02bd6860720931228c1bbdb563444676aa560d8b",
	"mmlu_global_facts": "6a24ab3772264f824d644148a9e594c9311a9abb832e2e01ebb8e30295f21a73",
	"mmlu_marketing": "3c35ab45fe954b4a99c49a1371de508eb804a1d404755ed579700f8b07339eb3",
	"mmlu_college_medicine": "a99f25347c639f44d59f14688e63b372d410c6fdeda20460e6da0887b58a08dc",
	"mmlu_human_aging": "2d3b53f216e855735c554601d17806d772cc0aa32c6e0a3ad56be0b596c10597",
	"mmlu_virology": "e37defad93a8fed48d1cd444617c16cec31a4654b01bd3b5865248a278355cb0",
	"mmlu_business_ethics": "f43c4f96059d3dd1223e3a2b5f7bef9c0e8c934d48174bfc067207642614683e",
	"mmlu_clinical_knowledge": "bbfbeaaf991677c9abf8a2542c73858a18d762cead9d0c5fde817657edc3e0bb",
	"mmlu_professional_medicine": "d9d3810d35fda320ed3acf4bd62f08e82cda46317d211255a1685f32995de66f",
	"mmlu_nutrition": "ee22165a7e6e52d891002d0c3f96577cb970623135cb95c5ccd70a498beacd11",
	"mmlu_miscellaneous": "7318105f736493ead94ab2560f0741a3c281478c46068a525bbb5644c30cf883",
	"mmlu_professional_accounting": "1f81d684d03a3c64557a8f17b21d0b6db31e3652708c5cfa2de01cdbfd1fba64",
	"mmlu_management": "b86f3b3e03c37b74eb84b97c410c9e591cc30283bad42b3310cb1667f4d6c496",
	"mmlu_public_relations": "cd4b6c1cf0e2535103be225017caac7de8d0629aa2852428df5a44e1979ee8ed",
	"mmlu_sociology": "41ca5f38e3c2da34027eddf39502da75b70cbaf1f4875b0db40c672c2a7c9e77",
	"mmlu_security_studies": "9090d84bb431ed36c63d72c46ab14c8c831ef37d3721656d9619c08817a272e8",
	"mmlu_high_school_government_and_politics": "2c9006d3e6a17d51db851e5a08687293b4160d9448d14b89589f2de669fdb3f3",
	"mmlu_high_school_psychology": "3fb17b0fa3d98fada697e70060bfd231d4e78e1af47a0cfa2ea27a67dfe6ec5e",
	"mmlu_human_sexuality": "4e3e5f3c9f9c77c24553cfd398f4c132c9dbe05588ddda82d614e53ab09132a8",
	"mmlu_us_foreign_policy": "a9ca53568f3daa8c23e5c3746ba889ad65eb69d3f0add04f880edeb6f968eee5",
	"mmlu_high_school_microeconomics": "b2d3b5f2ea56678868b043b83da826713bf4faa330f612bdbd7a8a0a264737f9",
	"mmlu_econometrics": "1bc2e20960800cabc5871cfeb1273faebf7135cdf422b4584cf4df229f6c7796",
	"mmlu_high_school_macroeconomics": "af91dcb4b4f0fdd03f1aa2400e00a85515f4cc3d78921c9344192de60512511a",
	"mmlu_high_school_geography": "e2b10ea70273ee1ea3067d2e2d0b01227a4e845f348b966caa83344e0587d0b2",
	"mmlu_professional_psychology": "5df9d5c9d999c021a60f157a18403bcaf39f7187f9c28b7b21b04ed73f3e0efc",
	"mmlu_formal_logic": "904413b3c48c56c27d51a026467f86dd7ebb996e59b4a3547e5523b98e1ca76c",
	"mmlu_prehistory": "01ff8c2029d2e816a5c547cb11b06f16602f4cb9543e2d0ad3c7b7e7d4567930",
	"mmlu_world_religions": "362d75721ca689e2004dbaaa888f8f0733f15366aa2ebf4aa9ee7e33ee03003b",
	"mmlu_philosophy": "7d37dcd5c5a2fdb4783fb47aac7422f78f549b2dd699fd62ea7b43b2ed3fa9e4",
	"mmlu_high_school_world_history": "638f3897c2875cd8b74d8f860bfc0684e80f2ad05dfc3f10013635256ea14ba5",
	"mmlu_professional_law": "944247baa44b214a437996293db7462d35b8bd9e70ebb63a37e1d07215c5e3db",
	"mmlu_high_school_us_history": "4dd38134bf4919c2b7efffd8740c0ea47a7ae26fa4415ea58be296e698ac119c",
	"mmlu_logical_fallacies": "2abf7f08f106c4f1d107b38934447d184962341f0424ec3473d2ae53f775919d",
	"mmlu_international_law": "03942fc6ba90528f4cf7d9c9ea7ad11af33ce40043dd4eaed9504c5d3d740de2",
	"mmlu_high_school_european_history": "d296775eca79ec0a144615e167665255e0db747c166c0baebc63e0cef2bce142",
	"mmlu_moral_disputes": "9df8fb79547677e18eb458a8e05119b70144f617a09383c04fce8827a90f4778",
	"mmlu_moral_scenarios": "de57e5892ade37f040a28c23608285fbbaf33bf9b1dd123382611f4de1528fd0",
	"mmlu_jurisprudence": "a03927ba52301ee52619a8d2c0a270945d6f1b5a84b3a23678595bb8b1bcf9c1",
	"arc_challenge": "84e51e80f164ae698e98637454ad407407ccc6ed14441621fe4a5e7f399d1907"
	},
	"model_source": "hf",
	"model_name": "DeepSeek-R1-Distill-Qwen-32B-gptq-4bit",
	"model_name_sanitized": "DeepSeek-R1-Distill-Qwen-32B-gptq-4bit",
	"system_instruction": null,
	"system_instruction_sha": null,
	"fewshot_as_multiturn": false,
	"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n'}}{% endif %}",
	"chat_template_sha": "56a1447ad31926fdc21fb07e56e5642bd9c850c4f52d8c8af7bbe5f079a84f5f",
	"start_time": 567333.679121267,
	"end_time": 569114.181343909,
	"total_evaluation_time_seconds": "1780.5022226419533"
	}