open-r1-eval-leaderboard
/
eval_results
/HuggingFaceH4
/qwen-1.5-1.8b-dpo
/v1.13
/mmlu
/results_2024-03-27T16-06-54.401883.json
![edbeeching's picture](https://cdn-avatars.huggingface.co/v1/production/uploads/1644220542819-noauth.jpeg)
edbeeching
HF staff
Upload eval_results/HuggingFaceH4/qwen-1.5-1.8b-dpo/v1.13/mmlu/results_2024-03-27T16-06-54.401883.json with huggingface_hub
e3afa16
verified
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 169825.381866122, | |
"end_time": 170696.599000437, | |
"total_evaluation_time_secondes": "871.2171343149967", | |
"model_name": "HuggingFaceH4/qwen-1.5-1.8b-dpo", | |
"model_sha": "a2be15b94de31aba8d978874a5e94b551ae32ba1", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "3.79 GB", | |
"config": null | |
}, | |
"results": { | |
"leaderboard|mmlu:abstract_algebra|5": { | |
"acc": 0.26, | |
"acc_stderr": 0.04408440022768078 | |
}, | |
"leaderboard|mmlu:anatomy|5": { | |
"acc": 0.45185185185185184, | |
"acc_stderr": 0.04299268905480864 | |
}, | |
"leaderboard|mmlu:astronomy|5": { | |
"acc": 0.5131578947368421, | |
"acc_stderr": 0.04067533136309173 | |
}, | |
"leaderboard|mmlu:business_ethics|5": { | |
"acc": 0.42, | |
"acc_stderr": 0.049604496374885836 | |
}, | |
"leaderboard|mmlu:clinical_knowledge|5": { | |
"acc": 0.5433962264150943, | |
"acc_stderr": 0.030656748696739435 | |
}, | |
"leaderboard|mmlu:college_biology|5": { | |
"acc": 0.4583333333333333, | |
"acc_stderr": 0.04166666666666666 | |
}, | |
"leaderboard|mmlu:college_chemistry|5": { | |
"acc": 0.33, | |
"acc_stderr": 0.047258156262526045 | |
}, | |
"leaderboard|mmlu:college_computer_science|5": { | |
"acc": 0.42, | |
"acc_stderr": 0.049604496374885836 | |
}, | |
"leaderboard|mmlu:college_mathematics|5": { | |
"acc": 0.27, | |
"acc_stderr": 0.0446196043338474 | |
}, | |
"leaderboard|mmlu:college_medicine|5": { | |
"acc": 0.4624277456647399, | |
"acc_stderr": 0.0380168510452446 | |
}, | |
"leaderboard|mmlu:college_physics|5": { | |
"acc": 0.3235294117647059, | |
"acc_stderr": 0.0465501041131962 | |
}, | |
"leaderboard|mmlu:computer_security|5": { | |
"acc": 0.57, | |
"acc_stderr": 0.04975698519562428 | |
}, | |
"leaderboard|mmlu:conceptual_physics|5": { | |
"acc": 0.4, | |
"acc_stderr": 0.03202563076101737 | |
}, | |
"leaderboard|mmlu:econometrics|5": { | |
"acc": 0.3333333333333333, | |
"acc_stderr": 0.044346007015849245 | |
}, | |
"leaderboard|mmlu:electrical_engineering|5": { | |
"acc": 0.4896551724137931, | |
"acc_stderr": 0.041657747757287644 | |
}, | |
"leaderboard|mmlu:elementary_mathematics|5": { | |
"acc": 0.41005291005291006, | |
"acc_stderr": 0.025331202438944406 | |
}, | |
"leaderboard|mmlu:formal_logic|5": { | |
"acc": 0.29365079365079366, | |
"acc_stderr": 0.04073524322147125 | |
}, | |
"leaderboard|mmlu:global_facts|5": { | |
"acc": 0.3, | |
"acc_stderr": 0.046056618647183814 | |
}, | |
"leaderboard|mmlu:high_school_biology|5": { | |
"acc": 0.5387096774193548, | |
"acc_stderr": 0.02835863485983694 | |
}, | |
"leaderboard|mmlu:high_school_chemistry|5": { | |
"acc": 0.3891625615763547, | |
"acc_stderr": 0.034304624161038716 | |
}, | |
"leaderboard|mmlu:high_school_computer_science|5": { | |
"acc": 0.53, | |
"acc_stderr": 0.05016135580465919 | |
}, | |
"leaderboard|mmlu:high_school_european_history|5": { | |
"acc": 0.6121212121212121, | |
"acc_stderr": 0.038049136539710114 | |
}, | |
"leaderboard|mmlu:high_school_geography|5": { | |
"acc": 0.5454545454545454, | |
"acc_stderr": 0.03547601494006938 | |
}, | |
"leaderboard|mmlu:high_school_government_and_politics|5": { | |
"acc": 0.6787564766839378, | |
"acc_stderr": 0.033699508685490674 | |
}, | |
"leaderboard|mmlu:high_school_macroeconomics|5": { | |
"acc": 0.441025641025641, | |
"acc_stderr": 0.025174048384000742 | |
}, | |
"leaderboard|mmlu:high_school_mathematics|5": { | |
"acc": 0.2962962962962963, | |
"acc_stderr": 0.027840811495871934 | |
}, | |
"leaderboard|mmlu:high_school_microeconomics|5": { | |
"acc": 0.47058823529411764, | |
"acc_stderr": 0.03242225027115006 | |
}, | |
"leaderboard|mmlu:high_school_physics|5": { | |
"acc": 0.304635761589404, | |
"acc_stderr": 0.03757949922943342 | |
}, | |
"leaderboard|mmlu:high_school_psychology|5": { | |
"acc": 0.6146788990825688, | |
"acc_stderr": 0.02086585085279412 | |
}, | |
"leaderboard|mmlu:high_school_statistics|5": { | |
"acc": 0.3333333333333333, | |
"acc_stderr": 0.03214952147802748 | |
}, | |
"leaderboard|mmlu:high_school_us_history|5": { | |
"acc": 0.5098039215686274, | |
"acc_stderr": 0.03508637358630572 | |
}, | |
"leaderboard|mmlu:high_school_world_history|5": { | |
"acc": 0.6075949367088608, | |
"acc_stderr": 0.03178471874564729 | |
}, | |
"leaderboard|mmlu:human_aging|5": { | |
"acc": 0.47533632286995514, | |
"acc_stderr": 0.033516951676526276 | |
}, | |
"leaderboard|mmlu:human_sexuality|5": { | |
"acc": 0.4961832061068702, | |
"acc_stderr": 0.043851623256015534 | |
}, | |
"leaderboard|mmlu:international_law|5": { | |
"acc": 0.5950413223140496, | |
"acc_stderr": 0.04481137755942469 | |
}, | |
"leaderboard|mmlu:jurisprudence|5": { | |
"acc": 0.5833333333333334, | |
"acc_stderr": 0.04766075165356461 | |
}, | |
"leaderboard|mmlu:logical_fallacies|5": { | |
"acc": 0.4723926380368098, | |
"acc_stderr": 0.0392237829061099 | |
}, | |
"leaderboard|mmlu:machine_learning|5": { | |
"acc": 0.33035714285714285, | |
"acc_stderr": 0.04464285714285712 | |
}, | |
"leaderboard|mmlu:management|5": { | |
"acc": 0.6504854368932039, | |
"acc_stderr": 0.047211885060971716 | |
}, | |
"leaderboard|mmlu:marketing|5": { | |
"acc": 0.7649572649572649, | |
"acc_stderr": 0.027778835904935434 | |
}, | |
"leaderboard|mmlu:medical_genetics|5": { | |
"acc": 0.53, | |
"acc_stderr": 0.05016135580465919 | |
}, | |
"leaderboard|mmlu:miscellaneous|5": { | |
"acc": 0.6091954022988506, | |
"acc_stderr": 0.017448366067062526 | |
}, | |
"leaderboard|mmlu:moral_disputes|5": { | |
"acc": 0.49710982658959535, | |
"acc_stderr": 0.026918645383239015 | |
}, | |
"leaderboard|mmlu:moral_scenarios|5": { | |
"acc": 0.27262569832402234, | |
"acc_stderr": 0.014893391735249588 | |
}, | |
"leaderboard|mmlu:nutrition|5": { | |
"acc": 0.5359477124183006, | |
"acc_stderr": 0.028555827516528787 | |
}, | |
"leaderboard|mmlu:philosophy|5": { | |
"acc": 0.5080385852090032, | |
"acc_stderr": 0.02839442137098453 | |
}, | |
"leaderboard|mmlu:prehistory|5": { | |
"acc": 0.48148148148148145, | |
"acc_stderr": 0.027801656212323667 | |
}, | |
"leaderboard|mmlu:professional_accounting|5": { | |
"acc": 0.3971631205673759, | |
"acc_stderr": 0.02918980567358709 | |
}, | |
"leaderboard|mmlu:professional_law|5": { | |
"acc": 0.36962190352020863, | |
"acc_stderr": 0.012328445778575255 | |
}, | |
"leaderboard|mmlu:professional_medicine|5": { | |
"acc": 0.45955882352941174, | |
"acc_stderr": 0.03027332507734576 | |
}, | |
"leaderboard|mmlu:professional_psychology|5": { | |
"acc": 0.4035947712418301, | |
"acc_stderr": 0.019848280168401167 | |
}, | |
"leaderboard|mmlu:public_relations|5": { | |
"acc": 0.509090909090909, | |
"acc_stderr": 0.0478833976870286 | |
}, | |
"leaderboard|mmlu:security_studies|5": { | |
"acc": 0.47346938775510206, | |
"acc_stderr": 0.03196412734523272 | |
}, | |
"leaderboard|mmlu:sociology|5": { | |
"acc": 0.5472636815920398, | |
"acc_stderr": 0.03519702717576915 | |
}, | |
"leaderboard|mmlu:us_foreign_policy|5": { | |
"acc": 0.66, | |
"acc_stderr": 0.04760952285695237 | |
}, | |
"leaderboard|mmlu:virology|5": { | |
"acc": 0.3855421686746988, | |
"acc_stderr": 0.037891344246115496 | |
}, | |
"leaderboard|mmlu:world_religions|5": { | |
"acc": 0.6023391812865497, | |
"acc_stderr": 0.03753638955761691 | |
}, | |
"leaderboard|mmlu:_average|5": { | |
"acc": 0.46897683319859057, | |
"acc_stderr": 0.036336574094773054 | |
} | |
}, | |
"versions": { | |
"leaderboard|mmlu:abstract_algebra|5": 0, | |
"leaderboard|mmlu:anatomy|5": 0, | |
"leaderboard|mmlu:astronomy|5": 0, | |
"leaderboard|mmlu:business_ethics|5": 0, | |
"leaderboard|mmlu:clinical_knowledge|5": 0, | |
"leaderboard|mmlu:college_biology|5": 0, | |
"leaderboard|mmlu:college_chemistry|5": 0, | |
"leaderboard|mmlu:college_computer_science|5": 0, | |
"leaderboard|mmlu:college_mathematics|5": 0, | |
"leaderboard|mmlu:college_medicine|5": 0, | |
"leaderboard|mmlu:college_physics|5": 0, | |
"leaderboard|mmlu:computer_security|5": 0, | |
"leaderboard|mmlu:conceptual_physics|5": 0, | |
"leaderboard|mmlu:econometrics|5": 0, | |
"leaderboard|mmlu:electrical_engineering|5": 0, | |
"leaderboard|mmlu:elementary_mathematics|5": 0, | |
"leaderboard|mmlu:formal_logic|5": 0, | |
"leaderboard|mmlu:global_facts|5": 0, | |
"leaderboard|mmlu:high_school_biology|5": 0, | |
"leaderboard|mmlu:high_school_chemistry|5": 0, | |
"leaderboard|mmlu:high_school_computer_science|5": 0, | |
"leaderboard|mmlu:high_school_european_history|5": 0, | |
"leaderboard|mmlu:high_school_geography|5": 0, | |
"leaderboard|mmlu:high_school_government_and_politics|5": 0, | |
"leaderboard|mmlu:high_school_macroeconomics|5": 0, | |
"leaderboard|mmlu:high_school_mathematics|5": 0, | |
"leaderboard|mmlu:high_school_microeconomics|5": 0, | |
"leaderboard|mmlu:high_school_physics|5": 0, | |
"leaderboard|mmlu:high_school_psychology|5": 0, | |
"leaderboard|mmlu:high_school_statistics|5": 0, | |
"leaderboard|mmlu:high_school_us_history|5": 0, | |
"leaderboard|mmlu:high_school_world_history|5": 0, | |
"leaderboard|mmlu:human_aging|5": 0, | |
"leaderboard|mmlu:human_sexuality|5": 0, | |
"leaderboard|mmlu:international_law|5": 0, | |
"leaderboard|mmlu:jurisprudence|5": 0, | |
"leaderboard|mmlu:logical_fallacies|5": 0, | |
"leaderboard|mmlu:machine_learning|5": 0, | |
"leaderboard|mmlu:management|5": 0, | |
"leaderboard|mmlu:marketing|5": 0, | |
"leaderboard|mmlu:medical_genetics|5": 0, | |
"leaderboard|mmlu:miscellaneous|5": 0, | |
"leaderboard|mmlu:moral_disputes|5": 0, | |
"leaderboard|mmlu:moral_scenarios|5": 0, | |
"leaderboard|mmlu:nutrition|5": 0, | |
"leaderboard|mmlu:philosophy|5": 0, | |
"leaderboard|mmlu:prehistory|5": 0, | |
"leaderboard|mmlu:professional_accounting|5": 0, | |
"leaderboard|mmlu:professional_law|5": 0, | |
"leaderboard|mmlu:professional_medicine|5": 0, | |
"leaderboard|mmlu:professional_psychology|5": 0, | |
"leaderboard|mmlu:public_relations|5": 0, | |
"leaderboard|mmlu:security_studies|5": 0, | |
"leaderboard|mmlu:sociology|5": 0, | |
"leaderboard|mmlu:us_foreign_policy|5": 0, | |
"leaderboard|mmlu:virology|5": 0, | |
"leaderboard|mmlu:world_religions|5": 0 | |
}, | |
"config_tasks": { | |
"leaderboard|mmlu:abstract_algebra": { | |
"name": "mmlu:abstract_algebra", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "abstract_algebra", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:anatomy": { | |
"name": "mmlu:anatomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "anatomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 135, | |
"effective_num_docs": 135, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:astronomy": { | |
"name": "mmlu:astronomy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "astronomy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 152, | |
"effective_num_docs": 152, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:business_ethics": { | |
"name": "mmlu:business_ethics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "business_ethics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:clinical_knowledge": { | |
"name": "mmlu:clinical_knowledge", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "clinical_knowledge", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 265, | |
"effective_num_docs": 265, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:college_biology": { | |
"name": "mmlu:college_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 144, | |
"effective_num_docs": 144, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:college_chemistry": { | |
"name": "mmlu:college_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:college_computer_science": { | |
"name": "mmlu:college_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:college_mathematics": { | |
"name": "mmlu:college_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:college_medicine": { | |
"name": "mmlu:college_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 173, | |
"effective_num_docs": 173, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:college_physics": { | |
"name": "mmlu:college_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "college_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 102, | |
"effective_num_docs": 102, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:computer_security": { | |
"name": "mmlu:computer_security", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "computer_security", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:conceptual_physics": { | |
"name": "mmlu:conceptual_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "conceptual_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 235, | |
"effective_num_docs": 235, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:econometrics": { | |
"name": "mmlu:econometrics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "econometrics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 114, | |
"effective_num_docs": 114, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:electrical_engineering": { | |
"name": "mmlu:electrical_engineering", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "electrical_engineering", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 145, | |
"effective_num_docs": 145, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:elementary_mathematics": { | |
"name": "mmlu:elementary_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "elementary_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 378, | |
"effective_num_docs": 378, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:formal_logic": { | |
"name": "mmlu:formal_logic", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "formal_logic", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 126, | |
"effective_num_docs": 126, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:global_facts": { | |
"name": "mmlu:global_facts", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "global_facts", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_biology": { | |
"name": "mmlu:high_school_biology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_biology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 310, | |
"effective_num_docs": 310, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_chemistry": { | |
"name": "mmlu:high_school_chemistry", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_chemistry", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 203, | |
"effective_num_docs": 203, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_computer_science": { | |
"name": "mmlu:high_school_computer_science", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_computer_science", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_european_history": { | |
"name": "mmlu:high_school_european_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_european_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 165, | |
"effective_num_docs": 165, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_geography": { | |
"name": "mmlu:high_school_geography", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_geography", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 198, | |
"effective_num_docs": 198, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_government_and_politics": { | |
"name": "mmlu:high_school_government_and_politics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_government_and_politics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 193, | |
"effective_num_docs": 193, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_macroeconomics": { | |
"name": "mmlu:high_school_macroeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_macroeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 390, | |
"effective_num_docs": 390, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_mathematics": { | |
"name": "mmlu:high_school_mathematics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_mathematics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 270, | |
"effective_num_docs": 270, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_microeconomics": { | |
"name": "mmlu:high_school_microeconomics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_microeconomics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 238, | |
"effective_num_docs": 238, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_physics": { | |
"name": "mmlu:high_school_physics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_physics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 151, | |
"effective_num_docs": 151, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_psychology": { | |
"name": "mmlu:high_school_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 545, | |
"effective_num_docs": 545, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_statistics": { | |
"name": "mmlu:high_school_statistics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_statistics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 216, | |
"effective_num_docs": 216, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_us_history": { | |
"name": "mmlu:high_school_us_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_us_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 204, | |
"effective_num_docs": 204, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:high_school_world_history": { | |
"name": "mmlu:high_school_world_history", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "high_school_world_history", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 237, | |
"effective_num_docs": 237, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:human_aging": { | |
"name": "mmlu:human_aging", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_aging", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 223, | |
"effective_num_docs": 223, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:human_sexuality": { | |
"name": "mmlu:human_sexuality", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "human_sexuality", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 131, | |
"effective_num_docs": 131, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:international_law": { | |
"name": "mmlu:international_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "international_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 121, | |
"effective_num_docs": 121, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:jurisprudence": { | |
"name": "mmlu:jurisprudence", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "jurisprudence", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 108, | |
"effective_num_docs": 108, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:logical_fallacies": { | |
"name": "mmlu:logical_fallacies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "logical_fallacies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 163, | |
"effective_num_docs": 163, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:machine_learning": { | |
"name": "mmlu:machine_learning", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "machine_learning", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 112, | |
"effective_num_docs": 112, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:management": { | |
"name": "mmlu:management", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "management", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 103, | |
"effective_num_docs": 103, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:marketing": { | |
"name": "mmlu:marketing", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "marketing", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 234, | |
"effective_num_docs": 234, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:medical_genetics": { | |
"name": "mmlu:medical_genetics", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "medical_genetics", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:miscellaneous": { | |
"name": "mmlu:miscellaneous", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "miscellaneous", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 783, | |
"effective_num_docs": 783, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:moral_disputes": { | |
"name": "mmlu:moral_disputes", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_disputes", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 346, | |
"effective_num_docs": 346, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:moral_scenarios": { | |
"name": "mmlu:moral_scenarios", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "moral_scenarios", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 895, | |
"effective_num_docs": 895, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:nutrition": { | |
"name": "mmlu:nutrition", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "nutrition", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 306, | |
"effective_num_docs": 306, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:philosophy": { | |
"name": "mmlu:philosophy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "philosophy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 311, | |
"effective_num_docs": 311, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:prehistory": { | |
"name": "mmlu:prehistory", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "prehistory", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 324, | |
"effective_num_docs": 324, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:professional_accounting": { | |
"name": "mmlu:professional_accounting", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_accounting", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 282, | |
"effective_num_docs": 282, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:professional_law": { | |
"name": "mmlu:professional_law", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_law", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 1534, | |
"effective_num_docs": 1534, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:professional_medicine": { | |
"name": "mmlu:professional_medicine", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_medicine", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 272, | |
"effective_num_docs": 272, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:professional_psychology": { | |
"name": "mmlu:professional_psychology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "professional_psychology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 612, | |
"effective_num_docs": 612, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:public_relations": { | |
"name": "mmlu:public_relations", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "public_relations", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 110, | |
"effective_num_docs": 110, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:security_studies": { | |
"name": "mmlu:security_studies", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "security_studies", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 245, | |
"effective_num_docs": 245, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:sociology": { | |
"name": "mmlu:sociology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "sociology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 201, | |
"effective_num_docs": 201, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:us_foreign_policy": { | |
"name": "mmlu:us_foreign_policy", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "us_foreign_policy", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 100, | |
"effective_num_docs": 100, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:virology": { | |
"name": "mmlu:virology", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "virology", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 166, | |
"effective_num_docs": 166, | |
"trust_dataset": true | |
}, | |
"leaderboard|mmlu:world_religions": { | |
"name": "mmlu:world_religions", | |
"prompt_function": "mmlu_harness", | |
"hf_repo": "lighteval/mmlu", | |
"hf_subset": "world_religions", | |
"metric": [ | |
"loglikelihood_acc" | |
], | |
"hf_avail_splits": [ | |
"auxiliary_train", | |
"test", | |
"validation", | |
"dev" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": "dev", | |
"few_shots_select": "sequential", | |
"generation_size": 1, | |
"stop_sequence": [ | |
"\n" | |
], | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"leaderboard", | |
"mmlu" | |
], | |
"original_num_docs": 171, | |
"effective_num_docs": 171, | |
"trust_dataset": true | |
} | |
}, | |
"summary_tasks": { | |
"leaderboard|mmlu:abstract_algebra|5": { | |
"hashes": { | |
"hash_examples": "4c76229e00c9c0e9", | |
"hash_full_prompts": "a45d01c3409c889c", | |
"hash_input_tokens": "d0571b6ffb835507", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:anatomy|5": { | |
"hashes": { | |
"hash_examples": "6a1f8104dccbd33b", | |
"hash_full_prompts": "e245c6600e03cc32", | |
"hash_input_tokens": "8dd20ec55e9ad889", | |
"hash_cont_tokens": "263324e6ce7f9b36" | |
}, | |
"truncated": 0, | |
"non_truncated": 135, | |
"padded": 540, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:astronomy|5": { | |
"hashes": { | |
"hash_examples": "1302effa3a76ce4c", | |
"hash_full_prompts": "390f9bddf857ad04", | |
"hash_input_tokens": "81e8167c0c820f24", | |
"hash_cont_tokens": "18ba399c6801138e" | |
}, | |
"truncated": 0, | |
"non_truncated": 152, | |
"padded": 608, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:business_ethics|5": { | |
"hashes": { | |
"hash_examples": "03cb8bce5336419a", | |
"hash_full_prompts": "5504f893bc4f2fa1", | |
"hash_input_tokens": "668443aa86633b73", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:clinical_knowledge|5": { | |
"hashes": { | |
"hash_examples": "ffbb9c7b2be257f9", | |
"hash_full_prompts": "106ad0bab4b90b78", | |
"hash_input_tokens": "726c176b444e3c55", | |
"hash_cont_tokens": "9d7500060e0dd995" | |
}, | |
"truncated": 0, | |
"non_truncated": 265, | |
"padded": 1060, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_biology|5": { | |
"hashes": { | |
"hash_examples": "3ee77f176f38eb8e", | |
"hash_full_prompts": "59f9bdf2695cb226", | |
"hash_input_tokens": "7535ef44daca8b2e", | |
"hash_cont_tokens": "78a731af5d2f6472" | |
}, | |
"truncated": 0, | |
"non_truncated": 144, | |
"padded": 576, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_chemistry|5": { | |
"hashes": { | |
"hash_examples": "ce61a69c46d47aeb", | |
"hash_full_prompts": "3cac9b759fcff7a0", | |
"hash_input_tokens": "e98bdaf1fa27ef3b", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_computer_science|5": { | |
"hashes": { | |
"hash_examples": "32805b52d7d5daab", | |
"hash_full_prompts": "010b0cca35070130", | |
"hash_input_tokens": "40494a193cf906d1", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_mathematics|5": { | |
"hashes": { | |
"hash_examples": "55da1a0a0bd33722", | |
"hash_full_prompts": "511422eb9eefc773", | |
"hash_input_tokens": "2f512892d24b0086", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_medicine|5": { | |
"hashes": { | |
"hash_examples": "c33e143163049176", | |
"hash_full_prompts": "c8cc1a82a51a046e", | |
"hash_input_tokens": "41ba4385551feaf3", | |
"hash_cont_tokens": "699c8eb24e3e446b" | |
}, | |
"truncated": 0, | |
"non_truncated": 173, | |
"padded": 692, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:college_physics|5": { | |
"hashes": { | |
"hash_examples": "ebdab1cdb7e555df", | |
"hash_full_prompts": "e40721b5059c5818", | |
"hash_input_tokens": "1f357d859f4e78c2", | |
"hash_cont_tokens": "075997110cbe055e" | |
}, | |
"truncated": 0, | |
"non_truncated": 102, | |
"padded": 408, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:computer_security|5": { | |
"hashes": { | |
"hash_examples": "a24fd7d08a560921", | |
"hash_full_prompts": "946c9be5964ac44a", | |
"hash_input_tokens": "def9fb5a2fab003a", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:conceptual_physics|5": { | |
"hashes": { | |
"hash_examples": "8300977a79386993", | |
"hash_full_prompts": "506a4f6094cc40c9", | |
"hash_input_tokens": "b398cceaff8512f7", | |
"hash_cont_tokens": "f22daa6d4818086f" | |
}, | |
"truncated": 0, | |
"non_truncated": 235, | |
"padded": 940, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:econometrics|5": { | |
"hashes": { | |
"hash_examples": "ddde36788a04a46f", | |
"hash_full_prompts": "4ed2703f27f1ed05", | |
"hash_input_tokens": "cf227ca8af4bc815", | |
"hash_cont_tokens": "26791a0b1941b4c4" | |
}, | |
"truncated": 0, | |
"non_truncated": 114, | |
"padded": 456, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:electrical_engineering|5": { | |
"hashes": { | |
"hash_examples": "acbc5def98c19b3f", | |
"hash_full_prompts": "d8f4b3e11c23653c", | |
"hash_input_tokens": "295e278cbce7ed04", | |
"hash_cont_tokens": "3e336577994f6c0d" | |
}, | |
"truncated": 0, | |
"non_truncated": 145, | |
"padded": 580, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:elementary_mathematics|5": { | |
"hashes": { | |
"hash_examples": "146e61d07497a9bd", | |
"hash_full_prompts": "256d111bd15647ff", | |
"hash_input_tokens": "2474a420d7b931ff", | |
"hash_cont_tokens": "1d6bbfa8a67327c8" | |
}, | |
"truncated": 0, | |
"non_truncated": 378, | |
"padded": 1512, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:formal_logic|5": { | |
"hashes": { | |
"hash_examples": "8635216e1909a03f", | |
"hash_full_prompts": "1171d04f3b1a11f5", | |
"hash_input_tokens": "f269941d7dabea05", | |
"hash_cont_tokens": "60508d85eb7693a4" | |
}, | |
"truncated": 0, | |
"non_truncated": 126, | |
"padded": 504, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:global_facts|5": { | |
"hashes": { | |
"hash_examples": "30b315aa6353ee47", | |
"hash_full_prompts": "a7e56dbc074c7529", | |
"hash_input_tokens": "2036a912407797e6", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_biology|5": { | |
"hashes": { | |
"hash_examples": "c9136373af2180de", | |
"hash_full_prompts": "ad6e859ed978e04a", | |
"hash_input_tokens": "1bc8ad087ca8f65b", | |
"hash_cont_tokens": "d236ce982144e65f" | |
}, | |
"truncated": 0, | |
"non_truncated": 310, | |
"padded": 1240, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_chemistry|5": { | |
"hashes": { | |
"hash_examples": "b0661bfa1add6404", | |
"hash_full_prompts": "6eb9c04bcc8a8f2a", | |
"hash_input_tokens": "ead708921e3a1c93", | |
"hash_cont_tokens": "59f93238ec5aead6" | |
}, | |
"truncated": 0, | |
"non_truncated": 203, | |
"padded": 812, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_computer_science|5": { | |
"hashes": { | |
"hash_examples": "80fc1d623a3d665f", | |
"hash_full_prompts": "8e51bc91c81cf8dd", | |
"hash_input_tokens": "604f88a2f17d5159", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_european_history|5": { | |
"hashes": { | |
"hash_examples": "854da6e5af0fe1a1", | |
"hash_full_prompts": "664a1f16c9f3195c", | |
"hash_input_tokens": "1dfe455312f2e6cf", | |
"hash_cont_tokens": "7b7414d6a5da3d91" | |
}, | |
"truncated": 0, | |
"non_truncated": 165, | |
"padded": 656, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_geography|5": { | |
"hashes": { | |
"hash_examples": "7dc963c7acd19ad8", | |
"hash_full_prompts": "f3acf911f4023c8a", | |
"hash_input_tokens": "1985ba6f69f57d66", | |
"hash_cont_tokens": "1b66289e10988f84" | |
}, | |
"truncated": 0, | |
"non_truncated": 198, | |
"padded": 792, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_government_and_politics|5": { | |
"hashes": { | |
"hash_examples": "1f675dcdebc9758f", | |
"hash_full_prompts": "066254feaa3158ae", | |
"hash_input_tokens": "e6960d7d906ffb15", | |
"hash_cont_tokens": "5ab3c3415b1d3a55" | |
}, | |
"truncated": 0, | |
"non_truncated": 193, | |
"padded": 772, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_macroeconomics|5": { | |
"hashes": { | |
"hash_examples": "2fb32cf2d80f0b35", | |
"hash_full_prompts": "19a7fa502aa85c95", | |
"hash_input_tokens": "4ea59b7b8c4856d2", | |
"hash_cont_tokens": "2f5457058d187374" | |
}, | |
"truncated": 0, | |
"non_truncated": 390, | |
"padded": 1557, | |
"non_padded": 3, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_mathematics|5": { | |
"hashes": { | |
"hash_examples": "fd6646fdb5d58a1f", | |
"hash_full_prompts": "4f704e369778b5b0", | |
"hash_input_tokens": "7d39279726411bb3", | |
"hash_cont_tokens": "e35137cb972e1918" | |
}, | |
"truncated": 0, | |
"non_truncated": 270, | |
"padded": 1080, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_microeconomics|5": { | |
"hashes": { | |
"hash_examples": "2118f21f71d87d84", | |
"hash_full_prompts": "4350f9e2240f8010", | |
"hash_input_tokens": "2be919ac2e73f3d1", | |
"hash_cont_tokens": "f756093278ebb83e" | |
}, | |
"truncated": 0, | |
"non_truncated": 238, | |
"padded": 908, | |
"non_padded": 44, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_physics|5": { | |
"hashes": { | |
"hash_examples": "dc3ce06378548565", | |
"hash_full_prompts": "5dc0d6831b66188f", | |
"hash_input_tokens": "9b2e07d3183ade24", | |
"hash_cont_tokens": "9cf883ebf1c82176" | |
}, | |
"truncated": 0, | |
"non_truncated": 151, | |
"padded": 604, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_psychology|5": { | |
"hashes": { | |
"hash_examples": "c8d1d98a40e11f2f", | |
"hash_full_prompts": "af2b097da6d50365", | |
"hash_input_tokens": "a0f7b561c0177eb7", | |
"hash_cont_tokens": "bda0f77331ebb21a" | |
}, | |
"truncated": 0, | |
"non_truncated": 545, | |
"padded": 2178, | |
"non_padded": 2, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_statistics|5": { | |
"hashes": { | |
"hash_examples": "666c8759b98ee4ff", | |
"hash_full_prompts": "c757694421d6d68d", | |
"hash_input_tokens": "0e353fc06f61e59b", | |
"hash_cont_tokens": "4d04f014105a0bad" | |
}, | |
"truncated": 0, | |
"non_truncated": 216, | |
"padded": 864, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_us_history|5": { | |
"hashes": { | |
"hash_examples": "95fef1c4b7d3f81e", | |
"hash_full_prompts": "e34a028d0ddeec5e", | |
"hash_input_tokens": "7c7f37778e6ccda2", | |
"hash_cont_tokens": "f4590c58f12f2766" | |
}, | |
"truncated": 0, | |
"non_truncated": 204, | |
"padded": 816, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:high_school_world_history|5": { | |
"hashes": { | |
"hash_examples": "7e5085b6184b0322", | |
"hash_full_prompts": "1fa3d51392765601", | |
"hash_input_tokens": "71993d416140265b", | |
"hash_cont_tokens": "db6bcddd891df5d9" | |
}, | |
"truncated": 0, | |
"non_truncated": 237, | |
"padded": 948, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:human_aging|5": { | |
"hashes": { | |
"hash_examples": "c17333e7c7c10797", | |
"hash_full_prompts": "cac900721f9a1a94", | |
"hash_input_tokens": "b0fa52119d4303e9", | |
"hash_cont_tokens": "25cec8d640319105" | |
}, | |
"truncated": 0, | |
"non_truncated": 223, | |
"padded": 892, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:human_sexuality|5": { | |
"hashes": { | |
"hash_examples": "4edd1e9045df5e3d", | |
"hash_full_prompts": "0d6567bafee0a13c", | |
"hash_input_tokens": "879018ae27bdf5b0", | |
"hash_cont_tokens": "6778302b4a10b645" | |
}, | |
"truncated": 0, | |
"non_truncated": 131, | |
"padded": 524, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:international_law|5": { | |
"hashes": { | |
"hash_examples": "db2fa00d771a062a", | |
"hash_full_prompts": "d018f9116479795e", | |
"hash_input_tokens": "be4409fc3ab936f3", | |
"hash_cont_tokens": "9eb54e1a46032749" | |
}, | |
"truncated": 0, | |
"non_truncated": 121, | |
"padded": 484, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:jurisprudence|5": { | |
"hashes": { | |
"hash_examples": "e956f86b124076fe", | |
"hash_full_prompts": "1487e89a10ec58b7", | |
"hash_input_tokens": "888c2eab4655e553", | |
"hash_cont_tokens": "f17d9a372cfd66b1" | |
}, | |
"truncated": 0, | |
"non_truncated": 108, | |
"padded": 420, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:logical_fallacies|5": { | |
"hashes": { | |
"hash_examples": "956e0e6365ab79f1", | |
"hash_full_prompts": "677785b2181f9243", | |
"hash_input_tokens": "8cee26c610ab13a1", | |
"hash_cont_tokens": "cf44a68f5bca9a96" | |
}, | |
"truncated": 0, | |
"non_truncated": 163, | |
"padded": 648, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:machine_learning|5": { | |
"hashes": { | |
"hash_examples": "397997cc6f4d581e", | |
"hash_full_prompts": "769ee14a2aea49bb", | |
"hash_input_tokens": "1d8a213f41f96aee", | |
"hash_cont_tokens": "eace00d420f4f32c" | |
}, | |
"truncated": 0, | |
"non_truncated": 112, | |
"padded": 448, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:management|5": { | |
"hashes": { | |
"hash_examples": "2bcbe6f6ca63d740", | |
"hash_full_prompts": "cb1ff9dac9582144", | |
"hash_input_tokens": "44ba435973dce9d1", | |
"hash_cont_tokens": "b7c51d0250c252d8" | |
}, | |
"truncated": 0, | |
"non_truncated": 103, | |
"padded": 412, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:marketing|5": { | |
"hashes": { | |
"hash_examples": "8ddb20d964a1b065", | |
"hash_full_prompts": "9fc2114a187ad9a2", | |
"hash_input_tokens": "e86c7f7e4f27bcb7", | |
"hash_cont_tokens": "086fb63f8b1d1339" | |
}, | |
"truncated": 0, | |
"non_truncated": 234, | |
"padded": 924, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:medical_genetics|5": { | |
"hashes": { | |
"hash_examples": "182a71f4763d2cea", | |
"hash_full_prompts": "46a616fa51878959", | |
"hash_input_tokens": "84615035f844ffa0", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:miscellaneous|5": { | |
"hashes": { | |
"hash_examples": "4c404fdbb4ca57fc", | |
"hash_full_prompts": "0813e1be36dbaae1", | |
"hash_input_tokens": "f816152d0e727938", | |
"hash_cont_tokens": "1827274fa6537077" | |
}, | |
"truncated": 0, | |
"non_truncated": 783, | |
"padded": 3132, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:moral_disputes|5": { | |
"hashes": { | |
"hash_examples": "60cbd2baa3fea5c9", | |
"hash_full_prompts": "1d14adebb9b62519", | |
"hash_input_tokens": "53082748f1b5e440", | |
"hash_cont_tokens": "472c223f6f28cfc7" | |
}, | |
"truncated": 0, | |
"non_truncated": 346, | |
"padded": 1384, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:moral_scenarios|5": { | |
"hashes": { | |
"hash_examples": "fd8b0431fbdd75ef", | |
"hash_full_prompts": "b80d3d236165e3de", | |
"hash_input_tokens": "b5318303d9c36325", | |
"hash_cont_tokens": "e90dade00a092f9e" | |
}, | |
"truncated": 0, | |
"non_truncated": 895, | |
"padded": 3567, | |
"non_padded": 13, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:nutrition|5": { | |
"hashes": { | |
"hash_examples": "71e55e2b829b6528", | |
"hash_full_prompts": "2bfb18e5fab8dea7", | |
"hash_input_tokens": "2ed8503c57d6afbf", | |
"hash_cont_tokens": "128e0ec97d96b165" | |
}, | |
"truncated": 0, | |
"non_truncated": 306, | |
"padded": 1224, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:philosophy|5": { | |
"hashes": { | |
"hash_examples": "a6d489a8d208fa4b", | |
"hash_full_prompts": "e8c0d5b6dae3ccc8", | |
"hash_input_tokens": "7e8ad59a08a00f3b", | |
"hash_cont_tokens": "cbfd7829a3e0f082" | |
}, | |
"truncated": 0, | |
"non_truncated": 311, | |
"padded": 1244, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:prehistory|5": { | |
"hashes": { | |
"hash_examples": "6cc50f032a19acaa", | |
"hash_full_prompts": "4a6a1d3ab1bf28e4", | |
"hash_input_tokens": "8bba5be57a92c467", | |
"hash_cont_tokens": "9c0cf5a2f71afa7e" | |
}, | |
"truncated": 0, | |
"non_truncated": 324, | |
"padded": 1284, | |
"non_padded": 12, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_accounting|5": { | |
"hashes": { | |
"hash_examples": "50f57ab32f5f6cea", | |
"hash_full_prompts": "e60129bd2d82ffc6", | |
"hash_input_tokens": "236927cb4e27f724", | |
"hash_cont_tokens": "50f011c2453517ee" | |
}, | |
"truncated": 0, | |
"non_truncated": 282, | |
"padded": 1128, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_law|5": { | |
"hashes": { | |
"hash_examples": "a8fdc85c64f4b215", | |
"hash_full_prompts": "0dbb1d9b72dcea03", | |
"hash_input_tokens": "7958ac5eb01fed27", | |
"hash_cont_tokens": "73527e852c24186c" | |
}, | |
"truncated": 0, | |
"non_truncated": 1534, | |
"padded": 6136, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_medicine|5": { | |
"hashes": { | |
"hash_examples": "c373a28a3050a73a", | |
"hash_full_prompts": "5e040f9ca68b089e", | |
"hash_input_tokens": "f520600f7896a87b", | |
"hash_cont_tokens": "ceb7af5e2e789abc" | |
}, | |
"truncated": 0, | |
"non_truncated": 272, | |
"padded": 1088, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:professional_psychology|5": { | |
"hashes": { | |
"hash_examples": "bf5254fe818356af", | |
"hash_full_prompts": "b386ecda8b87150e", | |
"hash_input_tokens": "fb3f225a047d0f0f", | |
"hash_cont_tokens": "8cfdced8a9667380" | |
}, | |
"truncated": 0, | |
"non_truncated": 612, | |
"padded": 2428, | |
"non_padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:public_relations|5": { | |
"hashes": { | |
"hash_examples": "b66d52e28e7d14e0", | |
"hash_full_prompts": "fe43562263e25677", | |
"hash_input_tokens": "9dfb929ef5e3362b", | |
"hash_cont_tokens": "f8327461a9cc5123" | |
}, | |
"truncated": 0, | |
"non_truncated": 110, | |
"padded": 436, | |
"non_padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:security_studies|5": { | |
"hashes": { | |
"hash_examples": "514c14feaf000ad9", | |
"hash_full_prompts": "27d4a2ac541ef4b9", | |
"hash_input_tokens": "f620744b07919b24", | |
"hash_cont_tokens": "c30b0c4d52c2875d" | |
}, | |
"truncated": 0, | |
"non_truncated": 245, | |
"padded": 980, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:sociology|5": { | |
"hashes": { | |
"hash_examples": "f6c9bc9d18c80870", | |
"hash_full_prompts": "c072ea7d1a1524f2", | |
"hash_input_tokens": "76d03f98f30dbe11", | |
"hash_cont_tokens": "eef4bd16d536fbd6" | |
}, | |
"truncated": 0, | |
"non_truncated": 201, | |
"padded": 804, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:us_foreign_policy|5": { | |
"hashes": { | |
"hash_examples": "ed7b78629db6678f", | |
"hash_full_prompts": "341a97ca3e4d699d", | |
"hash_input_tokens": "f0b4b93f91f3d7f4", | |
"hash_cont_tokens": "00520b0ec06da34f" | |
}, | |
"truncated": 0, | |
"non_truncated": 100, | |
"padded": 400, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:virology|5": { | |
"hashes": { | |
"hash_examples": "bc52ffdc3f9b994a", | |
"hash_full_prompts": "651d471e2eb8b5e9", | |
"hash_input_tokens": "1c7d23a204c7cbf6", | |
"hash_cont_tokens": "f5fc195e049353c0" | |
}, | |
"truncated": 0, | |
"non_truncated": 166, | |
"padded": 664, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"leaderboard|mmlu:world_religions|5": { | |
"hashes": { | |
"hash_examples": "ecdb4a4f94f62930", | |
"hash_full_prompts": "3773f03542ce44a3", | |
"hash_input_tokens": "be42fd2c9cc2da08", | |
"hash_cont_tokens": "ada548665e87b1e0" | |
}, | |
"truncated": 0, | |
"non_truncated": 171, | |
"padded": 684, | |
"non_padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "341a076d0beb7048", | |
"hash_full_prompts": "a5c8f2b7ff4f5ae2", | |
"hash_input_tokens": "917c40aba1546e12", | |
"hash_cont_tokens": "3672212ca582e2d0" | |
}, | |
"truncated": 0, | |
"non_truncated": 14042, | |
"padded": 56038, | |
"non_padded": 130, | |
"num_truncated_few_shots": 0 | |
} | |
} |