hendrycksTest-moral_disputes | acc | 0.28612716763 |
---|
acc_stderr | 0.0243321467791 |
---|
acc_norm | 0.28612716763 |
---|
acc_norm_stderr | 0.0243321467791 |
---|
|
---|
hendrycksTest-nutrition | acc | 0.287581699346 |
---|
acc_stderr | 0.0259178061171 |
---|
acc_norm | 0.287581699346 |
---|
acc_norm_stderr | 0.0259178061171 |
---|
|
---|
hendrycksTest-prehistory | acc | 0.311728395062 |
---|
acc_stderr | 0.0257731111696 |
---|
acc_norm | 0.311728395062 |
---|
acc_norm_stderr | 0.0257731111696 |
---|
|
---|
hendrycksTest-business_ethics | acc | 0.3 |
---|
acc_stderr | 0.0460566186472 |
---|
acc_norm | 0.3 |
---|
acc_norm_stderr | 0.0460566186472 |
---|
|
---|
hendrycksTest-professional_medicine | acc | 0.180147058824 |
---|
acc_stderr | 0.0233451636165 |
---|
acc_norm | 0.180147058824 |
---|
acc_norm_stderr | 0.0233451636165 |
---|
|
---|
hendrycksTest-international_law | acc | 0.388429752066 |
---|
acc_stderr | 0.0444927035007 |
---|
acc_norm | 0.388429752066 |
---|
acc_norm_stderr | 0.0444927035007 |
---|
|
---|
hendrycksTest-high_school_world_history | acc | 0.286919831224 |
---|
acc_stderr | 0.0294437730226 |
---|
acc_norm | 0.286919831224 |
---|
acc_norm_stderr | 0.0294437730226 |
---|
|
---|
hendrycksTest-management | acc | 0.330097087379 |
---|
acc_stderr | 0.0465614711001 |
---|
acc_norm | 0.330097087379 |
---|
acc_norm_stderr | 0.0465614711001 |
---|
|
---|
hendrycksTest-college_biology | acc | 0.256944444444 |
---|
acc_stderr | 0.0365394696944 |
---|
acc_norm | 0.256944444444 |
---|
acc_norm_stderr | 0.0365394696944 |
---|
|
---|
hendrycksTest-high_school_statistics | acc | 0.25462962963 |
---|
acc_stderr | 0.02971127586 |
---|
acc_norm | 0.25462962963 |
---|
acc_norm_stderr | 0.02971127586 |
---|
|
---|
hendrycksTest-college_medicine | acc | 0.231213872832 |
---|
acc_stderr | 0.0321473730203 |
---|
acc_norm | 0.231213872832 |
---|
acc_norm_stderr | 0.0321473730203 |
---|
|
---|
hendrycksTest-conceptual_physics | acc | 0.276595744681 |
---|
acc_stderr | 0.0292418838696 |
---|
acc_norm | 0.276595744681 |
---|
acc_norm_stderr | 0.0292418838696 |
---|
|
---|
hendrycksTest-electrical_engineering | acc | 0.206896551724 |
---|
acc_stderr | 0.0337567244956 |
---|
acc_norm | 0.206896551724 |
---|
acc_norm_stderr | 0.0337567244956 |
---|
|
---|
hendrycksTest-high_school_computer_science | acc | 0.33 |
---|
acc_stderr | 0.0472581562625 |
---|
acc_norm | 0.33 |
---|
acc_norm_stderr | 0.0472581562625 |
---|
|
---|
hendrycksTest-us_foreign_policy | acc | 0.32 |
---|
acc_stderr | 0.0468826172262 |
---|
acc_norm | 0.32 |
---|
acc_norm_stderr | 0.0468826172262 |
---|
|
---|
hendrycksTest-high_school_chemistry | acc | 0.28078817734 |
---|
acc_stderr | 0.0316185633536 |
---|
acc_norm | 0.28078817734 |
---|
acc_norm_stderr | 0.0316185633536 |
---|
|
---|
hendrycksTest-high_school_government_and_politics | acc | 0.253886010363 |
---|
acc_stderr | 0.0314102478057 |
---|
acc_norm | 0.253886010363 |
---|
acc_norm_stderr | 0.0314102478057 |
---|
|
---|
hendrycksTest-high_school_macroeconomics | acc | 0.274358974359 |
---|
acc_stderr | 0.0226227657675 |
---|
acc_norm | 0.274358974359 |
---|
acc_norm_stderr | 0.0226227657675 |
---|
|
---|
hendrycksTest-college_chemistry | acc | 0.22 |
---|
acc_stderr | 0.0416333199893 |
---|
acc_norm | 0.22 |
---|
acc_norm_stderr | 0.0416333199893 |
---|
|
---|
hendrycksTest-miscellaneous | acc | 0.315453384419 |
---|
acc_stderr | 0.0166175017388 |
---|
acc_norm | 0.315453384419 |
---|
acc_norm_stderr | 0.0166175017388 |
---|
|
---|
hendrycksTest-high_school_psychology | acc | 0.28623853211 |
---|
acc_stderr | 0.0193794366289 |
---|
acc_norm | 0.28623853211 |
---|
acc_norm_stderr | 0.0193794366289 |
---|
|
---|
hendrycksTest-machine_learning | acc | 0.214285714286 |
---|
acc_stderr | 0.0389464112004 |
---|
acc_norm | 0.214285714286 |
---|
acc_norm_stderr | 0.0389464112004 |
---|
|
---|
truthfulqa_mc | mc1_stderr | 0.0150006743736 |
---|
mc2_stderr | 0.0138986280365 |
---|
mc2 | 0.38597579296 |
---|
mc1 | 0.2423500612 |
---|
|
---|
hendrycksTest-marketing | acc | 0.290598290598 |
---|
acc_stderr | 0.0297450485727 |
---|
acc_norm | 0.290598290598 |
---|
acc_norm_stderr | 0.0297450485727 |
---|
|
---|
hendrycksTest-world_religions | acc | 0.327485380117 |
---|
acc_stderr | 0.0359933577146 |
---|
acc_norm | 0.327485380117 |
---|
acc_norm_stderr | 0.0359933577146 |
---|
|
---|
hendrycksTest-logical_fallacies | acc | 0.233128834356 |
---|
acc_stderr | 0.0332201579578 |
---|
acc_norm | 0.233128834356 |
---|
acc_norm_stderr | 0.0332201579578 |
---|
|
---|
hendrycksTest-security_studies | acc | 0.326530612245 |
---|
acc_stderr | 0.0300210562384 |
---|
acc_norm | 0.326530612245 |
---|
acc_norm_stderr | 0.0300210562384 |
---|
|
---|
hendrycksTest-college_mathematics | acc | 0.31 |
---|
acc_stderr | 0.0464823198712 |
---|
acc_norm | 0.31 |
---|
acc_norm_stderr | 0.0464823198712 |
---|
|
---|
hendrycksTest-human_aging | acc | 0.381165919283 |
---|
acc_stderr | 0.0325962511842 |
---|
acc_norm | 0.381165919283 |
---|
acc_norm_stderr | 0.0325962511842 |
---|
|
---|
hendrycksTest-formal_logic | acc | 0.261904761905 |
---|
acc_stderr | 0.0393253768039 |
---|
acc_norm | 0.261904761905 |
---|
acc_norm_stderr | 0.0393253768039 |
---|
|
---|
hendrycksTest-sociology | acc | 0.258706467662 |
---|
acc_stderr | 0.0309659031236 |
---|
acc_norm | 0.258706467662 |
---|
acc_norm_stderr | 0.0309659031236 |
---|
|
---|
hendrycksTest-public_relations | acc | 0.309090909091 |
---|
acc_stderr | 0.044262946482 |
---|
acc_norm | 0.309090909091 |
---|
acc_norm_stderr | 0.044262946482 |
---|
|
---|
hendrycksTest-high_school_physics | acc | 0.304635761589 |
---|
acc_stderr | 0.0375794992294 |
---|
acc_norm | 0.304635761589 |
---|
acc_norm_stderr | 0.0375794992294 |
---|
|
---|
hendrycksTest-human_sexuality | acc | 0.198473282443 |
---|
acc_stderr | 0.0349814938546 |
---|
acc_norm | 0.198473282443 |
---|
acc_norm_stderr | 0.0349814938546 |
---|
|
---|
hendrycksTest-clinical_knowledge | acc | 0.264150943396 |
---|
acc_stderr | 0.0271342916287 |
---|
acc_norm | 0.264150943396 |
---|
acc_norm_stderr | 0.0271342916287 |
---|
|
---|
hendrycksTest-high_school_biology | acc | 0.241935483871 |
---|
acc_stderr | 0.024362599693 |
---|
acc_norm | 0.241935483871 |
---|
acc_norm_stderr | 0.024362599693 |
---|
|
---|
hendrycksTest-high_school_us_history | acc | 0.230392156863 |
---|
acc_stderr | 0.0295542926057 |
---|
acc_norm | 0.230392156863 |
---|
acc_norm_stderr | 0.0295542926057 |
---|
|
---|
hendrycksTest-high_school_mathematics | acc | 0.262962962963 |
---|
acc_stderr | 0.0268420578738 |
---|
acc_norm | 0.262962962963 |
---|
acc_norm_stderr | 0.0268420578738 |
---|
|
---|
hendrycksTest-medical_genetics | acc | 0.29 |
---|
acc_stderr | 0.0456048021572 |
---|
acc_norm | 0.29 |
---|
acc_norm_stderr | 0.0456048021572 |
---|
|
---|
hendrycksTest-moral_scenarios | acc | 0.212290502793 |
---|
acc_stderr | 0.0136766446858 |
---|
acc_norm | 0.212290502793 |
---|
acc_norm_stderr | 0.0136766446858 |
---|
|
---|
hendrycksTest-high_school_microeconomics | acc | 0.264705882353 |
---|
acc_stderr | 0.0286574912851 |
---|
acc_norm | 0.264705882353 |
---|
acc_norm_stderr | 0.0286574912851 |
---|
|
---|
hendrycksTest-philosophy | acc | 0.27652733119 |
---|
acc_stderr | 0.0254038329782 |
---|
acc_norm | 0.27652733119 |
---|
acc_norm_stderr | 0.0254038329782 |
---|
|
---|
hendrycksTest-virology | acc | 0.301204819277 |
---|
acc_stderr | 0.0357160923005 |
---|
acc_norm | 0.301204819277 |
---|
acc_norm_stderr | 0.0357160923005 |
---|
|
---|
hendrycksTest-econometrics | acc | 0.263157894737 |
---|
acc_stderr | 0.0414243971949 |
---|
acc_norm | 0.263157894737 |
---|
acc_norm_stderr | 0.0414243971949 |
---|
|
---|
hendrycksTest-abstract_algebra | acc | 0.23 |
---|
acc_stderr | 0.0422952584682 |
---|
acc_norm | 0.23 |
---|
acc_norm_stderr | 0.0422952584682 |
---|
|
---|
hendrycksTest-college_physics | acc | 0.274509803922 |
---|
acc_stderr | 0.0444052190618 |
---|
acc_norm | 0.274509803922 |
---|
acc_norm_stderr | 0.0444052190618 |
---|
|
---|
hendrycksTest-astronomy | acc | 0.328947368421 |
---|
acc_stderr | 0.0382342896993 |
---|
acc_norm | 0.328947368421 |
---|
acc_norm_stderr | 0.0382342896993 |
---|
|
---|
hendrycksTest-global_facts | acc | 0.35 |
---|
acc_stderr | 0.0479372485441 |
---|
acc_norm | 0.35 |
---|
acc_norm_stderr | 0.0479372485441 |
---|
|
---|
hendrycksTest-computer_security | acc | 0.36 |
---|
acc_stderr | 0.0482418151324 |
---|
acc_norm | 0.36 |
---|
acc_norm_stderr | 0.0482418151324 |
---|
|
---|
hendrycksTest-high_school_geography | acc | 0.287878787879 |
---|
acc_stderr | 0.032258835123 |
---|
acc_norm | 0.287878787879 |
---|
acc_norm_stderr | 0.032258835123 |
---|
|
---|
hendrycksTest-high_school_european_history | acc | 0.29696969697 |
---|
acc_stderr | 0.0356796977227 |
---|
acc_norm | 0.29696969697 |
---|
acc_norm_stderr | 0.0356796977227 |
---|
|
---|
hendrycksTest-professional_psychology | acc | 0.279411764706 |
---|
acc_stderr | 0.0181528710515 |
---|
acc_norm | 0.279411764706 |
---|
acc_norm_stderr | 0.0181528710515 |
---|
|
---|
hellaswag | acc | 0.551384186417 |
---|
acc_stderr | 0.00496336208528 |
---|
acc_norm | 0.725751842262 |
---|
acc_norm_stderr | 0.00445222854104 |
---|
|
---|
hendrycksTest-professional_accounting | acc | 0.265957446809 |
---|
acc_stderr | 0.0263580656989 |
---|
acc_norm | 0.265957446809 |
---|
acc_norm_stderr | 0.0263580656989 |
---|
|
---|
hendrycksTest-college_computer_science | acc | 0.26 |
---|
acc_stderr | 0.0440844002277 |
---|
acc_norm | 0.26 |
---|
acc_norm_stderr | 0.0440844002277 |
---|
|
---|
hendrycksTest-professional_law | acc | 0.256844850065 |
---|
acc_stderr | 0.0111584558531 |
---|
acc_norm | 0.256844850065 |
---|
acc_norm_stderr | 0.0111584558531 |
---|
|
---|
hendrycksTest-jurisprudence | acc | 0.231481481481 |
---|
acc_stderr | 0.0407749470925 |
---|
acc_norm | 0.231481481481 |
---|
acc_norm_stderr | 0.0407749470925 |
---|
|
---|
hendrycksTest-elementary_mathematics | acc | 0.269841269841 |
---|
acc_stderr | 0.0228608383092 |
---|
acc_norm | 0.269841269841 |
---|
acc_norm_stderr | 0.0228608383092 |
---|
|
---|
hendrycksTest-anatomy | acc | 0.296296296296 |
---|
acc_stderr | 0.039446241625 |
---|
acc_norm | 0.296296296296 |
---|
acc_norm_stderr | 0.039446241625 |
---|
|
---|
arc_challenge | acc | 0.389931740614 |
---|
acc_stderr | 0.0142529598489 |
---|
acc_norm | 0.430887372014 |
---|
acc_norm_stderr | 0.0144711333926 |
---|
|
---|