arc_challenge | acc | 0.3660409556313993 |
---|
acc_stderr | 0.014077223108470142 |
---|
acc_norm | 0.4121160409556314 |
---|
acc_norm_stderr | 0.014383915302225403 |
---|
|
---|
hellaswag | acc | 0.5401314479187412 |
---|
acc_stderr | 0.004973683026202171 |
---|
acc_norm | 0.7177853017327226 |
---|
acc_norm_stderr | 0.004491574539441884 |
---|
|
---|
hendrycksTest-abstract_algebra | acc | 0.21 |
---|
acc_stderr | 0.04093601807403326 |
---|
acc_norm | 0.21 |
---|
acc_norm_stderr | 0.04093601807403326 |
---|
|
---|
hendrycksTest-anatomy | acc | 0.25925925925925924 |
---|
acc_stderr | 0.03785714465066653 |
---|
acc_norm | 0.25925925925925924 |
---|
acc_norm_stderr | 0.03785714465066653 |
---|
|
---|
hendrycksTest-astronomy | acc | 0.28289473684210525 |
---|
acc_stderr | 0.03665349695640767 |
---|
acc_norm | 0.28289473684210525 |
---|
acc_norm_stderr | 0.03665349695640767 |
---|
|
---|
hendrycksTest-business_ethics | acc | 0.34 |
---|
acc_stderr | 0.04760952285695236 |
---|
acc_norm | 0.34 |
---|
acc_norm_stderr | 0.04760952285695236 |
---|
|
---|
hendrycksTest-clinical_knowledge | acc | 0.2528301886792453 |
---|
acc_stderr | 0.02674989977124123 |
---|
acc_norm | 0.2528301886792453 |
---|
acc_norm_stderr | 0.02674989977124123 |
---|
|
---|
hendrycksTest-college_biology | acc | 0.24305555555555555 |
---|
acc_stderr | 0.03586879280080339 |
---|
acc_norm | 0.24305555555555555 |
---|
acc_norm_stderr | 0.03586879280080339 |
---|
|
---|
hendrycksTest-college_chemistry | acc | 0.19 |
---|
acc_stderr | 0.03942772444036623 |
---|
acc_norm | 0.19 |
---|
acc_norm_stderr | 0.03942772444036623 |
---|
|
---|
hendrycksTest-college_computer_science | acc | 0.21 |
---|
acc_stderr | 0.040936018074033256 |
---|
acc_norm | 0.21 |
---|
acc_norm_stderr | 0.040936018074033256 |
---|
|
---|
hendrycksTest-college_mathematics | acc | 0.36 |
---|
acc_stderr | 0.04824181513244218 |
---|
acc_norm | 0.36 |
---|
acc_norm_stderr | 0.04824181513244218 |
---|
|
---|
hendrycksTest-college_medicine | acc | 0.20809248554913296 |
---|
acc_stderr | 0.030952890217749877 |
---|
acc_norm | 0.20809248554913296 |
---|
acc_norm_stderr | 0.030952890217749877 |
---|
|
---|
hendrycksTest-college_physics | acc | 0.2647058823529412 |
---|
acc_stderr | 0.04389869956808778 |
---|
acc_norm | 0.2647058823529412 |
---|
acc_norm_stderr | 0.04389869956808778 |
---|
|
---|
hendrycksTest-computer_security | acc | 0.33 |
---|
acc_stderr | 0.047258156262526045 |
---|
acc_norm | 0.33 |
---|
acc_norm_stderr | 0.047258156262526045 |
---|
|
---|
hendrycksTest-conceptual_physics | acc | 0.33617021276595743 |
---|
acc_stderr | 0.030881618520676942 |
---|
acc_norm | 0.33617021276595743 |
---|
acc_norm_stderr | 0.030881618520676942 |
---|
|
---|
hendrycksTest-econometrics | acc | 0.24561403508771928 |
---|
acc_stderr | 0.04049339297748144 |
---|
acc_norm | 0.24561403508771928 |
---|
acc_norm_stderr | 0.04049339297748144 |
---|
|
---|
hendrycksTest-electrical_engineering | acc | 0.2689655172413793 |
---|
acc_stderr | 0.03695183311650232 |
---|
acc_norm | 0.2689655172413793 |
---|
acc_norm_stderr | 0.03695183311650232 |
---|
|
---|
hendrycksTest-elementary_mathematics | acc | 0.2724867724867725 |
---|
acc_stderr | 0.022930973071633366 |
---|
acc_norm | 0.2724867724867725 |
---|
acc_norm_stderr | 0.022930973071633366 |
---|
|
---|
hendrycksTest-formal_logic | acc | 0.21428571428571427 |
---|
acc_stderr | 0.03670066451047181 |
---|
acc_norm | 0.21428571428571427 |
---|
acc_norm_stderr | 0.03670066451047181 |
---|
|
---|
hendrycksTest-global_facts | acc | 0.32 |
---|
acc_stderr | 0.046882617226215034 |
---|
acc_norm | 0.32 |
---|
acc_norm_stderr | 0.046882617226215034 |
---|
|
---|
hendrycksTest-high_school_biology | acc | 0.24193548387096775 |
---|
acc_stderr | 0.024362599693031093 |
---|
acc_norm | 0.24193548387096775 |
---|
acc_norm_stderr | 0.024362599693031093 |
---|
|
---|
hendrycksTest-high_school_chemistry | acc | 0.26108374384236455 |
---|
acc_stderr | 0.030903796952114485 |
---|
acc_norm | 0.26108374384236455 |
---|
acc_norm_stderr | 0.030903796952114485 |
---|
|
---|
hendrycksTest-high_school_computer_science | acc | 0.27 |
---|
acc_stderr | 0.04461960433384739 |
---|
acc_norm | 0.27 |
---|
acc_norm_stderr | 0.04461960433384739 |
---|
|
---|
hendrycksTest-high_school_european_history | acc | 0.30303030303030304 |
---|
acc_stderr | 0.035886248000917075 |
---|
acc_norm | 0.30303030303030304 |
---|
acc_norm_stderr | 0.035886248000917075 |
---|
|
---|
hendrycksTest-high_school_geography | acc | 0.24242424242424243 |
---|
acc_stderr | 0.030532892233932026 |
---|
acc_norm | 0.24242424242424243 |
---|
acc_norm_stderr | 0.030532892233932026 |
---|
|
---|
hendrycksTest-high_school_government_and_politics | acc | 0.24352331606217617 |
---|
acc_stderr | 0.030975436386845426 |
---|
acc_norm | 0.24352331606217617 |
---|
acc_norm_stderr | 0.030975436386845426 |
---|
|
---|
hendrycksTest-high_school_macroeconomics | acc | 0.2564102564102564 |
---|
acc_stderr | 0.022139081103971545 |
---|
acc_norm | 0.2564102564102564 |
---|
acc_norm_stderr | 0.022139081103971545 |
---|
|
---|
hendrycksTest-high_school_mathematics | acc | 0.2518518518518518 |
---|
acc_stderr | 0.026466117538959905 |
---|
acc_norm | 0.2518518518518518 |
---|
acc_norm_stderr | 0.026466117538959905 |
---|
|
---|
hendrycksTest-high_school_microeconomics | acc | 0.24789915966386555 |
---|
acc_stderr | 0.028047967224176892 |
---|
acc_norm | 0.24789915966386555 |
---|
acc_norm_stderr | 0.028047967224176892 |
---|
|
---|
hendrycksTest-high_school_physics | acc | 0.31125827814569534 |
---|
acc_stderr | 0.037804458505267334 |
---|
acc_norm | 0.31125827814569534 |
---|
acc_norm_stderr | 0.037804458505267334 |
---|
|
---|
hendrycksTest-high_school_psychology | acc | 0.26422018348623855 |
---|
acc_stderr | 0.01890416417151019 |
---|
acc_norm | 0.26422018348623855 |
---|
acc_norm_stderr | 0.01890416417151019 |
---|
|
---|
hendrycksTest-high_school_statistics | acc | 0.2175925925925926 |
---|
acc_stderr | 0.028139689444859672 |
---|
acc_norm | 0.2175925925925926 |
---|
acc_norm_stderr | 0.028139689444859672 |
---|
|
---|
hendrycksTest-high_school_us_history | acc | 0.24019607843137256 |
---|
acc_stderr | 0.02998373305591362 |
---|
acc_norm | 0.24019607843137256 |
---|
acc_norm_stderr | 0.02998373305591362 |
---|
|
---|
hendrycksTest-high_school_world_history | acc | 0.28270042194092826 |
---|
acc_stderr | 0.029312814153955924 |
---|
acc_norm | 0.28270042194092826 |
---|
acc_norm_stderr | 0.029312814153955924 |
---|
|
---|
hendrycksTest-human_aging | acc | 0.3901345291479821 |
---|
acc_stderr | 0.03273766725459156 |
---|
acc_norm | 0.3901345291479821 |
---|
acc_norm_stderr | 0.03273766725459156 |
---|
|
---|
hendrycksTest-human_sexuality | acc | 0.1984732824427481 |
---|
acc_stderr | 0.03498149385462473 |
---|
acc_norm | 0.1984732824427481 |
---|
acc_norm_stderr | 0.03498149385462473 |
---|
|
---|
hendrycksTest-international_law | acc | 0.30578512396694213 |
---|
acc_stderr | 0.04205953933884124 |
---|
acc_norm | 0.30578512396694213 |
---|
acc_norm_stderr | 0.04205953933884124 |
---|
|
---|
hendrycksTest-jurisprudence | acc | 0.24074074074074073 |
---|
acc_stderr | 0.0413311944024384 |
---|
acc_norm | 0.24074074074074073 |
---|
acc_norm_stderr | 0.0413311944024384 |
---|
|
---|
hendrycksTest-logical_fallacies | acc | 0.22085889570552147 |
---|
acc_stderr | 0.03259177392742177 |
---|
acc_norm | 0.22085889570552147 |
---|
acc_norm_stderr | 0.03259177392742177 |
---|
|
---|
hendrycksTest-machine_learning | acc | 0.24107142857142858 |
---|
acc_stderr | 0.04059867246952685 |
---|
acc_norm | 0.24107142857142858 |
---|
acc_norm_stderr | 0.04059867246952685 |
---|
|
---|
hendrycksTest-management | acc | 0.2912621359223301 |
---|
acc_stderr | 0.04498676320572922 |
---|
acc_norm | 0.2912621359223301 |
---|
acc_norm_stderr | 0.04498676320572922 |
---|
|
---|
hendrycksTest-marketing | acc | 0.28205128205128205 |
---|
acc_stderr | 0.029480360549541194 |
---|
acc_norm | 0.28205128205128205 |
---|
acc_norm_stderr | 0.029480360549541194 |
---|
|
---|
hendrycksTest-medical_genetics | acc | 0.31 |
---|
acc_stderr | 0.04648231987117316 |
---|
acc_norm | 0.31 |
---|
acc_norm_stderr | 0.04648231987117316 |
---|
|
---|
hendrycksTest-miscellaneous | acc | 0.30140485312899107 |
---|
acc_stderr | 0.016409091097268787 |
---|
acc_norm | 0.30140485312899107 |
---|
acc_norm_stderr | 0.016409091097268787 |
---|
|
---|
hendrycksTest-moral_disputes | acc | 0.30346820809248554 |
---|
acc_stderr | 0.024752411960917202 |
---|
acc_norm | 0.30346820809248554 |
---|
acc_norm_stderr | 0.024752411960917202 |
---|
|
---|
hendrycksTest-moral_scenarios | acc | 0.24692737430167597 |
---|
acc_stderr | 0.014422292204808835 |
---|
acc_norm | 0.24692737430167597 |
---|
acc_norm_stderr | 0.014422292204808835 |
---|
|
---|
hendrycksTest-nutrition | acc | 0.28431372549019607 |
---|
acc_stderr | 0.025829163272757475 |
---|
acc_norm | 0.28431372549019607 |
---|
acc_norm_stderr | 0.025829163272757475 |
---|
|
---|
hendrycksTest-philosophy | acc | 0.29260450160771706 |
---|
acc_stderr | 0.02583989833487798 |
---|
acc_norm | 0.29260450160771706 |
---|
acc_norm_stderr | 0.02583989833487798 |
---|
|
---|
hendrycksTest-prehistory | acc | 0.2993827160493827 |
---|
acc_stderr | 0.02548311560119547 |
---|
acc_norm | 0.2993827160493827 |
---|
acc_norm_stderr | 0.02548311560119547 |
---|
|
---|
hendrycksTest-professional_accounting | acc | 0.2624113475177305 |
---|
acc_stderr | 0.02624492034984301 |
---|
acc_norm | 0.2624113475177305 |
---|
acc_norm_stderr | 0.02624492034984301 |
---|
|
---|
hendrycksTest-professional_law | acc | 0.24641460234680573 |
---|
acc_stderr | 0.011005971399927242 |
---|
acc_norm | 0.24641460234680573 |
---|
acc_norm_stderr | 0.011005971399927242 |
---|
|
---|
hendrycksTest-professional_medicine | acc | 0.21691176470588236 |
---|
acc_stderr | 0.02503584522771125 |
---|
acc_norm | 0.21691176470588236 |
---|
acc_norm_stderr | 0.02503584522771125 |
---|
|
---|
hendrycksTest-professional_psychology | acc | 0.27941176470588236 |
---|
acc_stderr | 0.018152871051538812 |
---|
acc_norm | 0.27941176470588236 |
---|
acc_norm_stderr | 0.018152871051538812 |
---|
|
---|
hendrycksTest-public_relations | acc | 0.2636363636363636 |
---|
acc_stderr | 0.04220224692971987 |
---|
acc_norm | 0.2636363636363636 |
---|
acc_norm_stderr | 0.04220224692971987 |
---|
|
---|
hendrycksTest-security_studies | acc | 0.37551020408163266 |
---|
acc_stderr | 0.031001209039894836 |
---|
acc_norm | 0.37551020408163266 |
---|
acc_norm_stderr | 0.031001209039894836 |
---|
|
---|
hendrycksTest-sociology | acc | 0.26865671641791045 |
---|
acc_stderr | 0.03134328358208954 |
---|
acc_norm | 0.26865671641791045 |
---|
acc_norm_stderr | 0.03134328358208954 |
---|
|
---|
hendrycksTest-us_foreign_policy | acc | 0.35 |
---|
acc_stderr | 0.0479372485441102 |
---|
acc_norm | 0.35 |
---|
acc_norm_stderr | 0.0479372485441102 |
---|
|
---|
hendrycksTest-virology | acc | 0.3253012048192771 |
---|
acc_stderr | 0.03647168523683227 |
---|
acc_norm | 0.3253012048192771 |
---|
acc_norm_stderr | 0.03647168523683227 |
---|
|
---|
hendrycksTest-world_religions | acc | 0.30409356725146197 |
---|
acc_stderr | 0.035282112582452334 |
---|
acc_norm | 0.30409356725146197 |
---|
acc_norm_stderr | 0.035282112582452334 |
---|
|
---|
truthfulqa_mc | mc1 | 0.2460220318237454 |
---|
mc1_stderr | 0.015077219200662597 |
---|
mc2 | 0.3821583918021301 |
---|
mc2_stderr | 0.013892826466923648 |
---|
|
---|