hellaswag | acc | 0.5415255925114519 |
---|
acc_stderr | 0.004972543127767884 |
---|
acc_norm | 0.7211710814578769 |
---|
acc_norm_stderr | 0.004475067344626758 |
---|
|
---|
hendrycksTest-abstract_algebra | acc | 0.22 |
---|
acc_stderr | 0.04163331998932269 |
---|
acc_norm | 0.22 |
---|
acc_norm_stderr | 0.04163331998932269 |
---|
|
---|
hendrycksTest-anatomy | acc | 0.21481481481481482 |
---|
acc_stderr | 0.03547854198560824 |
---|
acc_norm | 0.21481481481481482 |
---|
acc_norm_stderr | 0.03547854198560824 |
---|
|
---|
hendrycksTest-astronomy | acc | 0.2894736842105263 |
---|
acc_stderr | 0.03690677986137283 |
---|
acc_norm | 0.2894736842105263 |
---|
acc_norm_stderr | 0.03690677986137283 |
---|
|
---|
hendrycksTest-business_ethics | acc | 0.37 |
---|
acc_stderr | 0.04852365870939099 |
---|
acc_norm | 0.37 |
---|
acc_norm_stderr | 0.04852365870939099 |
---|
|
---|
hendrycksTest-clinical_knowledge | acc | 0.2641509433962264 |
---|
acc_stderr | 0.02713429162874172 |
---|
acc_norm | 0.2641509433962264 |
---|
acc_norm_stderr | 0.02713429162874172 |
---|
|
---|
hendrycksTest-college_biology | acc | 0.2361111111111111 |
---|
acc_stderr | 0.03551446610810826 |
---|
acc_norm | 0.2361111111111111 |
---|
acc_norm_stderr | 0.03551446610810826 |
---|
|
---|
hendrycksTest-college_chemistry | acc | 0.24 |
---|
acc_stderr | 0.04292346959909284 |
---|
acc_norm | 0.24 |
---|
acc_norm_stderr | 0.04292346959909284 |
---|
|
---|
hendrycksTest-college_computer_science | acc | 0.22 |
---|
acc_stderr | 0.04163331998932269 |
---|
acc_norm | 0.22 |
---|
acc_norm_stderr | 0.04163331998932269 |
---|
|
---|
hendrycksTest-college_mathematics | acc | 0.3 |
---|
acc_stderr | 0.046056618647183814 |
---|
acc_norm | 0.3 |
---|
acc_norm_stderr | 0.046056618647183814 |
---|
|
---|
hendrycksTest-college_medicine | acc | 0.21965317919075145 |
---|
acc_stderr | 0.031568093627031744 |
---|
acc_norm | 0.21965317919075145 |
---|
acc_norm_stderr | 0.031568093627031744 |
---|
|
---|
hendrycksTest-college_physics | acc | 0.19607843137254902 |
---|
acc_stderr | 0.03950581861179961 |
---|
acc_norm | 0.19607843137254902 |
---|
acc_norm_stderr | 0.03950581861179961 |
---|
|
---|
hendrycksTest-computer_security | acc | 0.33 |
---|
acc_stderr | 0.04725815626252605 |
---|
acc_norm | 0.33 |
---|
acc_norm_stderr | 0.04725815626252605 |
---|
|
---|
hendrycksTest-conceptual_physics | acc | 0.3148936170212766 |
---|
acc_stderr | 0.03036358219723817 |
---|
acc_norm | 0.3148936170212766 |
---|
acc_norm_stderr | 0.03036358219723817 |
---|
|
---|
hendrycksTest-econometrics | acc | 0.22807017543859648 |
---|
acc_stderr | 0.03947152782669416 |
---|
acc_norm | 0.22807017543859648 |
---|
acc_norm_stderr | 0.03947152782669416 |
---|
|
---|
hendrycksTest-electrical_engineering | acc | 0.2 |
---|
acc_stderr | 0.0333333333333333 |
---|
acc_norm | 0.2 |
---|
acc_norm_stderr | 0.0333333333333333 |
---|
|
---|
hendrycksTest-elementary_mathematics | acc | 0.29365079365079366 |
---|
acc_stderr | 0.023456037383982022 |
---|
acc_norm | 0.29365079365079366 |
---|
acc_norm_stderr | 0.023456037383982022 |
---|
|
---|
hendrycksTest-formal_logic | acc | 0.1984126984126984 |
---|
acc_stderr | 0.03567016675276862 |
---|
acc_norm | 0.1984126984126984 |
---|
acc_norm_stderr | 0.03567016675276862 |
---|
|
---|
hendrycksTest-global_facts | acc | 0.32 |
---|
acc_stderr | 0.046882617226215034 |
---|
acc_norm | 0.32 |
---|
acc_norm_stderr | 0.046882617226215034 |
---|
|
---|
hendrycksTest-high_school_biology | acc | 0.23225806451612904 |
---|
acc_stderr | 0.02402225613030824 |
---|
acc_norm | 0.23225806451612904 |
---|
acc_norm_stderr | 0.02402225613030824 |
---|
|
---|
hendrycksTest-high_school_chemistry | acc | 0.23645320197044334 |
---|
acc_stderr | 0.029896114291733552 |
---|
acc_norm | 0.23645320197044334 |
---|
acc_norm_stderr | 0.029896114291733552 |
---|
|
---|
hendrycksTest-high_school_computer_science | acc | 0.26 |
---|
acc_stderr | 0.044084400227680794 |
---|
acc_norm | 0.26 |
---|
acc_norm_stderr | 0.044084400227680794 |
---|
|
---|
hendrycksTest-high_school_european_history | acc | 0.22424242424242424 |
---|
acc_stderr | 0.032568666616811015 |
---|
acc_norm | 0.22424242424242424 |
---|
acc_norm_stderr | 0.032568666616811015 |
---|
|
---|
hendrycksTest-high_school_geography | acc | 0.20707070707070707 |
---|
acc_stderr | 0.02886977846026705 |
---|
acc_norm | 0.20707070707070707 |
---|
acc_norm_stderr | 0.02886977846026705 |
---|
|
---|
hendrycksTest-high_school_government_and_politics | acc | 0.22797927461139897 |
---|
acc_stderr | 0.030276909945178274 |
---|
acc_norm | 0.22797927461139897 |
---|
acc_norm_stderr | 0.030276909945178274 |
---|
|
---|
hendrycksTest-high_school_macroeconomics | acc | 0.2358974358974359 |
---|
acc_stderr | 0.021525965407408726 |
---|
acc_norm | 0.2358974358974359 |
---|
acc_norm_stderr | 0.021525965407408726 |
---|
|
---|
hendrycksTest-high_school_mathematics | acc | 0.25925925925925924 |
---|
acc_stderr | 0.026719240783712163 |
---|
acc_norm | 0.25925925925925924 |
---|
acc_norm_stderr | 0.026719240783712163 |
---|
|
---|
hendrycksTest-high_school_microeconomics | acc | 0.2815126050420168 |
---|
acc_stderr | 0.029213549414372167 |
---|
acc_norm | 0.2815126050420168 |
---|
acc_norm_stderr | 0.029213549414372167 |
---|
|
---|
hendrycksTest-high_school_physics | acc | 0.2980132450331126 |
---|
acc_stderr | 0.037345356767871984 |
---|
acc_norm | 0.2980132450331126 |
---|
acc_norm_stderr | 0.037345356767871984 |
---|
|
---|
hendrycksTest-high_school_psychology | acc | 0.23669724770642203 |
---|
acc_stderr | 0.01822407811729907 |
---|
acc_norm | 0.23669724770642203 |
---|
acc_norm_stderr | 0.01822407811729907 |
---|
|
---|
hendrycksTest-high_school_statistics | acc | 0.1574074074074074 |
---|
acc_stderr | 0.02483717351824239 |
---|
acc_norm | 0.1574074074074074 |
---|
acc_norm_stderr | 0.02483717351824239 |
---|
|
---|
hendrycksTest-high_school_us_history | acc | 0.2647058823529412 |
---|
acc_stderr | 0.03096451792692341 |
---|
acc_norm | 0.2647058823529412 |
---|
acc_norm_stderr | 0.03096451792692341 |
---|
|
---|
hendrycksTest-high_school_world_history | acc | 0.28270042194092826 |
---|
acc_stderr | 0.02931281415395592 |
---|
acc_norm | 0.28270042194092826 |
---|
acc_norm_stderr | 0.02931281415395592 |
---|
|
---|
hendrycksTest-human_aging | acc | 0.39461883408071746 |
---|
acc_stderr | 0.03280400504755291 |
---|
acc_norm | 0.39461883408071746 |
---|
acc_norm_stderr | 0.03280400504755291 |
---|
|
---|
hendrycksTest-human_sexuality | acc | 0.22900763358778625 |
---|
acc_stderr | 0.036853466317118506 |
---|
acc_norm | 0.22900763358778625 |
---|
acc_norm_stderr | 0.036853466317118506 |
---|
|
---|
hendrycksTest-international_law | acc | 0.30578512396694213 |
---|
acc_stderr | 0.04205953933884124 |
---|
acc_norm | 0.30578512396694213 |
---|
acc_norm_stderr | 0.04205953933884124 |
---|
|
---|
hendrycksTest-jurisprudence | acc | 0.32407407407407407 |
---|
acc_stderr | 0.04524596007030048 |
---|
acc_norm | 0.32407407407407407 |
---|
acc_norm_stderr | 0.04524596007030048 |
---|
|
---|
hendrycksTest-logical_fallacies | acc | 0.22699386503067484 |
---|
acc_stderr | 0.03291099578615769 |
---|
acc_norm | 0.22699386503067484 |
---|
acc_norm_stderr | 0.03291099578615769 |
---|
|
---|
hendrycksTest-machine_learning | acc | 0.2857142857142857 |
---|
acc_stderr | 0.04287858751340456 |
---|
acc_norm | 0.2857142857142857 |
---|
acc_norm_stderr | 0.04287858751340456 |
---|
|
---|
hendrycksTest-management | acc | 0.27184466019417475 |
---|
acc_stderr | 0.044052680241409216 |
---|
acc_norm | 0.27184466019417475 |
---|
acc_norm_stderr | 0.044052680241409216 |
---|
|
---|
hendrycksTest-marketing | acc | 0.27350427350427353 |
---|
acc_stderr | 0.02920254015343117 |
---|
acc_norm | 0.27350427350427353 |
---|
acc_norm_stderr | 0.02920254015343117 |
---|
|
---|
hendrycksTest-medical_genetics | acc | 0.23 |
---|
acc_stderr | 0.04229525846816505 |
---|
acc_norm | 0.23 |
---|
acc_norm_stderr | 0.04229525846816505 |
---|
|
---|
hendrycksTest-miscellaneous | acc | 0.2937420178799489 |
---|
acc_stderr | 0.016287759388491672 |
---|
acc_norm | 0.2937420178799489 |
---|
acc_norm_stderr | 0.016287759388491672 |
---|
|
---|
hendrycksTest-moral_disputes | acc | 0.2774566473988439 |
---|
acc_stderr | 0.024105712607754307 |
---|
acc_norm | 0.2774566473988439 |
---|
acc_norm_stderr | 0.024105712607754307 |
---|
|
---|
hendrycksTest-moral_scenarios | acc | 0.24916201117318434 |
---|
acc_stderr | 0.014465893829859933 |
---|
acc_norm | 0.24916201117318434 |
---|
acc_norm_stderr | 0.014465893829859933 |
---|
|
---|
hendrycksTest-nutrition | acc | 0.24836601307189543 |
---|
acc_stderr | 0.024739981355113596 |
---|
acc_norm | 0.24836601307189543 |
---|
acc_norm_stderr | 0.024739981355113596 |
---|
|
---|
hendrycksTest-philosophy | acc | 0.2733118971061093 |
---|
acc_stderr | 0.02531176597542612 |
---|
acc_norm | 0.2733118971061093 |
---|
acc_norm_stderr | 0.02531176597542612 |
---|
|
---|
hendrycksTest-prehistory | acc | 0.2623456790123457 |
---|
acc_stderr | 0.024477222856135107 |
---|
acc_norm | 0.2623456790123457 |
---|
acc_norm_stderr | 0.024477222856135107 |
---|
|
---|
hendrycksTest-professional_accounting | acc | 0.2801418439716312 |
---|
acc_stderr | 0.026789172351140228 |
---|
acc_norm | 0.2801418439716312 |
---|
acc_norm_stderr | 0.026789172351140228 |
---|
|
---|
hendrycksTest-professional_law | acc | 0.24185136897001303 |
---|
acc_stderr | 0.010936550813827065 |
---|
acc_norm | 0.24185136897001303 |
---|
acc_norm_stderr | 0.010936550813827065 |
---|
|
---|
hendrycksTest-professional_medicine | acc | 0.20220588235294118 |
---|
acc_stderr | 0.02439819298665492 |
---|
acc_norm | 0.20220588235294118 |
---|
acc_norm_stderr | 0.02439819298665492 |
---|
|
---|
hendrycksTest-professional_psychology | acc | 0.25980392156862747 |
---|
acc_stderr | 0.017740899509177795 |
---|
acc_norm | 0.25980392156862747 |
---|
acc_norm_stderr | 0.017740899509177795 |
---|
|
---|
hendrycksTest-public_relations | acc | 0.34545454545454546 |
---|
acc_stderr | 0.04554619617541054 |
---|
acc_norm | 0.34545454545454546 |
---|
acc_norm_stderr | 0.04554619617541054 |
---|
|
---|
hendrycksTest-security_studies | acc | 0.23673469387755103 |
---|
acc_stderr | 0.02721283588407316 |
---|
acc_norm | 0.23673469387755103 |
---|
acc_norm_stderr | 0.02721283588407316 |
---|
|
---|
hendrycksTest-sociology | acc | 0.23880597014925373 |
---|
acc_stderr | 0.030147775935409224 |
---|
acc_norm | 0.23880597014925373 |
---|
acc_norm_stderr | 0.030147775935409224 |
---|
|
---|
hendrycksTest-us_foreign_policy | acc | 0.29 |
---|
acc_stderr | 0.04560480215720684 |
---|
acc_norm | 0.29 |
---|
acc_norm_stderr | 0.04560480215720684 |
---|
|
---|
hendrycksTest-virology | acc | 0.3132530120481928 |
---|
acc_stderr | 0.036108050180310235 |
---|
acc_norm | 0.3132530120481928 |
---|
acc_norm_stderr | 0.036108050180310235 |
---|
|
---|
hendrycksTest-world_religions | acc | 0.34502923976608185 |
---|
acc_stderr | 0.036459813773888065 |
---|
acc_norm | 0.34502923976608185 |
---|
acc_norm_stderr | 0.036459813773888065 |
---|
|
---|
truthfulqa_mc | mc1 | 0.23990208078335373 |
---|
mc1_stderr | 0.014948812679062133 |
---|
mc2 | 0.38208865449224466 |
---|
mc2_stderr | 0.013866257677161405 |
---|
|
---|
arc_challenge | acc | 0.3856655290102389 |
---|
acc_stderr | 0.014224250973257177 |
---|
acc_norm | 0.41723549488054607 |
---|
acc_norm_stderr | 0.014409825518403079 |
---|
|
---|