dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.486
anli_dev_r1,MNLI crowdsource,accuracy,0.427
anli_dev_r1,can we infer,accuracy,0.474
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.39
anli_dev_r1,justified in saying,accuracy,0.46
anli_dev_r1,median,accuracy,0.46
anli_dev_r2,GPT-3 style,accuracy,0.441
anli_dev_r2,MNLI crowdsource,accuracy,0.406
anli_dev_r2,can we infer,accuracy,0.426
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.36
anli_dev_r2,justified in saying,accuracy,0.419
anli_dev_r2,median,accuracy,0.419
anli_dev_r3,GPT-3 style,accuracy,0.455
anli_dev_r3,MNLI crowdsource,accuracy,0.42
anli_dev_r3,can we infer,accuracy,0.445
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.32083333333333336
anli_dev_r3,justified in saying,accuracy,0.4266666666666667
anli_dev_r3,median,accuracy,0.4266666666666667
story_cloze_2016,Answer Given options,accuracy,0.9567076429716729
story_cloze_2016,Choose Story Ending,accuracy,0.9625868519508284
story_cloze_2016,Generate Ending,accuracy,0.7814003206841261
story_cloze_2016,Novel Correct Ending,accuracy,0.9577765900587921
story_cloze_2016,Story Continuation and Options,accuracy,0.951362907536077
story_cloze_2016,median,accuracy,0.9567076429716729
super_glue_cb,GPT-3 style,accuracy,0.8214285714285714
super_glue_cb,MNLI crowdsource,accuracy,0.375
super_glue_cb,can we infer,accuracy,0.8214285714285714
super_glue_cb,guaranteed/possible/impossible,accuracy,0.7321428571428571
super_glue_cb,justified in saying,accuracy,0.7678571428571429
super_glue_cb,median,accuracy,0.7678571428571429
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.62
super_glue_copa,best_option,accuracy,0.87
super_glue_copa,cause_effect,accuracy,0.88
super_glue_copa,i_am_hesitating,accuracy,0.91
super_glue_copa,plausible_alternatives,accuracy,0.88
super_glue_copa,median,accuracy,0.88
super_glue_rte,GPT-3 style,accuracy,0.8303249097472925
super_glue_rte,MNLI crowdsource,accuracy,0.855595667870036
super_glue_rte,does it follow that,accuracy,0.7833935018050542
super_glue_rte,guaranteed true,accuracy,0.8122743682310469
super_glue_rte,should assume,accuracy,0.8194945848375451
super_glue_rte,median,accuracy,0.8194945848375451
winogrande_winogrande_xl,Replace,accuracy,0.584846093133386
winogrande_winogrande_xl,True or False,accuracy,0.5217048145224941
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5840568271507498
winogrande_winogrande_xl,stand for,accuracy,0.5114443567482242
winogrande_winogrande_xl,underscore refer to,accuracy,0.5927387529597474
winogrande_winogrande_xl,median,accuracy,0.5840568271507498
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_id,best_option,accuracy,0.78
xcopa_id,cause_effect,accuracy,0.86
xcopa_id,i_am_hesitating,accuracy,0.79
xcopa_id,plausible_alternatives,accuracy,0.84
xcopa_id,median,accuracy,0.79
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_sw,best_option,accuracy,0.6
xcopa_sw,cause_effect,accuracy,0.6
xcopa_sw,i_am_hesitating,accuracy,0.64
xcopa_sw,plausible_alternatives,accuracy,0.62
xcopa_sw,median,accuracy,0.6
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.58
xcopa_ta,best_option,accuracy,0.67
xcopa_ta,cause_effect,accuracy,0.67
xcopa_ta,i_am_hesitating,accuracy,0.68
xcopa_ta,plausible_alternatives,accuracy,0.69
xcopa_ta,median,accuracy,0.67
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_vi,best_option,accuracy,0.83
xcopa_vi,cause_effect,accuracy,0.87
xcopa_vi,i_am_hesitating,accuracy,0.84
xcopa_vi,plausible_alternatives,accuracy,0.86
xcopa_vi,median,accuracy,0.84
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_zh,best_option,accuracy,0.83
xcopa_zh,cause_effect,accuracy,0.9
xcopa_zh,i_am_hesitating,accuracy,0.9
xcopa_zh,plausible_alternatives,accuracy,0.86
xcopa_zh,median,accuracy,0.86
xnli_ar,GPT-3 style,accuracy,0.5357429718875502
xnli_ar,MNLI crowdsource,accuracy,0.41004016064257026
xnli_ar,can we infer,accuracy,0.5606425702811245
xnli_ar,guaranteed/possible/impossible,accuracy,0.6068273092369478
xnli_ar,justified in saying,accuracy,0.5437751004016064
xnli_ar,median,accuracy,0.5437751004016064
xnli_en,GPT-3 style,accuracy,0.6168674698795181
xnli_en,MNLI crowdsource,accuracy,0.45502008032128516
xnli_en,can we infer,accuracy,0.6092369477911647
xnli_en,guaranteed/possible/impossible,accuracy,0.6746987951807228
xnli_en,justified in saying,accuracy,0.5895582329317269
xnli_en,median,accuracy,0.6092369477911647
xnli_es,GPT-3 style,accuracy,0.585140562248996
xnli_es,MNLI crowdsource,accuracy,0.4357429718875502
xnli_es,can we infer,accuracy,0.5883534136546185
xnli_es,guaranteed/possible/impossible,accuracy,0.6124497991967871
xnli_es,justified in saying,accuracy,0.5734939759036145
xnli_es,median,accuracy,0.585140562248996
xnli_fr,GPT-3 style,accuracy,0.5771084337349398
xnli_fr,MNLI crowdsource,accuracy,0.43012048192771085
xnli_fr,can we infer,accuracy,0.5807228915662651
xnli_fr,guaranteed/possible/impossible,accuracy,0.6136546184738956
xnli_fr,justified in saying,accuracy,0.5694779116465863
xnli_fr,median,accuracy,0.5771084337349398
xnli_hi,GPT-3 style,accuracy,0.5248995983935743
xnli_hi,MNLI crowdsource,accuracy,0.3795180722891566
xnli_hi,can we infer,accuracy,0.5506024096385542
xnli_hi,guaranteed/possible/impossible,accuracy,0.5682730923694779
xnli_hi,justified in saying,accuracy,0.5353413654618474
xnli_hi,median,accuracy,0.5353413654618474
xnli_sw,GPT-3 style,accuracy,0.4795180722891566
xnli_sw,MNLI crowdsource,accuracy,0.39196787148594375
xnli_sw,can we infer,accuracy,0.5208835341365462
xnli_sw,guaranteed/possible/impossible,accuracy,0.5036144578313253
xnli_sw,justified in saying,accuracy,0.5184738955823294
xnli_sw,median,accuracy,0.5036144578313253
xnli_ur,GPT-3 style,accuracy,0.46586345381526106
xnli_ur,MNLI crowdsource,accuracy,0.3718875502008032
xnli_ur,can we infer,accuracy,0.5080321285140562
xnli_ur,guaranteed/possible/impossible,accuracy,0.4995983935742972
xnli_ur,justified in saying,accuracy,0.5080321285140562
xnli_ur,median,accuracy,0.4995983935742972
xnli_vi,GPT-3 style,accuracy,0.5578313253012048
xnli_vi,MNLI crowdsource,accuracy,0.42449799196787147
xnli_vi,can we infer,accuracy,0.5678714859437751
xnli_vi,guaranteed/possible/impossible,accuracy,0.6100401606425703
xnli_vi,justified in saying,accuracy,0.5538152610441767
xnli_vi,median,accuracy,0.5578313253012048
xnli_zh,GPT-3 style,accuracy,0.5526104417670683
xnli_zh,MNLI crowdsource,accuracy,0.38473895582329315
xnli_zh,can we infer,accuracy,0.5690763052208835
xnli_zh,guaranteed/possible/impossible,accuracy,0.5674698795180723
xnli_zh,justified in saying,accuracy,0.5622489959839357
xnli_zh,median,accuracy,0.5622489959839357
xstory_cloze_ar,Answer Given options,accuracy,0.7968232958305758
xstory_cloze_ar,Choose Story Ending,accuracy,0.9232296492389146
xstory_cloze_ar,Generate Ending,accuracy,0.6677696889477167
xstory_cloze_ar,Novel Correct Ending,accuracy,0.9265387160820648
xstory_cloze_ar,Story Continuation and Options,accuracy,0.9126406353408338
xstory_cloze_ar,median,accuracy,0.9126406353408338
xstory_cloze_es,Answer Given options,accuracy,0.8729318332230311
xstory_cloze_es,Choose Story Ending,accuracy,0.9417604235605559
xstory_cloze_es,Generate Ending,accuracy,0.7359364659166115
xstory_cloze_es,Novel Correct Ending,accuracy,0.9430840502978161
xstory_cloze_es,Story Continuation and Options,accuracy,0.9318332230311053
xstory_cloze_es,median,accuracy,0.9318332230311053
xstory_cloze_eu,Answer Given options,accuracy,0.7054930509596293
xstory_cloze_eu,Choose Story Ending,accuracy,0.8663136995367307
xstory_cloze_eu,Generate Ending,accuracy,0.6320317670416943
xstory_cloze_eu,Novel Correct Ending,accuracy,0.8689609530112509
xstory_cloze_eu,Story Continuation and Options,accuracy,0.8524156187954997
xstory_cloze_eu,median,accuracy,0.8524156187954997
xstory_cloze_hi,Answer Given options,accuracy,0.798808735936466
xstory_cloze_hi,Choose Story Ending,accuracy,0.8702845797485109
xstory_cloze_hi,Generate Ending,accuracy,0.6604897418927862
xstory_cloze_hi,Novel Correct Ending,accuracy,0.8788881535407015
xstory_cloze_hi,Story Continuation and Options,accuracy,0.870946393117141
xstory_cloze_hi,median,accuracy,0.8702845797485109
xstory_cloze_id,Answer Given options,accuracy,0.8557246856386499
xstory_cloze_id,Choose Story Ending,accuracy,0.9212442091330245
xstory_cloze_id,Generate Ending,accuracy,0.7041694242223693
xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
xstory_cloze_id,Story Continuation and Options,accuracy,0.9066843150231635
xstory_cloze_id,median,accuracy,0.9066843150231635
xstory_cloze_zh,Answer Given options,accuracy,0.900066181336863
xstory_cloze_zh,Choose Story Ending,accuracy,0.9232296492389146
xstory_cloze_zh,Generate Ending,accuracy,0.684976836532098
xstory_cloze_zh,Novel Correct Ending,accuracy,0.9311714096624751
xstory_cloze_zh,Story Continuation and Options,accuracy,0.9199205823957644
xstory_cloze_zh,median,accuracy,0.9199205823957644
xwinograd_en,Replace,accuracy,0.6847311827956989
xwinograd_en,True or False,accuracy,0.5135483870967742
xwinograd_en,does underscore refer to,accuracy,0.6787096774193548
xwinograd_en,stand for,accuracy,0.5053763440860215
xwinograd_en,underscore refer to,accuracy,0.690752688172043
xwinograd_en,median,accuracy,0.6787096774193548
xwinograd_fr,Replace,accuracy,0.6506024096385542
xwinograd_fr,True or False,accuracy,0.4939759036144578
xwinograd_fr,does underscore refer to,accuracy,0.6867469879518072
xwinograd_fr,stand for,accuracy,0.46987951807228917
xwinograd_fr,underscore refer to,accuracy,0.6626506024096386
xwinograd_fr,median,accuracy,0.6506024096385542
xwinograd_pt,Replace,accuracy,0.6349809885931559
xwinograd_pt,True or False,accuracy,0.4866920152091255
xwinograd_pt,does underscore refer to,accuracy,0.6387832699619772
xwinograd_pt,stand for,accuracy,0.49429657794676807
xwinograd_pt,underscore refer to,accuracy,0.6425855513307985
xwinograd_pt,median,accuracy,0.6349809885931559
xwinograd_zh,Replace,accuracy,0.6865079365079365
xwinograd_zh,True or False,accuracy,0.5277777777777778
xwinograd_zh,does underscore refer to,accuracy,0.6884920634920635
xwinograd_zh,stand for,accuracy,0.4861111111111111
xwinograd_zh,underscore refer to,accuracy,0.6904761904761905
xwinograd_zh,median,accuracy,0.6865079365079365
multiple,average,multiple,0.6903830754158429