|
dataset,prompt,metric,value
|
|
anli_dev_r1,GPT-3 style,accuracy,0.421
|
|
anli_dev_r1,MNLI crowdsource,accuracy,0.356
|
|
anli_dev_r1,can we infer,accuracy,0.411
|
|
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.331
|
|
anli_dev_r1,justified in saying,accuracy,0.409
|
|
anli_dev_r1,median,accuracy,0.409
|
|
anli_dev_r2,GPT-3 style,accuracy,0.395
|
|
anli_dev_r2,MNLI crowdsource,accuracy,0.339
|
|
anli_dev_r2,can we infer,accuracy,0.394
|
|
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.324
|
|
anli_dev_r2,justified in saying,accuracy,0.382
|
|
anli_dev_r2,median,accuracy,0.382
|
|
anli_dev_r3,GPT-3 style,accuracy,0.41
|
|
anli_dev_r3,MNLI crowdsource,accuracy,0.36
|
|
anli_dev_r3,can we infer,accuracy,0.4083333333333333
|
|
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.3225
|
|
anli_dev_r3,justified in saying,accuracy,0.38
|
|
anli_dev_r3,median,accuracy,0.38
|
|
story_cloze_2016,Answer Given options,accuracy,0.8733297701763763
|
|
story_cloze_2016,Choose Story Ending,accuracy,0.9043292357028327
|
|
story_cloze_2016,Generate Ending,accuracy,0.7129877071084981
|
|
story_cloze_2016,Novel Correct Ending,accuracy,0.8909673971138429
|
|
story_cloze_2016,Story Continuation and Options,accuracy,0.8952431854623196
|
|
story_cloze_2016,median,accuracy,0.8909673971138429
|
|
super_glue_cb,GPT-3 style,accuracy,0.8035714285714286
|
|
super_glue_cb,MNLI crowdsource,accuracy,0.125
|
|
super_glue_cb,can we infer,accuracy,0.5714285714285714
|
|
super_glue_cb,guaranteed/possible/impossible,accuracy,0.3392857142857143
|
|
super_glue_cb,justified in saying,accuracy,0.6428571428571429
|
|
super_glue_cb,median,accuracy,0.5714285714285714
|
|
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.63
|
|
super_glue_copa,best_option,accuracy,0.81
|
|
super_glue_copa,cause_effect,accuracy,0.85
|
|
super_glue_copa,i_am_hesitating,accuracy,0.86
|
|
super_glue_copa,plausible_alternatives,accuracy,0.83
|
|
super_glue_copa,median,accuracy,0.83
|
|
super_glue_rte,GPT-3 style,accuracy,0.8411552346570397
|
|
super_glue_rte,MNLI crowdsource,accuracy,0.8014440433212996
|
|
super_glue_rte,does it follow that,accuracy,0.7653429602888087
|
|
super_glue_rte,guaranteed true,accuracy,0.7906137184115524
|
|
super_glue_rte,should assume,accuracy,0.7870036101083032
|
|
super_glue_rte,median,accuracy,0.7906137184115524
|
|
winogrande_winogrande_xl,Replace,accuracy,0.5580110497237569
|
|
winogrande_winogrande_xl,True or False,accuracy,0.531965272296764
|
|
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5540647198105761
|
|
winogrande_winogrande_xl,stand for,accuracy,0.5153906866614049
|
|
winogrande_winogrande_xl,underscore refer to,accuracy,0.5564325177584846
|
|
winogrande_winogrande_xl,median,accuracy,0.5540647198105761
|
|
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.53
|
|
xcopa_id,best_option,accuracy,0.7
|
|
xcopa_id,cause_effect,accuracy,0.75
|
|
xcopa_id,i_am_hesitating,accuracy,0.76
|
|
xcopa_id,plausible_alternatives,accuracy,0.7
|
|
xcopa_id,median,accuracy,0.7
|
|
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.63
|
|
xcopa_sw,best_option,accuracy,0.44
|
|
xcopa_sw,cause_effect,accuracy,0.51
|
|
xcopa_sw,i_am_hesitating,accuracy,0.44
|
|
xcopa_sw,plausible_alternatives,accuracy,0.47
|
|
xcopa_sw,median,accuracy,0.47
|
|
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57
|
|
xcopa_ta,best_option,accuracy,0.52
|
|
xcopa_ta,cause_effect,accuracy,0.55
|
|
xcopa_ta,i_am_hesitating,accuracy,0.49
|
|
xcopa_ta,plausible_alternatives,accuracy,0.57
|
|
xcopa_ta,median,accuracy,0.55
|
|
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
|
|
xcopa_vi,best_option,accuracy,0.78
|
|
xcopa_vi,cause_effect,accuracy,0.79
|
|
xcopa_vi,i_am_hesitating,accuracy,0.79
|
|
xcopa_vi,plausible_alternatives,accuracy,0.78
|
|
xcopa_vi,median,accuracy,0.78
|
|
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.67
|
|
xcopa_zh,best_option,accuracy,0.74
|
|
xcopa_zh,cause_effect,accuracy,0.8
|
|
xcopa_zh,i_am_hesitating,accuracy,0.76
|
|
xcopa_zh,plausible_alternatives,accuracy,0.8
|
|
xcopa_zh,median,accuracy,0.76
|
|
xnli_ar,GPT-3 style,accuracy,0.529718875502008
|
|
xnli_ar,MNLI crowdsource,accuracy,0.378714859437751
|
|
xnli_ar,can we infer,accuracy,0.5325301204819277
|
|
xnli_ar,guaranteed/possible/impossible,accuracy,0.3542168674698795
|
|
xnli_ar,justified in saying,accuracy,0.5120481927710844
|
|
xnli_ar,median,accuracy,0.5120481927710844
|
|
xnli_en,GPT-3 style,accuracy,0.5967871485943775
|
|
xnli_en,MNLI crowdsource,accuracy,0.38112449799196785
|
|
xnli_en,can we infer,accuracy,0.5755020080321285
|
|
xnli_en,guaranteed/possible/impossible,accuracy,0.40883534136546185
|
|
xnli_en,justified in saying,accuracy,0.5538152610441767
|
|
xnli_en,median,accuracy,0.5538152610441767
|
|
xnli_es,GPT-3 style,accuracy,0.548995983935743
|
|
xnli_es,MNLI crowdsource,accuracy,0.42369477911646586
|
|
xnli_es,can we infer,accuracy,0.551004016064257
|
|
xnli_es,guaranteed/possible/impossible,accuracy,0.36947791164658633
|
|
xnli_es,justified in saying,accuracy,0.5253012048192771
|
|
xnli_es,median,accuracy,0.5253012048192771
|
|
xnli_fr,GPT-3 style,accuracy,0.5526104417670683
|
|
xnli_fr,MNLI crowdsource,accuracy,0.41566265060240964
|
|
xnli_fr,can we infer,accuracy,0.5437751004016064
|
|
xnli_fr,guaranteed/possible/impossible,accuracy,0.351004016064257
|
|
xnli_fr,justified in saying,accuracy,0.5240963855421686
|
|
xnli_fr,median,accuracy,0.5240963855421686
|
|
xnli_hi,GPT-3 style,accuracy,0.5020080321285141
|
|
xnli_hi,MNLI crowdsource,accuracy,0.3654618473895582
|
|
xnli_hi,can we infer,accuracy,0.5088353413654618
|
|
xnli_hi,guaranteed/possible/impossible,accuracy,0.35582329317269074
|
|
xnli_hi,justified in saying,accuracy,0.4835341365461847
|
|
xnli_hi,median,accuracy,0.4835341365461847
|
|
xnli_sw,GPT-3 style,accuracy,0.4602409638554217
|
|
xnli_sw,MNLI crowdsource,accuracy,0.37309236947791163
|
|
xnli_sw,can we infer,accuracy,0.46626506024096387
|
|
xnli_sw,guaranteed/possible/impossible,accuracy,0.3823293172690763
|
|
xnli_sw,justified in saying,accuracy,0.4389558232931727
|
|
xnli_sw,median,accuracy,0.4389558232931727
|
|
xnli_ur,GPT-3 style,accuracy,0.4646586345381526
|
|
xnli_ur,MNLI crowdsource,accuracy,0.3497991967871486
|
|
xnli_ur,can we infer,accuracy,0.4650602409638554
|
|
xnli_ur,guaranteed/possible/impossible,accuracy,0.342570281124498
|
|
xnli_ur,justified in saying,accuracy,0.4393574297188755
|
|
xnli_ur,median,accuracy,0.4393574297188755
|
|
xnli_vi,GPT-3 style,accuracy,0.5293172690763053
|
|
xnli_vi,MNLI crowdsource,accuracy,0.3863453815261044
|
|
xnli_vi,can we infer,accuracy,0.5265060240963856
|
|
xnli_vi,guaranteed/possible/impossible,accuracy,0.38032128514056224
|
|
xnli_vi,justified in saying,accuracy,0.5008032128514056
|
|
xnli_vi,median,accuracy,0.5008032128514056
|
|
xnli_zh,GPT-3 style,accuracy,0.5236947791164659
|
|
xnli_zh,MNLI crowdsource,accuracy,0.3819277108433735
|
|
xnli_zh,can we infer,accuracy,0.536144578313253
|
|
xnli_zh,guaranteed/possible/impossible,accuracy,0.3610441767068273
|
|
xnli_zh,justified in saying,accuracy,0.5124497991967871
|
|
xnli_zh,median,accuracy,0.5124497991967871
|
|
xstory_cloze_ar,Answer Given options,accuracy,0.7306419589675711
|
|
xstory_cloze_ar,Choose Story Ending,accuracy,0.8325612177365983
|
|
xstory_cloze_ar,Generate Ending,accuracy,0.599602911978822
|
|
xstory_cloze_ar,Novel Correct Ending,accuracy,0.8206485771012575
|
|
xstory_cloze_ar,Story Continuation and Options,accuracy,0.814692256783587
|
|
xstory_cloze_ar,median,accuracy,0.814692256783587
|
|
xstory_cloze_es,Answer Given options,accuracy,0.827928524156188
|
|
xstory_cloze_es,Choose Story Ending,accuracy,0.8888153540701522
|
|
xstory_cloze_es,Generate Ending,accuracy,0.6730641958967571
|
|
xstory_cloze_es,Novel Correct Ending,accuracy,0.8676373262739907
|
|
xstory_cloze_es,Story Continuation and Options,accuracy,0.8888153540701522
|
|
xstory_cloze_es,median,accuracy,0.8676373262739907
|
|
xstory_cloze_eu,Answer Given options,accuracy,0.6320317670416943
|
|
xstory_cloze_eu,Choose Story Ending,accuracy,0.7332892124420913
|
|
xstory_cloze_eu,Generate Ending,accuracy,0.5638649900727994
|
|
xstory_cloze_eu,Novel Correct Ending,accuracy,0.7081403044341495
|
|
xstory_cloze_eu,Story Continuation and Options,accuracy,0.7233620119126406
|
|
xstory_cloze_eu,median,accuracy,0.7081403044341495
|
|
xstory_cloze_hi,Answer Given options,accuracy,0.6664460622104567
|
|
xstory_cloze_hi,Choose Story Ending,accuracy,0.7948378557246857
|
|
xstory_cloze_hi,Generate Ending,accuracy,0.5962938451356717
|
|
xstory_cloze_hi,Novel Correct Ending,accuracy,0.7531436135009927
|
|
xstory_cloze_hi,Story Continuation and Options,accuracy,0.7895433487756452
|
|
xstory_cloze_hi,median,accuracy,0.7531436135009927
|
|
xstory_cloze_id,Answer Given options,accuracy,0.7544672402382528
|
|
xstory_cloze_id,Choose Story Ending,accuracy,0.8424884182660489
|
|
xstory_cloze_id,Generate Ending,accuracy,0.6360026472534746
|
|
xstory_cloze_id,Novel Correct Ending,accuracy,0.8239576439444076
|
|
xstory_cloze_id,Story Continuation and Options,accuracy,0.8272667107875579
|
|
xstory_cloze_id,median,accuracy,0.8239576439444076
|
|
xstory_cloze_zh,Answer Given options,accuracy,0.7802779616148247
|
|
xstory_cloze_zh,Choose Story Ending,accuracy,0.8504301786896096
|
|
xstory_cloze_zh,Generate Ending,accuracy,0.6154864328259431
|
|
xstory_cloze_zh,Novel Correct Ending,accuracy,0.8491065519523494
|
|
xstory_cloze_zh,Story Continuation and Options,accuracy,0.8491065519523494
|
|
xstory_cloze_zh,median,accuracy,0.8491065519523494
|
|
xwinograd_en,Replace,accuracy,0.6602150537634408
|
|
xwinograd_en,True or False,accuracy,0.4972043010752688
|
|
xwinograd_en,does underscore refer to,accuracy,0.6008602150537634
|
|
xwinograd_en,stand for,accuracy,0.5165591397849463
|
|
xwinograd_en,underscore refer to,accuracy,0.6283870967741936
|
|
xwinograd_en,median,accuracy,0.6008602150537634
|
|
xwinograd_fr,Replace,accuracy,0.5301204819277109
|
|
xwinograd_fr,True or False,accuracy,0.5060240963855421
|
|
xwinograd_fr,does underscore refer to,accuracy,0.5783132530120482
|
|
xwinograd_fr,stand for,accuracy,0.4939759036144578
|
|
xwinograd_fr,underscore refer to,accuracy,0.5421686746987951
|
|
xwinograd_fr,median,accuracy,0.5301204819277109
|
|
xwinograd_pt,Replace,accuracy,0.5779467680608364
|
|
xwinograd_pt,True or False,accuracy,0.4752851711026616
|
|
xwinograd_pt,does underscore refer to,accuracy,0.5551330798479087
|
|
xwinograd_pt,stand for,accuracy,0.49049429657794674
|
|
xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
|
|
xwinograd_pt,median,accuracy,0.5399239543726235
|
|
xwinograd_zh,Replace,accuracy,0.6369047619047619
|
|
xwinograd_zh,True or False,accuracy,0.5972222222222222
|
|
xwinograd_zh,does underscore refer to,accuracy,0.5793650793650794
|
|
xwinograd_zh,stand for,accuracy,0.47619047619047616
|
|
xwinograd_zh,underscore refer to,accuracy,0.5595238095238095
|
|
xwinograd_zh,median,accuracy,0.5793650793650794
|
|
multiple,average,multiple,0.6132932275048852
|
|
|