Muennighoff's picture
Logs
ed75a42
raw
history blame
10.5 kB
dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.421
anli_dev_r1,MNLI crowdsource,accuracy,0.356
anli_dev_r1,can we infer,accuracy,0.411
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.331
anli_dev_r1,justified in saying,accuracy,0.409
anli_dev_r1,median,accuracy,0.409
anli_dev_r2,GPT-3 style,accuracy,0.395
anli_dev_r2,MNLI crowdsource,accuracy,0.339
anli_dev_r2,can we infer,accuracy,0.394
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.324
anli_dev_r2,justified in saying,accuracy,0.382
anli_dev_r2,median,accuracy,0.382
anli_dev_r3,GPT-3 style,accuracy,0.41
anli_dev_r3,MNLI crowdsource,accuracy,0.36
anli_dev_r3,can we infer,accuracy,0.4083333333333333
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.3225
anli_dev_r3,justified in saying,accuracy,0.38
anli_dev_r3,median,accuracy,0.38
story_cloze_2016,Answer Given options,accuracy,0.8733297701763763
story_cloze_2016,Choose Story Ending,accuracy,0.9043292357028327
story_cloze_2016,Generate Ending,accuracy,0.7129877071084981
story_cloze_2016,Novel Correct Ending,accuracy,0.8909673971138429
story_cloze_2016,Story Continuation and Options,accuracy,0.8952431854623196
story_cloze_2016,median,accuracy,0.8909673971138429
super_glue_cb,GPT-3 style,accuracy,0.8035714285714286
super_glue_cb,MNLI crowdsource,accuracy,0.125
super_glue_cb,can we infer,accuracy,0.5714285714285714
super_glue_cb,guaranteed/possible/impossible,accuracy,0.3392857142857143
super_glue_cb,justified in saying,accuracy,0.6428571428571429
super_glue_cb,median,accuracy,0.5714285714285714
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.63
super_glue_copa,best_option,accuracy,0.81
super_glue_copa,cause_effect,accuracy,0.85
super_glue_copa,i_am_hesitating,accuracy,0.86
super_glue_copa,plausible_alternatives,accuracy,0.83
super_glue_copa,median,accuracy,0.83
super_glue_rte,GPT-3 style,accuracy,0.8411552346570397
super_glue_rte,MNLI crowdsource,accuracy,0.8014440433212996
super_glue_rte,does it follow that,accuracy,0.7653429602888087
super_glue_rte,guaranteed true,accuracy,0.7906137184115524
super_glue_rte,should assume,accuracy,0.7870036101083032
super_glue_rte,median,accuracy,0.7906137184115524
winogrande_winogrande_xl,Replace,accuracy,0.5580110497237569
winogrande_winogrande_xl,True or False,accuracy,0.531965272296764
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5540647198105761
winogrande_winogrande_xl,stand for,accuracy,0.5153906866614049
winogrande_winogrande_xl,underscore refer to,accuracy,0.5564325177584846
winogrande_winogrande_xl,median,accuracy,0.5540647198105761
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.53
xcopa_id,best_option,accuracy,0.7
xcopa_id,cause_effect,accuracy,0.75
xcopa_id,i_am_hesitating,accuracy,0.76
xcopa_id,plausible_alternatives,accuracy,0.7
xcopa_id,median,accuracy,0.7
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.63
xcopa_sw,best_option,accuracy,0.44
xcopa_sw,cause_effect,accuracy,0.51
xcopa_sw,i_am_hesitating,accuracy,0.44
xcopa_sw,plausible_alternatives,accuracy,0.47
xcopa_sw,median,accuracy,0.47
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_ta,best_option,accuracy,0.52
xcopa_ta,cause_effect,accuracy,0.55
xcopa_ta,i_am_hesitating,accuracy,0.49
xcopa_ta,plausible_alternatives,accuracy,0.57
xcopa_ta,median,accuracy,0.55
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_vi,best_option,accuracy,0.78
xcopa_vi,cause_effect,accuracy,0.79
xcopa_vi,i_am_hesitating,accuracy,0.79
xcopa_vi,plausible_alternatives,accuracy,0.78
xcopa_vi,median,accuracy,0.78
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.67
xcopa_zh,best_option,accuracy,0.74
xcopa_zh,cause_effect,accuracy,0.8
xcopa_zh,i_am_hesitating,accuracy,0.76
xcopa_zh,plausible_alternatives,accuracy,0.8
xcopa_zh,median,accuracy,0.76
xnli_ar,GPT-3 style,accuracy,0.529718875502008
xnli_ar,MNLI crowdsource,accuracy,0.378714859437751
xnli_ar,can we infer,accuracy,0.5325301204819277
xnli_ar,guaranteed/possible/impossible,accuracy,0.3542168674698795
xnli_ar,justified in saying,accuracy,0.5120481927710844
xnli_ar,median,accuracy,0.5120481927710844
xnli_en,GPT-3 style,accuracy,0.5967871485943775
xnli_en,MNLI crowdsource,accuracy,0.38112449799196785
xnli_en,can we infer,accuracy,0.5755020080321285
xnli_en,guaranteed/possible/impossible,accuracy,0.40883534136546185
xnli_en,justified in saying,accuracy,0.5538152610441767
xnli_en,median,accuracy,0.5538152610441767
xnli_es,GPT-3 style,accuracy,0.548995983935743
xnli_es,MNLI crowdsource,accuracy,0.42369477911646586
xnli_es,can we infer,accuracy,0.551004016064257
xnli_es,guaranteed/possible/impossible,accuracy,0.36947791164658633
xnli_es,justified in saying,accuracy,0.5253012048192771
xnli_es,median,accuracy,0.5253012048192771
xnli_fr,GPT-3 style,accuracy,0.5526104417670683
xnli_fr,MNLI crowdsource,accuracy,0.41566265060240964
xnli_fr,can we infer,accuracy,0.5437751004016064
xnli_fr,guaranteed/possible/impossible,accuracy,0.351004016064257
xnli_fr,justified in saying,accuracy,0.5240963855421686
xnli_fr,median,accuracy,0.5240963855421686
xnli_hi,GPT-3 style,accuracy,0.5020080321285141
xnli_hi,MNLI crowdsource,accuracy,0.3654618473895582
xnli_hi,can we infer,accuracy,0.5088353413654618
xnli_hi,guaranteed/possible/impossible,accuracy,0.35582329317269074
xnli_hi,justified in saying,accuracy,0.4835341365461847
xnli_hi,median,accuracy,0.4835341365461847
xnli_sw,GPT-3 style,accuracy,0.4602409638554217
xnli_sw,MNLI crowdsource,accuracy,0.37309236947791163
xnli_sw,can we infer,accuracy,0.46626506024096387
xnli_sw,guaranteed/possible/impossible,accuracy,0.3823293172690763
xnli_sw,justified in saying,accuracy,0.4389558232931727
xnli_sw,median,accuracy,0.4389558232931727
xnli_ur,GPT-3 style,accuracy,0.4646586345381526
xnli_ur,MNLI crowdsource,accuracy,0.3497991967871486
xnli_ur,can we infer,accuracy,0.4650602409638554
xnli_ur,guaranteed/possible/impossible,accuracy,0.342570281124498
xnli_ur,justified in saying,accuracy,0.4393574297188755
xnli_ur,median,accuracy,0.4393574297188755
xnli_vi,GPT-3 style,accuracy,0.5293172690763053
xnli_vi,MNLI crowdsource,accuracy,0.3863453815261044
xnli_vi,can we infer,accuracy,0.5265060240963856
xnli_vi,guaranteed/possible/impossible,accuracy,0.38032128514056224
xnli_vi,justified in saying,accuracy,0.5008032128514056
xnli_vi,median,accuracy,0.5008032128514056
xnli_zh,GPT-3 style,accuracy,0.5236947791164659
xnli_zh,MNLI crowdsource,accuracy,0.3819277108433735
xnli_zh,can we infer,accuracy,0.536144578313253
xnli_zh,guaranteed/possible/impossible,accuracy,0.3610441767068273
xnli_zh,justified in saying,accuracy,0.5124497991967871
xnli_zh,median,accuracy,0.5124497991967871
xstory_cloze_ar,Answer Given options,accuracy,0.7306419589675711
xstory_cloze_ar,Choose Story Ending,accuracy,0.8325612177365983
xstory_cloze_ar,Generate Ending,accuracy,0.599602911978822
xstory_cloze_ar,Novel Correct Ending,accuracy,0.8206485771012575
xstory_cloze_ar,Story Continuation and Options,accuracy,0.814692256783587
xstory_cloze_ar,median,accuracy,0.814692256783587
xstory_cloze_es,Answer Given options,accuracy,0.827928524156188
xstory_cloze_es,Choose Story Ending,accuracy,0.8888153540701522
xstory_cloze_es,Generate Ending,accuracy,0.6730641958967571
xstory_cloze_es,Novel Correct Ending,accuracy,0.8676373262739907
xstory_cloze_es,Story Continuation and Options,accuracy,0.8888153540701522
xstory_cloze_es,median,accuracy,0.8676373262739907
xstory_cloze_eu,Answer Given options,accuracy,0.6320317670416943
xstory_cloze_eu,Choose Story Ending,accuracy,0.7332892124420913
xstory_cloze_eu,Generate Ending,accuracy,0.5638649900727994
xstory_cloze_eu,Novel Correct Ending,accuracy,0.7081403044341495
xstory_cloze_eu,Story Continuation and Options,accuracy,0.7233620119126406
xstory_cloze_eu,median,accuracy,0.7081403044341495
xstory_cloze_hi,Answer Given options,accuracy,0.6664460622104567
xstory_cloze_hi,Choose Story Ending,accuracy,0.7948378557246857
xstory_cloze_hi,Generate Ending,accuracy,0.5962938451356717
xstory_cloze_hi,Novel Correct Ending,accuracy,0.7531436135009927
xstory_cloze_hi,Story Continuation and Options,accuracy,0.7895433487756452
xstory_cloze_hi,median,accuracy,0.7531436135009927
xstory_cloze_id,Answer Given options,accuracy,0.7544672402382528
xstory_cloze_id,Choose Story Ending,accuracy,0.8424884182660489
xstory_cloze_id,Generate Ending,accuracy,0.6360026472534746
xstory_cloze_id,Novel Correct Ending,accuracy,0.8239576439444076
xstory_cloze_id,Story Continuation and Options,accuracy,0.8272667107875579
xstory_cloze_id,median,accuracy,0.8239576439444076
xstory_cloze_zh,Answer Given options,accuracy,0.7802779616148247
xstory_cloze_zh,Choose Story Ending,accuracy,0.8504301786896096
xstory_cloze_zh,Generate Ending,accuracy,0.6154864328259431
xstory_cloze_zh,Novel Correct Ending,accuracy,0.8491065519523494
xstory_cloze_zh,Story Continuation and Options,accuracy,0.8491065519523494
xstory_cloze_zh,median,accuracy,0.8491065519523494
xwinograd_en,Replace,accuracy,0.6602150537634408
xwinograd_en,True or False,accuracy,0.4972043010752688
xwinograd_en,does underscore refer to,accuracy,0.6008602150537634
xwinograd_en,stand for,accuracy,0.5165591397849463
xwinograd_en,underscore refer to,accuracy,0.6283870967741936
xwinograd_en,median,accuracy,0.6008602150537634
xwinograd_fr,Replace,accuracy,0.5301204819277109
xwinograd_fr,True or False,accuracy,0.5060240963855421
xwinograd_fr,does underscore refer to,accuracy,0.5783132530120482
xwinograd_fr,stand for,accuracy,0.4939759036144578
xwinograd_fr,underscore refer to,accuracy,0.5421686746987951
xwinograd_fr,median,accuracy,0.5301204819277109
xwinograd_pt,Replace,accuracy,0.5779467680608364
xwinograd_pt,True or False,accuracy,0.4752851711026616
xwinograd_pt,does underscore refer to,accuracy,0.5551330798479087
xwinograd_pt,stand for,accuracy,0.49049429657794674
xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
xwinograd_pt,median,accuracy,0.5399239543726235
xwinograd_zh,Replace,accuracy,0.6369047619047619
xwinograd_zh,True or False,accuracy,0.5972222222222222
xwinograd_zh,does underscore refer to,accuracy,0.5793650793650794
xwinograd_zh,stand for,accuracy,0.47619047619047616
xwinograd_zh,underscore refer to,accuracy,0.5595238095238095
xwinograd_zh,median,accuracy,0.5793650793650794
multiple,average,multiple,0.6132932275048852