|
dataset,prompt,metric,value
|
|
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5416
|
|
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.509
|
|
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.364
|
|
amazon_reviews_multi_en,median,accuracy,0.509
|
|
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.4448
|
|
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.4326
|
|
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.2802
|
|
amazon_reviews_multi_es,median,accuracy,0.4326
|
|
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.449
|
|
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.4392
|
|
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3128
|
|
amazon_reviews_multi_fr,median,accuracy,0.4392
|
|
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.421
|
|
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.4048
|
|
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.302
|
|
amazon_reviews_multi_zh,median,accuracy,0.4048
|
|
aqua_rat_raw,Answer questions from options,accuracy,0.2755905511811024
|
|
aqua_rat_raw,answer_quiz,accuracy,0.2677165354330709
|
|
aqua_rat_raw,select_the_best_option,accuracy,0.28346456692913385
|
|
aqua_rat_raw,median,accuracy,0.2755905511811024
|
|
art_None,choose_hypothesis,accuracy,0.6742819843342036
|
|
art_None,choose_hypothesis_believable,accuracy,0.6677545691906005
|
|
art_None,choose_hypothesis_desc,accuracy,0.5515665796344648
|
|
art_None,choose_hypothesis_likely,accuracy,0.5737597911227154
|
|
art_None,choose_hypothesis_options,accuracy,0.6657963446475196
|
|
art_None,median,accuracy,0.6657963446475196
|
|
banking77_None,direct_to_which_department,accuracy,0.16688311688311688
|
|
banking77_None,help_page_topic,accuracy,0.2857142857142857
|
|
banking77_None,rephrase_as_banking_term,accuracy,0.2905844155844156
|
|
banking77_None,median,accuracy,0.2857142857142857
|
|
blbooksgenre_title_genre_classifiction,classify,accuracy,0.3127880184331797
|
|
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.8640552995391705
|
|
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7707373271889401
|
|
blbooksgenre_title_genre_classifiction,median,accuracy,0.7707373271889401
|
|
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.466
|
|
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.327
|
|
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.498
|
|
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.507
|
|
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.516
|
|
blimp_adjunct_island,median,accuracy,0.498
|
|
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.24299674267100976
|
|
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.36612377850162864
|
|
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.15895765472312703
|
|
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.14788273615635178
|
|
climate_fever_None,third_evidence_claim_pair,accuracy,0.18631921824104233
|
|
climate_fever_None,median,accuracy,0.18631921824104233
|
|
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.8065561959654178
|
|
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.7885446685878963
|
|
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.8090778097982709
|
|
codah_codah,median,accuracy,0.8065561959654178
|
|
commonsense_qa_None,answer_given_question_without_options,accuracy,0.7018837018837019
|
|
commonsense_qa_None,most_suitable_answer,accuracy,0.8304668304668305
|
|
commonsense_qa_None,question_answering,accuracy,0.8026208026208026
|
|
commonsense_qa_None,median,accuracy,0.8026208026208026
|
|
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
|
|
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
|
|
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
|
|
conv_ai_3_None,score_give_number,accuracy,0.21444012105490703
|
|
conv_ai_3_None,score_how_much,accuracy,0.21444012105490703
|
|
conv_ai_3_None,median,accuracy,0.39040207522697795
|
|
craigslist_bargains_None,best deal,accuracy,0.5175879396984925
|
|
craigslist_bargains_None,good deal for seller,accuracy,0.2864321608040201
|
|
craigslist_bargains_None,good deal for seller no list price,accuracy,0.16917922948073702
|
|
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.24288107202680068
|
|
craigslist_bargains_None,median,accuracy,0.2646566164154104
|
|
emotion_None,answer_question_with_emotion_label,accuracy,0.3675
|
|
emotion_None,answer_with_class_label,accuracy,0.1445
|
|
emotion_None,choose_the_best_emotion_label,accuracy,0.3665
|
|
emotion_None,reply_with_emoation_label,accuracy,0.452
|
|
emotion_None,median,accuracy,0.367
|
|
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.24823321554770317
|
|
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.0627208480565371
|
|
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.3630742049469965
|
|
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.37234982332155475
|
|
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.05830388692579505
|
|
financial_phrasebank_sentences_allagree,median,accuracy,0.24823321554770317
|
|
glue_cola,Following sentence acceptable,accuracy,0.50143815915628
|
|
glue_cola,Make sense yes no,accuracy,0.6337488015340365
|
|
glue_cola,Previous sentence acceptable,accuracy,0.3461169702780441
|
|
glue_cola,editing,accuracy,0.4458293384467881
|
|
glue_cola,is_this_correct,accuracy,0.4228187919463087
|
|
glue_cola,median,accuracy,0.4458293384467881
|
|
glue_sst2,following positive negative,accuracy,0.944954128440367
|
|
glue_sst2,happy or mad,accuracy,0.9334862385321101
|
|
glue_sst2,positive negative after,accuracy,0.9392201834862385
|
|
glue_sst2,review,accuracy,0.9506880733944955
|
|
glue_sst2,said,accuracy,0.819954128440367
|
|
glue_sst2,median,accuracy,0.9392201834862385
|
|
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.32430453879941434
|
|
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.3330893118594436
|
|
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.5395314787701317
|
|
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.5314787701317716
|
|
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.5380673499267935
|
|
head_qa_en,median,accuracy,0.5314787701317716
|
|
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.3213762811127379
|
|
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.32723279648609077
|
|
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.5080527086383602
|
|
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.5175695461200586
|
|
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.5153733528550513
|
|
head_qa_es,median,accuracy,0.5080527086383602
|
|
health_fact_None,claim_explanation_classification,accuracy,0.6130612244897959
|
|
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.4791836734693877
|
|
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.052244897959183675
|
|
health_fact_None,median,accuracy,0.4791836734693877
|
|
hlgd_None,is_same_event_editor_asks,accuracy,0.5360077332044466
|
|
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6549057515708071
|
|
hlgd_None,is_same_event_refer,accuracy,0.7114548090865153
|
|
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.6756887385210246
|
|
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.7844369260512325
|
|
hlgd_None,median,accuracy,0.6756887385210246
|
|
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6372093023255814
|
|
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,median,accuracy,0.6310077519379845
|
|
liar_None,Given statement guess category,accuracy,0.2087227414330218
|
|
liar_None,median,accuracy,0.2087227414330218
|
|
lince_sa_spaeng,express sentiment,accuracy,0.5960193652501344
|
|
lince_sa_spaeng,negation template,accuracy,0.36847767616998384
|
|
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.6008606777837547
|
|
lince_sa_spaeng,sentiment trying to express,accuracy,0.5954814416352878
|
|
lince_sa_spaeng,the author seem,accuracy,0.5965572888649812
|
|
lince_sa_spaeng,median,accuracy,0.5960193652501344
|
|
math_qa_None,choose_correct_og,accuracy,0.22981574539363483
|
|
math_qa_None,first_choice_then_problem,accuracy,0.192964824120603
|
|
math_qa_None,gre_problem,accuracy,0.2184254606365159
|
|
math_qa_None,pick_the_correct,accuracy,0.2150753768844221
|
|
math_qa_None,problem_set_type,accuracy,0.4737018425460637
|
|
math_qa_None,median,accuracy,0.2184254606365159
|
|
mlsum_es,layman_summ_es,bleu,0.036061261250491146
|
|
mlsum_es,palm_prompt,bleu,0.04155428402841844
|
|
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.027821053236675306
|
|
mlsum_es,median,bleu,0.036061261250491146
|
|
movie_rationales_None,Evidences + review,accuracy,0.985
|
|
movie_rationales_None,Evidences sentiment classification,accuracy,0.995
|
|
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.955
|
|
movie_rationales_None,median,accuracy,0.985
|
|
mwsc_None,in-the-sentence,accuracy,0.6829268292682927
|
|
mwsc_None,in-the-sentence-question-first,accuracy,0.6585365853658537
|
|
mwsc_None,is-correct,accuracy,0.7195121951219512
|
|
mwsc_None,options-or,accuracy,0.8048780487804879
|
|
mwsc_None,what-think,accuracy,0.7682926829268293
|
|
mwsc_None,median,accuracy,0.7195121951219512
|
|
onestop_english_None,ara_context,accuracy,0.4673721340388007
|
|
onestop_english_None,assess,accuracy,0.3350970017636684
|
|
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.5308641975308642
|
|
onestop_english_None,esl_context,accuracy,0.41798941798941797
|
|
onestop_english_None,esl_variation,accuracy,0.3386243386243386
|
|
onestop_english_None,median,accuracy,0.41798941798941797
|
|
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.20952380952380953
|
|
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.23809523809523808
|
|
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.23809523809523808
|
|
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.23809523809523808
|
|
poem_sentiment_None,question_answer_format,accuracy,0.24761904761904763
|
|
poem_sentiment_None,median,accuracy,0.23809523809523808
|
|
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.704
|
|
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.744
|
|
pubmed_qa_pqa_labeled,median,accuracy,0.724
|
|
riddle_sense_None,answer_given_question_without_options,accuracy,0.5925563173359452
|
|
riddle_sense_None,most_suitable_answer,accuracy,0.5161606268364348
|
|
riddle_sense_None,question_answering,accuracy,0.47502448579823703
|
|
riddle_sense_None,question_to_answer_index,accuracy,0.49657198824681686
|
|
riddle_sense_None,median,accuracy,0.5063663075416258
|
|
scicite_None,Classify intent,accuracy,0.6266375545851528
|
|
scicite_None,Classify intent (choices first),accuracy,0.4705240174672489
|
|
scicite_None,Classify intent (select choice),accuracy,0.4388646288209607
|
|
scicite_None,Classify intent w/section (select choice),accuracy,0.5491266375545851
|
|
scicite_None,can_describe,accuracy,0.6342794759825328
|
|
scicite_None,median,accuracy,0.5491266375545851
|
|
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9184713375796179
|
|
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9426751592356688
|
|
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.9006369426751593
|
|
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.910828025477707
|
|
selqa_answer_selection_analysis,median,accuracy,0.9146496815286624
|
|
snips_built_in_intents_None,categorize_query,accuracy,0.7865853658536586
|
|
snips_built_in_intents_None,categorize_query_brief,accuracy,0.7012195121951219
|
|
snips_built_in_intents_None,intent_query,accuracy,0.4176829268292683
|
|
snips_built_in_intents_None,query_intent,accuracy,0.7835365853658537
|
|
snips_built_in_intents_None,voice_intent,accuracy,0.7012195121951219
|
|
snips_built_in_intents_None,median,accuracy,0.7012195121951219
|
|
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.26028441633496957
|
|
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.26105356968174953
|
|
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.17923414272364485
|
|
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.23518794525011924
|
|
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.27490320032481685
|
|
wmt14_fr_en_en-fr,median,bleu,0.26028441633496957
|
|
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.22344520948134364
|
|
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.2988387938888211
|
|
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.2897671081332691
|
|
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.3370883690137962
|
|
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.26028992585410116
|
|
wmt14_fr_en_fr-en,median,bleu,0.2897671081332691
|
|
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.09550778502148496
|
|
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.10547062820945455
|
|
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.034030829410154916
|
|
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.1149224530123302
|
|
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.06980407323250921
|
|
wmt14_hi_en_en-hi,median,bleu,0.09550778502148496
|
|
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.04963973034828739
|
|
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.11802320249982352
|
|
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,1.9401417583412615e-15
|
|
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.2117559943306028
|
|
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.1834661289471336
|
|
wmt14_hi_en_hi-en,median,bleu,0.11802320249982352
|
|
multiple,average,multiple,0.4784114531991768
|
|
|