|
dataset,prompt,metric,value
|
|
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5044
|
|
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.4652
|
|
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.3226
|
|
amazon_reviews_multi_en,median,accuracy,0.4652
|
|
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.391
|
|
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.376
|
|
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.2564
|
|
amazon_reviews_multi_es,median,accuracy,0.376
|
|
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.4116
|
|
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.3878
|
|
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.274
|
|
amazon_reviews_multi_fr,median,accuracy,0.3878
|
|
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.3948
|
|
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.3754
|
|
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.2858
|
|
amazon_reviews_multi_zh,median,accuracy,0.3754
|
|
aqua_rat_raw,Answer questions from options,accuracy,0.20078740157480315
|
|
aqua_rat_raw,answer_quiz,accuracy,0.19291338582677164
|
|
aqua_rat_raw,select_the_best_option,accuracy,0.16535433070866143
|
|
aqua_rat_raw,median,accuracy,0.19291338582677164
|
|
art_None,choose_hypothesis,accuracy,0.4934725848563969
|
|
art_None,choose_hypothesis_believable,accuracy,0.5163185378590078
|
|
art_None,choose_hypothesis_desc,accuracy,0.512402088772846
|
|
art_None,choose_hypothesis_likely,accuracy,0.5150130548302873
|
|
art_None,choose_hypothesis_options,accuracy,0.5267624020887729
|
|
art_None,median,accuracy,0.5150130548302873
|
|
banking77_None,direct_to_which_department,accuracy,0.13214285714285715
|
|
banking77_None,help_page_topic,accuracy,0.14383116883116884
|
|
banking77_None,rephrase_as_banking_term,accuracy,0.16103896103896104
|
|
banking77_None,median,accuracy,0.14383116883116884
|
|
blbooksgenre_title_genre_classifiction,classify,accuracy,0.2534562211981567
|
|
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.4147465437788018
|
|
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7430875576036866
|
|
blbooksgenre_title_genre_classifiction,median,accuracy,0.4147465437788018
|
|
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.501
|
|
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.771
|
|
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.516
|
|
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.496
|
|
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.524
|
|
blimp_adjunct_island,median,accuracy,0.516
|
|
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.31791530944625407
|
|
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.24104234527687296
|
|
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.5570032573289903
|
|
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.45342019543973944
|
|
climate_fever_None,third_evidence_claim_pair,accuracy,0.6397394136807818
|
|
climate_fever_None,median,accuracy,0.45342019543973944
|
|
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.24927953890489912
|
|
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.24855907780979827
|
|
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.2503602305475504
|
|
codah_codah,median,accuracy,0.24927953890489912
|
|
commonsense_qa_None,answer_given_question_without_options,accuracy,0.3931203931203931
|
|
commonsense_qa_None,most_suitable_answer,accuracy,0.4398034398034398
|
|
commonsense_qa_None,question_answering,accuracy,0.44471744471744473
|
|
commonsense_qa_None,median,accuracy,0.4398034398034398
|
|
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
|
|
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
|
|
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
|
|
conv_ai_3_None,score_give_number,accuracy,0.21444012105490703
|
|
conv_ai_3_None,score_how_much,accuracy,0.1733679204496325
|
|
conv_ai_3_None,median,accuracy,0.39040207522697795
|
|
craigslist_bargains_None,best deal,accuracy,0.25963149078726966
|
|
craigslist_bargains_None,good deal for seller,accuracy,0.5192629815745393
|
|
craigslist_bargains_None,good deal for seller no list price,accuracy,0.7252931323283082
|
|
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.23785594639865998
|
|
craigslist_bargains_None,median,accuracy,0.3894472361809045
|
|
emotion_None,answer_question_with_emotion_label,accuracy,0.232
|
|
emotion_None,answer_with_class_label,accuracy,0.2585
|
|
emotion_None,choose_the_best_emotion_label,accuracy,0.467
|
|
emotion_None,reply_with_emoation_label,accuracy,0.436
|
|
emotion_None,median,accuracy,0.34725
|
|
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.1603356890459364
|
|
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.44787985865724383
|
|
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.31095406360424027
|
|
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.3224381625441696
|
|
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.2751766784452297
|
|
financial_phrasebank_sentences_allagree,median,accuracy,0.31095406360424027
|
|
glue_cola,Following sentence acceptable,accuracy,0.31639501438159157
|
|
glue_cola,Make sense yes no,accuracy,0.3087248322147651
|
|
glue_cola,Previous sentence acceptable,accuracy,0.31255992329817833
|
|
glue_cola,editing,accuracy,0.3077660594439118
|
|
glue_cola,is_this_correct,accuracy,0.4170661553211889
|
|
glue_cola,median,accuracy,0.31255992329817833
|
|
glue_sst2,following positive negative,accuracy,0.8795871559633027
|
|
glue_sst2,happy or mad,accuracy,0.6571100917431193
|
|
glue_sst2,positive negative after,accuracy,0.9277522935779816
|
|
glue_sst2,review,accuracy,0.9288990825688074
|
|
glue_sst2,said,accuracy,0.8325688073394495
|
|
glue_sst2,median,accuracy,0.8795871559633027
|
|
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.2591508052708638
|
|
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.2554904831625183
|
|
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.2759882869692533
|
|
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.2774524158125915
|
|
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.26720351390922403
|
|
head_qa_en,median,accuracy,0.26720351390922403
|
|
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.23206442166910687
|
|
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.2342606149341142
|
|
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.26939970717423134
|
|
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.2774524158125915
|
|
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.28257686676427524
|
|
head_qa_es,median,accuracy,0.26939970717423134
|
|
health_fact_None,claim_explanation_classification,accuracy,0.5477551020408163
|
|
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.4204081632653061
|
|
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.04653061224489796
|
|
health_fact_None,median,accuracy,0.4204081632653061
|
|
hlgd_None,is_same_event_editor_asks,accuracy,0.7385210246495891
|
|
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6447559207346544
|
|
hlgd_None,is_same_event_refer,accuracy,0.7491541807636539
|
|
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.7288545190913485
|
|
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.7153214113098115
|
|
hlgd_None,median,accuracy,0.7288545190913485
|
|
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6294573643410852
|
|
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6310077519379845
|
|
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6294573643410852
|
|
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.6263565891472869
|
|
hyperpartisan_news_detection_byarticle,median,accuracy,0.6294573643410852
|
|
liar_None,Given statement guess category,accuracy,0.17601246105919002
|
|
liar_None,median,accuracy,0.17601246105919002
|
|
lince_sa_spaeng,express sentiment,accuracy,0.5804195804195804
|
|
lince_sa_spaeng,negation template,accuracy,0.164066702528241
|
|
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5330823023130715
|
|
lince_sa_spaeng,sentiment trying to express,accuracy,0.5852608929532006
|
|
lince_sa_spaeng,the author seem,accuracy,0.5831091984938139
|
|
lince_sa_spaeng,median,accuracy,0.5804195804195804
|
|
math_qa_None,choose_correct_og,accuracy,0.1966499162479062
|
|
math_qa_None,first_choice_then_problem,accuracy,0.21239530988274707
|
|
math_qa_None,gre_problem,accuracy,0.19028475711892798
|
|
math_qa_None,pick_the_correct,accuracy,0.18860971524288106
|
|
math_qa_None,problem_set_type,accuracy,0.4556113902847571
|
|
math_qa_None,median,accuracy,0.1966499162479062
|
|
mlsum_es,layman_summ_es,bleu,0.03343300117413671
|
|
mlsum_es,palm_prompt,bleu,0.03619325486335941
|
|
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.022882710041529152
|
|
mlsum_es,median,bleu,0.03343300117413671
|
|
movie_rationales_None,Evidences + review,accuracy,0.935
|
|
movie_rationales_None,Evidences sentiment classification,accuracy,0.985
|
|
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.875
|
|
movie_rationales_None,median,accuracy,0.935
|
|
mwsc_None,in-the-sentence,accuracy,0.5365853658536586
|
|
mwsc_None,in-the-sentence-question-first,accuracy,0.5609756097560976
|
|
mwsc_None,is-correct,accuracy,0.5853658536585366
|
|
mwsc_None,options-or,accuracy,0.5
|
|
mwsc_None,what-think,accuracy,0.5121951219512195
|
|
mwsc_None,median,accuracy,0.5365853658536586
|
|
onestop_english_None,ara_context,accuracy,0.3368606701940035
|
|
onestop_english_None,assess,accuracy,0.37566137566137564
|
|
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.5185185185185185
|
|
onestop_english_None,esl_context,accuracy,0.43386243386243384
|
|
onestop_english_None,esl_variation,accuracy,0.5555555555555556
|
|
onestop_english_None,median,accuracy,0.43386243386243384
|
|
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.2
|
|
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.22857142857142856
|
|
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.21904761904761905
|
|
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.20952380952380953
|
|
poem_sentiment_None,question_answer_format,accuracy,0.20952380952380953
|
|
poem_sentiment_None,median,accuracy,0.20952380952380953
|
|
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.575
|
|
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.54
|
|
pubmed_qa_pqa_labeled,median,accuracy,0.5575
|
|
riddle_sense_None,answer_given_question_without_options,accuracy,0.37414299706170423
|
|
riddle_sense_None,most_suitable_answer,accuracy,0.23996082272282077
|
|
riddle_sense_None,question_answering,accuracy,0.21743388834476005
|
|
riddle_sense_None,question_to_answer_index,accuracy,0.20274240940254654
|
|
riddle_sense_None,median,accuracy,0.2286973555337904
|
|
scicite_None,Classify intent,accuracy,0.13427947598253276
|
|
scicite_None,Classify intent (choices first),accuracy,0.14737991266375547
|
|
scicite_None,Classify intent (select choice),accuracy,0.5141921397379913
|
|
scicite_None,Classify intent w/section (select choice),accuracy,0.5491266375545851
|
|
scicite_None,can_describe,accuracy,0.3307860262008734
|
|
scicite_None,median,accuracy,0.3307860262008734
|
|
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9006369426751593
|
|
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9235668789808917
|
|
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.5885350318471337
|
|
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.8955414012738854
|
|
selqa_answer_selection_analysis,median,accuracy,0.8980891719745223
|
|
snips_built_in_intents_None,categorize_query,accuracy,0.17682926829268292
|
|
snips_built_in_intents_None,categorize_query_brief,accuracy,0.17073170731707318
|
|
snips_built_in_intents_None,intent_query,accuracy,0.27134146341463417
|
|
snips_built_in_intents_None,query_intent,accuracy,0.2896341463414634
|
|
snips_built_in_intents_None,voice_intent,accuracy,0.4634146341463415
|
|
snips_built_in_intents_None,median,accuracy,0.27134146341463417
|
|
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.02362605149306088
|
|
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.021337681990699707
|
|
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.001200164695535072
|
|
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.022199071642137146
|
|
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.07032093073230125
|
|
wmt14_fr_en_en-fr,median,bleu,0.022199071642137146
|
|
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.20707991062833414
|
|
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.2124042389381691
|
|
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.008578242652965286
|
|
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.18961608535661192
|
|
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.13352998875720223
|
|
wmt14_fr_en_fr-en,median,bleu,0.18961608535661192
|
|
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.018311038199670525
|
|
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.004060513374212342
|
|
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,2.3016809855913707e-10
|
|
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.0023958056990365257
|
|
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.00893152501923088
|
|
wmt14_hi_en_en-hi,median,bleu,0.004060513374212342
|
|
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.06966981523891758
|
|
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.09023899192195889
|
|
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,3.7797885719160415e-140
|
|
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.07812739647665366
|
|
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.061384876720111704
|
|
wmt14_hi_en_hi-en,median,bleu,0.06966981523891758
|
|
multiple,average,multiple,0.37870942800865803
|
|
|