Added contamination evidence from the GPT-4 Technical Report (string matching performed on GPT-4)

#11
Files changed (3) hide show
  1. .gitignore +2 -1
  2. app.py +2 -2
  3. contamination_report.csv +185 -17
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  *.pyc
2
- *.json
 
 
1
  *.pyc
2
+ *.json
3
+ *.lock
app.py CHANGED
@@ -22,11 +22,11 @@ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
22
  """
23
  if isinstance(eval_dataset, str):
24
  dataframe = dataframe[
25
- dataframe["Evaluation Dataset"].str.contains(eval_dataset)
26
  ]
27
  if isinstance(cont_source, str):
28
  dataframe = dataframe[
29
- dataframe["Contaminated Source"].str.contains(cont_source)
30
  ]
31
  if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes:
32
  dataframe = dataframe[dataframe["Approach"] != "model-based"]
 
22
  """
23
  if isinstance(eval_dataset, str):
24
  dataframe = dataframe[
25
+ dataframe["Evaluation Dataset"].str.contains(f"(?i){eval_dataset}")
26
  ]
27
  if isinstance(cont_source, str):
28
  dataframe = dataframe[
29
+ dataframe["Contaminated Source"].str.contains(f"(?i){cont_source}")
30
  ]
31
  if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes:
32
  dataframe = dataframe[dataframe["Approach"] != "model-based"]
contamination_report.csv CHANGED
@@ -1,5 +1,24 @@
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
5
  tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
@@ -436,30 +455,179 @@ zest;;EleutherAI/pile;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
436
  zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
437
 
438
 
439
- imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
440
- imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
441
 
442
- ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
443
- ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
444
 
445
- yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
446
- yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
447
 
448
- nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
449
- nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
450
 
451
- nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
452
- nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
453
 
454
- samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
455
- samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
456
 
457
- EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
458
- EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
459
 
460
- bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
461
- bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
462
 
463
  RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
464
  RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
465
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
3
+ gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
4
+ ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
5
+ openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
6
+ openai_humaneval;;GPT-3.5-turbo/0613;model;;;23.79;model-based;https://arxiv.org/abs/2402.15938;16
7
+ openai_humaneval;;GPT-3.5-turbo/1106;model;;;41.47;model-based;https://arxiv.org/abs/2402.15938;16
8
+ imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
9
+ imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
10
+ ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
11
+ ag_news;;GPT-3.5;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
12
+ yelp_review_full;;GPT-4;model;;;80.00;model-based;https://arxiv.org/abs/2311.06233;8
13
+ yelp_review_full;;GPT-3.5;model;;;13.00;model-based;https://arxiv.org/abs/2311.06233;8
14
+ nyu-mll/glue;rte;GPT-4;model;;60.00;;model-based;https://arxiv.org/abs/2311.06233;8
15
+ nyu-mll/glue;rte;GPT-3.5;model;;71.00;;model-based;https://arxiv.org/abs/2311.06233;8
16
+ nyu-mll/glue;wnli;GPT-4;model;;50.70;;model-based;https://arxiv.org/abs/2311.06233;8
17
+ nyu-mll/glue;wnli;GPT-3.5;model;;12.68;;model-based;https://arxiv.org/abs/2311.06233;8
18
+ samsum;;GPT-4;model;;;77.00;model-based;https://arxiv.org/abs/2311.06233;8
19
+ samsum;;GPT-3.5;model;;;74.00;model-based;https://arxiv.org/abs/2311.06233;8
20
+ EdinburghNLP/xsum;;GPT-4;model;;;95.00;model-based;https://arxiv.org/abs/2311.06233;8
21
+ EdinburghNLP/xsum;;GPT-3.5;model;;;79.00;model-based;https://arxiv.org/abs/2311.06233;8
22
 
23
  allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
24
  tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
 
455
  zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
456
 
457
 
458
+ imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
459
+ imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
460
 
461
+ ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
462
+ ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
463
 
464
+ yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
465
+ yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
466
 
467
+ nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
468
+ nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
469
 
470
+ nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
471
+ nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
472
 
473
+ samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
474
+ samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
475
 
476
+ EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
477
+ EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
478
 
479
+ bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
480
+ bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
481
 
482
  RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
483
  RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
484
+
485
+
486
+ openai_humaneval;;GPT-4;model;;;25.0;data-based;https://arxiv.org/abs/2303.08774;11
487
+ ucinlp/drop;;GPT-4;model;;21.0;;data-based;https://arxiv.org/abs/2303.08774;11
488
+ bigbench;;GPT-4;model;;;100.0;data-based;https://arxiv.org/abs/2303.08774;11
489
+ gsm8k;;GPT-4;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
490
+ EleutherAI/hendrycks_math;;GPT-4;model;100.0;;;data-based;https://arxiv.org/abs/2303.08774;11
491
+ cais/mmlu;;GPT-4;model;;;0.6;data-based;https://arxiv.org/abs/2303.08774;11
492
+ ibragim-bad/arc_challenge;;GPT-4;model;;;3.4;data-based;https://arxiv.org/abs/2303.08774;11
493
+ winogrande;;GPT-4;model;;;0.9;data-based;https://arxiv.org/abs/2303.08774;11
494
+ cais/mmlu;;GPT-3.5;model;;;52.0;model-based;https://arxiv.org/abs/2311.09783;10
495
+ winogrande;;GPT-3.5;model;;;9.0;model-based;https://arxiv.org/abs/2311.09783;10
496
+ truthful_qa;;GPT-3.5;model;;;12.0;model-based;https://arxiv.org/abs/2311.09783;10
497
+ allenai/openbookqa;;GPT-3.5;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
498
+
499
+ cais/mmlu;;GPT-4;model;;;57.0;model-based;https://arxiv.org/abs/2311.09783;10
500
+ truthful_qa;;GPT-4;model;;;10.0;model-based;https://arxiv.org/abs/2311.09783;10
501
+ winogrande;;GPT-4;model;;;12.0;model-based;https://arxiv.org/abs/2311.09783;10
502
+ allenai/openbookqa;;GPT-4;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
503
+ Rowan/hellaswag;;GPT-4;model;;;2.0;model-based;https://arxiv.org/abs/2311.09783;10
504
+
505
+
506
+ allenai/openbookqa;;LLaMa 2-13B;model;;;4.0;model-based;https://arxiv.org/abs/2311.09783;10
507
+ truthful_qa;;LLaMa 2-13B;model;;;2.0;model-based;https://arxiv.org/abs/2311.09783;10
508
+ winogrande;;LLaMa 2-13B;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
509
+
510
+ truthful_qa;;Mistral-7B;model;;;15.0;model-based;https://arxiv.org/abs/2311.09783;10
511
+ allenai/openbookqa;;Mistral-7B;model;;;10.0;model-based;https://arxiv.org/abs/2311.09783;10
512
+ winogrande;;Mistral-7B;model;;;3.0;model-based;https://arxiv.org/abs/2311.09783;10
513
+ cais/mmlu;;Mistral-7B;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
514
+ RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
515
+ RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
516
+ RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
517
+ RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
518
+
519
+
520
+ openai_humaneval;;EleutherAI/pile;corpus;;;12.2;data-based;https://arxiv.org/abs/2403.04811;12
521
+ mbpp;;EleutherAI/pile;corpus;;;3.6;data-based;https://arxiv.org/abs/2403.04811;12
522
+ openai_humaneval;;bigcode/the-stack;corpus;;;18.9;data-based;https://arxiv.org/abs/2403.04811;12
523
+ mbpp;;bigcode/the-stack;corpus;;;20.8;data-based;https://arxiv.org/abs/2403.04811;12
524
+
525
+ quac;;GPT-3;model;;99.0;;data-based;https://arxiv.org/abs/2005.14165;13
526
+ rajpurkar/squad_v2;;GPT-3;model;;94.0;;data-based;https://arxiv.org/abs/2005.14165;13
527
+ ucinlp/drop;;GPT-3;model;;93.0;;data-based;https://arxiv.org/abs/2005.14165;13
528
+ Symbol Insertion;;GPT-3;model;;86.0;;data-based;https://arxiv.org/abs/2005.14165;13
529
+ stanfordnlp/coqa;;GPT-3;model;;64.0;;data-based;https://arxiv.org/abs/2005.14165;13
530
+ super_glue;record;GPT-3;model;;61.0;;data-based;https://arxiv.org/abs/2005.14165;13
531
+ winograd_wsc;;GPT-3;model;;;60.0;data-based;https://arxiv.org/abs/2005.14165;13
532
+ super_glue;boolq;GPT-3;model;;60.0;;data-based;https://arxiv.org/abs/2005.14165;13
533
+ super_glue;multirc;GPT-3;model;;59.0;;data-based;https://arxiv.org/abs/2005.14165;13
534
+ race;high;GPT-3;model;;;45.0;data-based;https://arxiv.org/abs/2005.14165;13
535
+ cimec/lambada;;GPT-3;model;;;43.0;data-based;https://arxiv.org/abs/2005.14165;13
536
+ super_glue;wsc;GPT-3;model;;40.0;;data-based;https://arxiv.org/abs/2005.14165;13
537
+ piqa;;GPT-3;model;;29.0;;data-based;https://arxiv.org/abs/2005.14165;13
538
+ wmt/wmt16;en-de;GPT-3;model;;;25.0;data-based;https://arxiv.org/abs/2005.14165;13
539
+ wmt/wmt16;de-en;GPT-3;model;;;25.0;data-based;https://arxiv.org/abs/2005.14165;13
540
+ race;middle;GPT-3;model;;;25.0;data-based;https://arxiv.org/abs/2005.14165;13
541
+ rmanluo/RoG-webqsp;;GPT-3;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
542
+ wmt/wmt16;en-ro;GPT-3;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
543
+ wmt/wmt16;ro-en;GPT-3;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
544
+ facebook/anli;test_r1;GPT-3;model;;;20.0;data-based;https://arxiv.org/abs/2005.14165;13
545
+ facebook/anli;test_r2;GPT-3;model;;;18.0;data-based;https://arxiv.org/abs/2005.14165;13
546
+ mandarjoshi/trivia_qa;;GPT-3;model;;17.0;;data-based;https://arxiv.org/abs/2005.14165;13
547
+ facebook/anli;test_r3;GPT-3;model;;;16.0;data-based;https://arxiv.org/abs/2005.14165;13
548
+ wmt/wmt16;fr-en;GPT-3;model;;;14.0;data-based;https://arxiv.org/abs/2005.14165;13
549
+ wmt/wmt16;en-fr;GPT-3;model;;;14.0;data-based;https://arxiv.org/abs/2005.14165;13
550
+ super_glue;rte;GPT-3;model;;8.0;;data-based;https://arxiv.org/abs/2005.14165;13
551
+ super_glue;wic;GPT-3;model;;8.0;;data-based;https://arxiv.org/abs/2005.14165;13
552
+ super_glue;cb;GPT-3;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
553
+ Reversed Words;;GPT-3;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
554
+ Anagrams 2;;GPT-3;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
555
+ allenai/openbookqa;;GPT-3;model;;;6.0;data-based;https://arxiv.org/abs/2005.14165;13
556
+ ibragim-bad/arc_easy;;GPT-3;model;;;4.0;data-based;https://arxiv.org/abs/2005.14165;13
557
+ Anagrams 1;;GPT-3;model;;3.0;;data-based;https://arxiv.org/abs/2005.14165;13
558
+ ibragim-bad/arc_challenge;;GPT-3;model;;;3.0;data-based;https://arxiv.org/abs/2005.14165;13
559
+ super_glue;copa;GPT-3;model;;3.0;;data-based;https://arxiv.org/abs/2005.14165;13
560
+ Rowan/hellaswag;;GPT-3;model;;2.0;;data-based;https://arxiv.org/abs/2005.14165;13
561
+ natural_questions;;GPT-3;model;;;1.0;data-based;https://arxiv.org/abs/2005.14165;13
562
+ Cycled Letters;;GPT-3;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
563
+ SAT Analogies;;GPT-3;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
564
+
565
+ EdinburghNLP/xsum;;PaLM 2;model;;;42.0;data-based;https://arxiv.org/abs/2305.10403;13
566
+ csebuetnlp/xlsum;;PaLM 2;model;;;46.9;data-based;https://arxiv.org/abs/2305.10403;13
567
+ wiki_lingua;;PaLM 2;model;;;9.0;data-based;https://arxiv.org/abs/2305.10403;13
568
+
569
+ winograd_wsc;;PaLM;model;;;38.5;data-based;https://arxiv.org/abs/2204.02311;13
570
+ rmanluo/RoG-webqsp;;PaLM;model;;;26.7;data-based;https://arxiv.org/abs/2204.02311;13
571
+ super_glue;wsc;PaLM;model;;;36.8;data-based;https://arxiv.org/abs/2204.02311;13
572
+ mandarjoshi/trivia_qa;;PaLM;model;;19.9;;data-based;https://arxiv.org/abs/2204.02311;13
573
+ rajpurkar/squad_v2;;PaLM;model;;85.2;;data-based;https://arxiv.org/abs/2204.02311;13
574
+ super_glue;record;PaLM;model;;43.4;;data-based;https://arxiv.org/abs/2204.02311;13
575
+ cimec/lambada;;PaLM;model;;;29.3;data-based;https://arxiv.org/abs/2204.02311;13
576
+ super_glue;cb;PaLM;model;;48.2;;data-based;https://arxiv.org/abs/2204.02311;13
577
+ ibragim-bad/arc_easy;;PaLM;model;;;30.4;data-based;https://arxiv.org/abs/2204.02311;13
578
+ ibragim-bad/arc_challenge;;PaLM;model;;;24.7;data-based;https://arxiv.org/abs/2204.02311;13
579
+
580
+ winograd_wsc;;GLaM;model;;67.3;;data-based;https://arxiv.org/abs/2112.06905;13
581
+ winogrande;;GLaM;model;;;0.3;data-based;https://arxiv.org/abs/2112.06905;13
582
+ super_glue;wic;GLaM;model;;8.2;;data-based;https://arxiv.org/abs/2112.06905;13
583
+ super_glue;wsc;GLaM;model;;57.5;;data-based;https://arxiv.org/abs/2112.06905;13
584
+ mandarjoshi/trivia_qa;;GLaM;model;;18.8;;data-based;https://arxiv.org/abs/2112.06905;13
585
+ story_cloze;;GLaM;model;;100.0;;data-based;https://arxiv.org/abs/2112.06905;13
586
+ rajpurkar/squad_v2;;GLaM;model;;94.6;;data-based;https://arxiv.org/abs/2112.06905;13
587
+ super_glue;record;GLaM;model;;98.6;;data-based;https://arxiv.org/abs/2112.06905;13
588
+ super_glue;rte;GLaM;model;;54.9;;data-based;https://arxiv.org/abs/2112.06905;13
589
+ race;middle;GLaM;model;;58.4;;data-based;https://arxiv.org/abs/2112.06905;13
590
+ race;high;GLaM;model;;74.0;;data-based;https://arxiv.org/abs/2112.06905;13
591
+ quac;;GLaM;model;;99.9;;data-based;https://arxiv.org/abs/2112.06905;13
592
+ piqa;;GLaM;model;;49.8;;data-based;https://arxiv.org/abs/2112.06905;13
593
+ allenai/openbookqa;;GLaM;model;;20.0;;data-based;https://arxiv.org/abs/2112.06905;13
594
+ natural_questions;;GLaM;model;;3.9;;data-based;https://arxiv.org/abs/2112.06905;13
595
+ super_glue;multirc;GLaM;model;;68.8;;data-based;https://arxiv.org/abs/2112.06905;13
596
+ cimec/lambada;;GLaM;model;;;21.8;data-based;https://arxiv.org/abs/2112.06905;13
597
+ Rowan/hellaswag;;GLaM;model;;19.8;;data-based;https://arxiv.org/abs/2112.06905;13
598
+ stanfordnlp/coqa;;GLaM;model;;;75.0;data-based;https://arxiv.org/abs/2112.06905;13
599
+ super_glue;copa;GLaM;model;;3.0;;data-based;https://arxiv.org/abs/2112.06905;13
600
+ super_glue;cb;GLaM;model;;26.8;;data-based;https://arxiv.org/abs/2112.06905;13
601
+ super_glue;boolq;GLaM;model;;92.1;;data-based;https://arxiv.org/abs/2112.06905;13
602
+ ibragim-bad/arc_easy;;GLaM;model;;32.5;;data-based;https://arxiv.org/abs/2112.06905;13
603
+ ibragim-bad/arc_challenge;;GLaM;model;;31.8;;data-based;https://arxiv.org/abs/2112.06905;13
604
+ facebook/anli;dev_r3;GLaM;model;;40.7;;data-based;https://arxiv.org/abs/2112.06905;13
605
+ facebook/anli;dev_r2;GLaM;model;;96.8;;data-based;https://arxiv.org/abs/2112.06905;13
606
+ facebook/anli;dev_r1;GLaM;model;;96.2;;data-based;https://arxiv.org/abs/2112.06905;13
607
+
608
+ winogrande;;FLAN;model;;;0.2;data-based;https://arxiv.org/abs/2109.01652;13
609
+ mandarjoshi/trivia_qa;;FLAN;model;;22.8;;data-based;https://arxiv.org/abs/2109.01652;13
610
+ story_cloze;;FLAN;model;;0.4;;data-based;https://arxiv.org/abs/2109.01652;13
611
+ rajpurkar/squad_v2;;FLAN;model;;99.1;;data-based;https://arxiv.org/abs/2109.01652;13
612
+ wmt/wmt16;ro-en;FLAN;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
613
+ super_glue;record;FLAN;model;;68.0;;data-based;https://arxiv.org/abs/2109.01652;13
614
+ super_glue;rte;FLAN;model;;33.9;;data-based;https://arxiv.org/abs/2109.01652;13
615
+ piqa;;FLAN;model;;51.3;;data-based;https://arxiv.org/abs/2109.01652;13
616
+ allenai/openbookqa;;FLAN;model;;15.0;;data-based;https://arxiv.org/abs/2109.01652;13
617
+ natural_questions;;FLAN;model;;3.2;;data-based;https://arxiv.org/abs/2109.01652;13
618
+ super_glue;multirc;FLAN;model;;59.3;;data-based;https://arxiv.org/abs/2109.01652;13
619
+ Rowan/hellaswag;;FLAN;model;;34.5;;data-based;https://arxiv.org/abs/2109.01652;13
620
+ wmt/wmt16;fr-en;FLAN;model;;;25.3;data-based;https://arxiv.org/abs/2109.01652;13
621
+ wmt/wmt16;en-ro;FLAN;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
622
+ wmt/wmt16;en-fr;FLAN;model;;;25.3;data-based;https://arxiv.org/abs/2109.01652;13
623
+ wmt/wmt16;en-de;FLAN;model;;;14.3;data-based;https://arxiv.org/abs/2109.01652;13
624
+ wmt/wmt16;de-en;FLAN;model;;;14.3;data-based;https://arxiv.org/abs/2109.01652;13
625
+ ucinlp/drop;;FLAN;model;;99.4;;data-based;https://arxiv.org/abs/2109.01652;13
626
+ super_glue;copa;FLAN;model;;9.0;;data-based;https://arxiv.org/abs/2109.01652;13
627
+ super_glue;cb;FLAN;model;;5.4;;data-based;https://arxiv.org/abs/2109.01652;13
628
+ super_glue;boolq;FLAN;model;;23.1;;data-based;https://arxiv.org/abs/2109.01652;13
629
+ ibragim-bad/arc_easy;;FLAN;model;;20.2;;data-based;https://arxiv.org/abs/2109.01652;13
630
+ ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/2109.01652;13
631
+ facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
632
+ facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
633
+ facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13