Added Contamination Evidence on MMLU of ChatGPT/GPT4 from "Investigating data contamination in modern benchmarks for large language models"
#10
by
AmeyaPrabhu
- opened
- .gitignore +2 -1
- app.py +2 -2
- contamination_report.csv +175 -17
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
*.pyc
|
2 |
-
*.json
|
|
|
|
1 |
*.pyc
|
2 |
+
*.json
|
3 |
+
*.lock
|
app.py
CHANGED
@@ -22,11 +22,11 @@ def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
|
|
22 |
"""
|
23 |
if isinstance(eval_dataset, str):
|
24 |
dataframe = dataframe[
|
25 |
-
dataframe["Evaluation Dataset"].str.contains(eval_dataset)
|
26 |
]
|
27 |
if isinstance(cont_source, str):
|
28 |
dataframe = dataframe[
|
29 |
-
dataframe["Contaminated Source"].str.contains(cont_source)
|
30 |
]
|
31 |
if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes:
|
32 |
dataframe = dataframe[dataframe["Approach"] != "model-based"]
|
|
|
22 |
"""
|
23 |
if isinstance(eval_dataset, str):
|
24 |
dataframe = dataframe[
|
25 |
+
dataframe["Evaluation Dataset"].str.contains(f"(?i){eval_dataset}")
|
26 |
]
|
27 |
if isinstance(cont_source, str):
|
28 |
dataframe = dataframe[
|
29 |
+
dataframe["Contaminated Source"].str.contains(f"(?i){cont_source}")
|
30 |
]
|
31 |
if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes:
|
32 |
dataframe = dataframe[dataframe["Approach"] != "model-based"]
|
contamination_report.csv
CHANGED
@@ -1,5 +1,22 @@
|
|
1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
|
5 |
tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
|
@@ -436,30 +453,171 @@ zest;;EleutherAI/pile;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
|
436 |
zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
437 |
|
438 |
|
439 |
-
imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/
|
440 |
-
imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
441 |
|
442 |
-
ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/
|
443 |
-
ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
444 |
|
445 |
-
yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/
|
446 |
-
yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
447 |
|
448 |
-
nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/
|
449 |
-
nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
450 |
|
451 |
-
nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/
|
452 |
-
nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
453 |
|
454 |
-
samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/
|
455 |
-
samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
456 |
|
457 |
-
EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/
|
458 |
-
EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/
|
459 |
|
460 |
-
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/
|
461 |
-
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/
|
462 |
|
463 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
464 |
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
2 |
|
3 |
+
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
4 |
+
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
5 |
+
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
6 |
+
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
7 |
+
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
8 |
+
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|
9 |
+
ag_news;;GPT-3.5;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
10 |
+
yelp_review_full;;GPT-4;model;;;80.00;model-based;https://arxiv.org/abs/2311.06233;8
|
11 |
+
yelp_review_full;;GPT-3.5;model;;;13.00;model-based;https://arxiv.org/abs/2311.06233;8
|
12 |
+
nyu-mll/glue;rte;GPT-4;model;;60.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
13 |
+
nyu-mll/glue;rte;GPT-3.5;model;;71.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
14 |
+
nyu-mll/glue;wnli;GPT-4;model;;50.70;;model-based;https://arxiv.org/abs/2311.06233;8
|
15 |
+
nyu-mll/glue;wnli;GPT-3.5;model;;12.68;;model-based;https://arxiv.org/abs/2311.06233;8
|
16 |
+
samsum;;GPT-4;model;;;77.00;model-based;https://arxiv.org/abs/2311.06233;8
|
17 |
+
samsum;;GPT-3.5;model;;;74.00;model-based;https://arxiv.org/abs/2311.06233;8
|
18 |
+
EdinburghNLP/xsum;;GPT-4;model;;;95.00;model-based;https://arxiv.org/abs/2311.06233;8
|
19 |
+
EdinburghNLP/xsum;;GPT-3.5;model;;;79.00;model-based;https://arxiv.org/abs/2311.06233;8
|
20 |
|
21 |
allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
|
22 |
tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
|
|
|
453 |
zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
454 |
|
455 |
|
456 |
+
imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
457 |
+
imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
458 |
|
459 |
+
ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
460 |
+
ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
461 |
|
462 |
+
yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
463 |
+
yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
464 |
|
465 |
+
nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
466 |
+
nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
467 |
|
468 |
+
nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
469 |
+
nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
470 |
|
471 |
+
samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
472 |
+
samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
473 |
|
474 |
+
EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
475 |
+
EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
476 |
|
477 |
+
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
478 |
+
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
479 |
|
480 |
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
481 |
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
|
482 |
+
|
483 |
+
|
484 |
+
cais/mmlu;;GPT-3.5;model;;;52.0;model-based;https://arxiv.org/abs/2311.09783;10
|
485 |
+
winogrande;;GPT-3.5;model;;;9.0;model-based;https://arxiv.org/abs/2311.09783;10
|
486 |
+
truthful_qa;;GPT-3.5;model;;;12.0;model-based;https://arxiv.org/abs/2311.09783;10
|
487 |
+
allenai/openbookqa;;GPT-3.5;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
488 |
+
|
489 |
+
cais/mmlu;;GPT-4;model;;;57.0;model-based;https://arxiv.org/abs/2311.09783;10
|
490 |
+
truthful_qa;;GPT-4;model;;;10.0;model-based;https://arxiv.org/abs/2311.09783;10
|
491 |
+
winogrande;;GPT-4;model;;;12.0;model-based;https://arxiv.org/abs/2311.09783;10
|
492 |
+
allenai/openbookqa;;GPT-4;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
493 |
+
Rowan/hellaswag;;GPT-4;model;;;2.0;model-based;https://arxiv.org/abs/2311.09783;10
|
494 |
+
|
495 |
+
|
496 |
+
allenai/openbookqa;;LLaMa 2-13B;model;;;4.0;model-based;https://arxiv.org/abs/2311.09783;10
|
497 |
+
truthful_qa;;LLaMa 2-13B;model;;;2.0;model-based;https://arxiv.org/abs/2311.09783;10
|
498 |
+
winogrande;;LLaMa 2-13B;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
499 |
+
|
500 |
+
truthful_qa;;Mistral-7B;model;;;15.0;model-based;https://arxiv.org/abs/2311.09783;10
|
501 |
+
allenai/openbookqa;;Mistral-7B;model;;;10.0;model-based;https://arxiv.org/abs/2311.09783;10
|
502 |
+
winogrande;;Mistral-7B;model;;;3.0;model-based;https://arxiv.org/abs/2311.09783;10
|
503 |
+
cais/mmlu;;Mistral-7B;model;;;1.0;model-based;https://arxiv.org/abs/2311.09783;10
|
504 |
+
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
505 |
+
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
506 |
+
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
507 |
+
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
508 |
+
|
509 |
+
|
510 |
+
openai_humaneval;;EleutherAI/pile;corpus;;;12.2;data-based;https://arxiv.org/abs/2403.04811;12
|
511 |
+
mbpp;;EleutherAI/pile;corpus;;;3.6;data-based;https://arxiv.org/abs/2403.04811;12
|
512 |
+
openai_humaneval;;bigcode/the-stack;corpus;;;18.9;data-based;https://arxiv.org/abs/2403.04811;12
|
513 |
+
mbpp;;bigcode/the-stack;corpus;;;20.8;data-based;https://arxiv.org/abs/2403.04811;12
|
514 |
+
|
515 |
+
quac;;GPT-3;model;;99.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
516 |
+
rajpurkar/squad_v2;;GPT-3;model;;94.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
517 |
+
ucinlp/drop;;GPT-3;model;;93.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
518 |
+
Symbol Insertion;;GPT-3;model;;86.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
519 |
+
stanfordnlp/coqa;;GPT-3;model;;64.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
520 |
+
super_glue;record;GPT-3;model;;61.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
521 |
+
winograd_wsc;;GPT-3;model;;;60.0;data-based;https://arxiv.org/abs/2005.14165;13
|
522 |
+
super_glue;boolq;GPT-3;model;;60.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
523 |
+
super_glue;multirc;GPT-3;model;;59.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
524 |
+
race;high;GPT-3;model;;;45.0;data-based;https://arxiv.org/abs/2005.14165;13
|
525 |
+
cimec/lambada;;GPT-3;model;;;43.0;data-based;https://arxiv.org/abs/2005.14165;13
|
526 |
+
super_glue;wsc;GPT-3;model;;40.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
527 |
+
piqa;;GPT-3;model;;29.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
528 |
+
wmt/wmt16;en-de;GPT-3;model;;;25.0;data-based;https://arxiv.org/abs/2005.14165;13
|
529 |
+
wmt/wmt16;de-en;GPT-3;model;;;25.0;data-based;https://arxiv.org/abs/2005.14165;13
|
530 |
+
race;middle;GPT-3;model;;;25.0;data-based;https://arxiv.org/abs/2005.14165;13
|
531 |
+
rmanluo/RoG-webqsp;;GPT-3;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
|
532 |
+
wmt/wmt16;en-ro;GPT-3;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
|
533 |
+
wmt/wmt16;ro-en;GPT-3;model;;;21.0;data-based;https://arxiv.org/abs/2005.14165;13
|
534 |
+
facebook/anli;test_r1;GPT-3;model;;;20.0;data-based;https://arxiv.org/abs/2005.14165;13
|
535 |
+
facebook/anli;test_r2;GPT-3;model;;;18.0;data-based;https://arxiv.org/abs/2005.14165;13
|
536 |
+
mandarjoshi/trivia_qa;;GPT-3;model;;17.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
537 |
+
facebook/anli;test_r3;GPT-3;model;;;16.0;data-based;https://arxiv.org/abs/2005.14165;13
|
538 |
+
wmt/wmt16;fr-en;GPT-3;model;;;14.0;data-based;https://arxiv.org/abs/2005.14165;13
|
539 |
+
wmt/wmt16;en-fr;GPT-3;model;;;14.0;data-based;https://arxiv.org/abs/2005.14165;13
|
540 |
+
super_glue;rte;GPT-3;model;;8.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
541 |
+
super_glue;wic;GPT-3;model;;8.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
542 |
+
super_glue;cb;GPT-3;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
543 |
+
Reversed Words;;GPT-3;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
544 |
+
Anagrams 2;;GPT-3;model;;7.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
545 |
+
allenai/openbookqa;;GPT-3;model;;;6.0;data-based;https://arxiv.org/abs/2005.14165;13
|
546 |
+
ibragim-bad/arc_easy;;GPT-3;model;;;4.0;data-based;https://arxiv.org/abs/2005.14165;13
|
547 |
+
Anagrams 1;;GPT-3;model;;3.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
548 |
+
ibragim-bad/arc_challenge;;GPT-3;model;;;3.0;data-based;https://arxiv.org/abs/2005.14165;13
|
549 |
+
super_glue;copa;GPT-3;model;;3.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
550 |
+
Rowan/hellaswag;;GPT-3;model;;2.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
551 |
+
natural_questions;;GPT-3;model;;;1.0;data-based;https://arxiv.org/abs/2005.14165;13
|
552 |
+
Cycled Letters;;GPT-3;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
553 |
+
SAT Analogies;;GPT-3;model;;1.0;;data-based;https://arxiv.org/abs/2005.14165;13
|
554 |
+
|
555 |
+
EdinburghNLP/xsum;;PaLM 2;model;;;42.0;data-based;https://arxiv.org/abs/2305.10403;13
|
556 |
+
csebuetnlp/xlsum;;PaLM 2;model;;;46.9;data-based;https://arxiv.org/abs/2305.10403;13
|
557 |
+
wiki_lingua;;PaLM 2;model;;;9.0;data-based;https://arxiv.org/abs/2305.10403;13
|
558 |
+
|
559 |
+
winograd_wsc;;PaLM;model;;;38.5;data-based;https://arxiv.org/abs/2204.02311;13
|
560 |
+
rmanluo/RoG-webqsp;;PaLM;model;;;26.7;data-based;https://arxiv.org/abs/2204.02311;13
|
561 |
+
super_glue;wsc;PaLM;model;;;36.8;data-based;https://arxiv.org/abs/2204.02311;13
|
562 |
+
mandarjoshi/trivia_qa;;PaLM;model;;19.9;;data-based;https://arxiv.org/abs/2204.02311;13
|
563 |
+
rajpurkar/squad_v2;;PaLM;model;;85.2;;data-based;https://arxiv.org/abs/2204.02311;13
|
564 |
+
super_glue;record;PaLM;model;;43.4;;data-based;https://arxiv.org/abs/2204.02311;13
|
565 |
+
cimec/lambada;;PaLM;model;;;29.3;data-based;https://arxiv.org/abs/2204.02311;13
|
566 |
+
super_glue;cb;PaLM;model;;48.2;;data-based;https://arxiv.org/abs/2204.02311;13
|
567 |
+
ibragim-bad/arc_easy;;PaLM;model;;;30.4;data-based;https://arxiv.org/abs/2204.02311;13
|
568 |
+
ibragim-bad/arc_challenge;;PaLM;model;;;24.7;data-based;https://arxiv.org/abs/2204.02311;13
|
569 |
+
|
570 |
+
winograd_wsc;;GLaM;model;;67.3;;data-based;https://arxiv.org/abs/2112.06905;13
|
571 |
+
winogrande;;GLaM;model;;;0.3;data-based;https://arxiv.org/abs/2112.06905;13
|
572 |
+
super_glue;wic;GLaM;model;;8.2;;data-based;https://arxiv.org/abs/2112.06905;13
|
573 |
+
super_glue;wsc;GLaM;model;;57.5;;data-based;https://arxiv.org/abs/2112.06905;13
|
574 |
+
mandarjoshi/trivia_qa;;GLaM;model;;18.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
575 |
+
story_cloze;;GLaM;model;;100.0;;data-based;https://arxiv.org/abs/2112.06905;13
|
576 |
+
rajpurkar/squad_v2;;GLaM;model;;94.6;;data-based;https://arxiv.org/abs/2112.06905;13
|
577 |
+
super_glue;record;GLaM;model;;98.6;;data-based;https://arxiv.org/abs/2112.06905;13
|
578 |
+
super_glue;rte;GLaM;model;;54.9;;data-based;https://arxiv.org/abs/2112.06905;13
|
579 |
+
race;middle;GLaM;model;;58.4;;data-based;https://arxiv.org/abs/2112.06905;13
|
580 |
+
race;high;GLaM;model;;74.0;;data-based;https://arxiv.org/abs/2112.06905;13
|
581 |
+
quac;;GLaM;model;;99.9;;data-based;https://arxiv.org/abs/2112.06905;13
|
582 |
+
piqa;;GLaM;model;;49.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
583 |
+
allenai/openbookqa;;GLaM;model;;20.0;;data-based;https://arxiv.org/abs/2112.06905;13
|
584 |
+
natural_questions;;GLaM;model;;3.9;;data-based;https://arxiv.org/abs/2112.06905;13
|
585 |
+
super_glue;multirc;GLaM;model;;68.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
586 |
+
cimec/lambada;;GLaM;model;;;21.8;data-based;https://arxiv.org/abs/2112.06905;13
|
587 |
+
Rowan/hellaswag;;GLaM;model;;19.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
588 |
+
stanfordnlp/coqa;;GLaM;model;;;75.0;data-based;https://arxiv.org/abs/2112.06905;13
|
589 |
+
super_glue;copa;GLaM;model;;3.0;;data-based;https://arxiv.org/abs/2112.06905;13
|
590 |
+
super_glue;cb;GLaM;model;;26.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
591 |
+
super_glue;boolq;GLaM;model;;92.1;;data-based;https://arxiv.org/abs/2112.06905;13
|
592 |
+
ibragim-bad/arc_easy;;GLaM;model;;32.5;;data-based;https://arxiv.org/abs/2112.06905;13
|
593 |
+
ibragim-bad/arc_challenge;;GLaM;model;;31.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
594 |
+
facebook/anli;dev_r3;GLaM;model;;40.7;;data-based;https://arxiv.org/abs/2112.06905;13
|
595 |
+
facebook/anli;dev_r2;GLaM;model;;96.8;;data-based;https://arxiv.org/abs/2112.06905;13
|
596 |
+
facebook/anli;dev_r1;GLaM;model;;96.2;;data-based;https://arxiv.org/abs/2112.06905;13
|
597 |
+
|
598 |
+
winogrande;;FLAN;model;;;0.2;data-based;https://arxiv.org/abs/2109.01652;13
|
599 |
+
mandarjoshi/trivia_qa;;FLAN;model;;22.8;;data-based;https://arxiv.org/abs/2109.01652;13
|
600 |
+
story_cloze;;FLAN;model;;0.4;;data-based;https://arxiv.org/abs/2109.01652;13
|
601 |
+
rajpurkar/squad_v2;;FLAN;model;;99.1;;data-based;https://arxiv.org/abs/2109.01652;13
|
602 |
+
wmt/wmt16;ro-en;FLAN;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
|
603 |
+
super_glue;record;FLAN;model;;68.0;;data-based;https://arxiv.org/abs/2109.01652;13
|
604 |
+
super_glue;rte;FLAN;model;;33.9;;data-based;https://arxiv.org/abs/2109.01652;13
|
605 |
+
piqa;;FLAN;model;;51.3;;data-based;https://arxiv.org/abs/2109.01652;13
|
606 |
+
allenai/openbookqa;;FLAN;model;;15.0;;data-based;https://arxiv.org/abs/2109.01652;13
|
607 |
+
natural_questions;;FLAN;model;;3.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
608 |
+
super_glue;multirc;FLAN;model;;59.3;;data-based;https://arxiv.org/abs/2109.01652;13
|
609 |
+
Rowan/hellaswag;;FLAN;model;;34.5;;data-based;https://arxiv.org/abs/2109.01652;13
|
610 |
+
wmt/wmt16;fr-en;FLAN;model;;;25.3;data-based;https://arxiv.org/abs/2109.01652;13
|
611 |
+
wmt/wmt16;en-ro;FLAN;model;;;12.4;data-based;https://arxiv.org/abs/2109.01652;13
|
612 |
+
wmt/wmt16;en-fr;FLAN;model;;;25.3;data-based;https://arxiv.org/abs/2109.01652;13
|
613 |
+
wmt/wmt16;en-de;FLAN;model;;;14.3;data-based;https://arxiv.org/abs/2109.01652;13
|
614 |
+
wmt/wmt16;de-en;FLAN;model;;;14.3;data-based;https://arxiv.org/abs/2109.01652;13
|
615 |
+
ucinlp/drop;;FLAN;model;;99.4;;data-based;https://arxiv.org/abs/2109.01652;13
|
616 |
+
super_glue;copa;FLAN;model;;9.0;;data-based;https://arxiv.org/abs/2109.01652;13
|
617 |
+
super_glue;cb;FLAN;model;;5.4;;data-based;https://arxiv.org/abs/2109.01652;13
|
618 |
+
super_glue;boolq;FLAN;model;;23.1;;data-based;https://arxiv.org/abs/2109.01652;13
|
619 |
+
ibragim-bad/arc_easy;;FLAN;model;;20.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
620 |
+
ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/2109.01652;13
|
621 |
+
facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
|
622 |
+
facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
|
623 |
+
facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13
|