Add data from "Documenting Large Webtext Corpora: A Case Study on the Colossal Clean Crawled Corpus" (#6)
Browse files- Add data from "Documenting Large Webtext Corpora: A Case Study on the Colossal Clean Crawled Corpus" (ad06fdcd383dcbf2bd4b407332cdd6e7440c47af)
Co-authored-by: Vishaal Udandarao <[email protected]>
- contamination_report.csv +22 -0
contamination_report.csv
CHANGED
@@ -1,5 +1,27 @@
|
|
1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
UCLNLP/adversarial_qa;adversarialQA;allenai/c4;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
|
5 |
UCLNLP/adversarial_qa;adversarialQA;oscar-corpus/OSCAR-2301;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
|
|
|
1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
2 |
|
3 |
+
lama;T-REx;allenai/c4;corpus;;;4.6;data-based;https://arxiv.org/abs/2104.08758;6
|
4 |
+
lama;Google-RE;allenai/c4;corpus;;;5.7;data-based;https://arxiv.org/abs/2104.08758;6
|
5 |
+
EdinburghNLP/xsum;;allenai/c4;corpus;;;15.49;data-based;https://arxiv.org/abs/2104.08758;6
|
6 |
+
reddit_tifu;short;allenai/c4;corpus;;;24.88;data-based;https://arxiv.org/abs/2104.08758;6
|
7 |
+
reddit_tifu;long;allenai/c4;corpus;;;1.87;data-based;https://arxiv.org/abs/2104.08758;6
|
8 |
+
wiki_bio;;allenai/c4;corpus;;;3.72;data-based;https://arxiv.org/abs/2104.08758;6
|
9 |
+
AMR-to-Text;;allenai/c4;corpus;;;10.43;data-based;https://arxiv.org/abs/2104.08758;6
|
10 |
+
nyu-mll/glue;BoolQ;allenai/c4;corpus;;;2.4;data-based;https://arxiv.org/abs/2104.08758;6
|
11 |
+
nyu-mll/glue;CoLA;allenai/c4;corpus;;;14.4;data-based;https://arxiv.org/abs/2104.08758;6
|
12 |
+
nyu-mll/glue;MNLI-hypothesis;allenai/c4;corpus;;;14.2;data-based;https://arxiv.org/abs/2104.08758;6
|
13 |
+
nyu-mll/glue;MNLI-premise;allenai/c4;corpus;;;15.2;data-based;https://arxiv.org/abs/2104.08758;6
|
14 |
+
nyu-mll/glue;MRPC-sentence-1;allenai/c4;corpus;;;2.7;data-based;https://arxiv.org/abs/2104.08758;6
|
15 |
+
nyu-mll/glue;MRPC-sentence-2;allenai/c4;corpus;;;2.7;data-based;https://arxiv.org/abs/2104.08758;6
|
16 |
+
nyu-mll/glue;QNLI-sentence;allenai/c4;corpus;;;53.6;data-based;https://arxiv.org/abs/2104.08758;6
|
17 |
+
nyu-mll/glue;QNLI-question;allenai/c4;corpus;;;1.8;data-based;https://arxiv.org/abs/2104.08758;6
|
18 |
+
nyu-mll/glue;RTE-sentence-1;allenai/c4;corpus;;;6.0;data-based;https://arxiv.org/abs/2104.08758;6
|
19 |
+
nyu-mll/glue;RTE-sentence-2;allenai/c4;corpus;;;10.8;data-based;https://arxiv.org/abs/2104.08758;6
|
20 |
+
nyu-mll/glue;SST-2;allenai/c4;corpus;;;11.0;data-based;https://arxiv.org/abs/2104.08758;6
|
21 |
+
nyu-mll/glue;STS-B-sentence-1;allenai/c4;corpus;;;18.3;data-based;https://arxiv.org/abs/2104.08758;6
|
22 |
+
nyu-mll/glue;STS-B-sentence-2;allenai/c4;corpus;;;18.6;data-based;https://arxiv.org/abs/2104.08758;6
|
23 |
+
nyu-mll/glue;WNLI-sentence-1;allenai/c4;corpus;;;4.8;data-based;https://arxiv.org/abs/2104.08758;6
|
24 |
+
nyu-mll/glue;WNLI-sentence-2;allenai/c4;corpus;;;2.1;data-based;https://arxiv.org/abs/2104.08758;6
|
25 |
|
26 |
UCLNLP/adversarial_qa;adversarialQA;allenai/c4;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
|
27 |
UCLNLP/adversarial_qa;adversarialQA;oscar-corpus/OSCAR-2301;corpus;;;0.03;data-based;https://arxiv.org/abs/2310.20707;2
|