File size: 20,435 Bytes
e137e27
 
4a437aa
 
 
 
 
 
 
 
c31df73
 
 
5637ee4
 
 
 
7e67bfe
b46a153
e137e27
ccd1474
c40ec7e
 
 
ccd1474
c40ec7e
 
 
 
 
 
 
 
 
 
 
 
 
ccd1474
c40ec7e
 
 
 
 
 
 
 
 
 
 
 
 
 
ccd1474
dc5ac06
4a437aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc5ac06
 
c40ec7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc5ac06
c40ec7e
 
 
ccd1474
c40ec7e
 
 
 
 
 
 
 
 
d95e4d8
c40ec7e
 
 
 
 
 
 
 
d95e4d8
ccd1474
c31df73
c40ec7e
 
 
 
 
 
 
 
 
 
 
 
 
e137e27
c31df73
 
 
 
0f14580
 
 
 
 
 
 
11009b9
 
 
 
 
 
752d87b
 
 
44f2e3d
5236603
 
 
 
44f2e3d
c40ec7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44f2e3d
 
 
 
c31df73
c40ec7e
 
 
 
 
c31df73
c40ec7e
 
 
 
 
c790d40
c40ec7e
 
 
 
 
c535408
 
c40ec7e
c31df73
c40ec7e
 
 
c31df73
c40ec7e
 
 
 
 
 
 
4a437aa
 
 
0cc9c9f
c40ec7e
 
 
 
 
c31df73
c40ec7e
 
 
 
 
 
 
 
c535408
c40ec7e
c31df73
c535408
c40ec7e
 
c31df73
c40ec7e
c535408
c40ec7e
c31df73
c535408
c40ec7e
 
 
 
 
c31df73
c40ec7e
 
c535408
c40ec7e
c31df73
c535408
c40ec7e
 
 
 
 
 
 
 
c31df73
4a437aa
c40ec7e
 
 
 
c535408
c40ec7e
c31df73
c535408
dcb73ca
c40ec7e
 
 
 
 
 
 
 
c31df73
c40ec7e
c535408
c40ec7e
c31df73
c40ec7e
 
 
 
0cc9c9f
c40ec7e
 
 
0cc9c9f
c40ec7e
 
c31df73
0cc9c9f
c40ec7e
c31df73
c40ec7e
 
 
 
c31df73
c40ec7e
 
c31df73
c40ec7e
 
 
c31df73
c40ec7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c535408
c40ec7e
c31df73
c40ec7e
 
 
 
c31df73
c40ec7e
c790d40
c40ec7e
 
 
c790d40
c40ec7e
 
 
 
 
 
c535408
c40ec7e
 
 
 
 
 
c31df73
e137e27
c31df73
 
c40ec7e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import (
    D_title,
    D_article,
    D_front_matter,
    D_contents,
    D_byline,
    D_cite,
)
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
import random
import string
from rich import print
import jsonlines
import plotly.express as px
from fasthtml.components import D_code


def dup_cluster_graph():
    """Build a log-scale bar chart of near-duplicate cluster sizes.

    Loads pre-computed bucket counts from disk and renders them as a
    transparent-background Plotly bar figure for embedding in the page.
    """
    # NOTE(review): assumes the JSON maps bucket label -> cluster count — confirm.
    with open("data/cluster_sizes_100.json", "r") as fh:
        bucket_counts = json.load(fh)

    frame = pd.DataFrame(
        bucket_counts.items(), columns=["cluster_size_range", "counts"]
    )

    figure = px.bar(
        frame,
        x="cluster_size_range",
        y="counts",
        log_y=True,
        labels={
            "cluster_size_range": "Size of Near-Duplicate Clusters (Document Count)",
            "counts": "Number of Clusters",
        },
        color_discrete_sequence=["#636EFA"],
    )

    # Drop the default theme/background so the chart blends into the page.
    figure.update_layout(
        template="none",
        showlegend=False,
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
    )

    # Abbreviate large tick values (e.g. 200000 -> "0.2m").
    figure.update_xaxes(
        tickmode="array",
        tickvals=[0, 200000, 400000, 600000, 800000],
        ticktext=["0", "0.2m", "0.4m", "0.6m", "0.8m"],
    )

    return figure


def dedup_pairs_bands():
    """Render the Bloom-filter / LSH-band partitioning example as an HTML table.

    Each "Band" column holds candidate duplicate pairs; the "Bloom Filter"
    column shows which horizontal partition (filter) processes each row.
    """
    dots = "..."
    columns = {
        "Bloom Filter": ["BF 0", "", "", "", "BF 1", "", "BF 8"],
        "Band 0": ["(A,B)", "(C,D)", "(E,K)", "(B,K)", dots, dots, dots],
        "Band 1": ["(A,B)", "(C,D)", "(F,K)", "(B,K)", dots, dots, dots],
        "....": [dots] * 7,
        "Band 8": ["(A,B)", "(C,D)", "(D,E)", "(E,K)", "(B,K)", dots, dots],
    }
    return pd.DataFrame(columns).to_html(index=False, border=0)


def dup_docs_count_graph():
    """Plot the number of duplicate documents found per CommonCrawl dump.

    The mapping below is a static snapshot produced offline by the
    deduplication pipeline: keys are two-digit CC dump indices and values
    are counts of duplicate documents attributed to that dump.
    """
    # NOTE(review): hard-coded results — regenerate if the dedup pipeline is re-run.
    dup_docs_count = {
        "80": 382164413,
        "90": 660766607,
        "94": 2004544307,
        "49": 1249363963,
        "96": 6378899946,
        "91": 796400555,
        "13": 170737436,
        "34": 390565222,
        "37": 243097281,
        "78": 333786871,
        "40": 331019592,
        "47": 633983288,
        "74": 443143441,
        "12": 115630971,
        "82": 491144800,
        "63": 335567006,
        "60": 361001039,
        "42": 369986102,
        "43": 344094214,
        "95": 3297371929,
        "56": 450449769,
        "58": 394889638,
        "48": 821491815,
        "18": 192658724,
        "86": 621122463,
        "50": 917219351,
        "83": 468165632,
        "38": 281883697,
        "51": 244891366,
        "93": 1236979939,
        "65": 396080116,
        "71": 403250107,
        "11": 101639319,
        "81": 367154215,
        "72": 458795954,
        "32": 218765954,
        "92": 943046601,
        "85": 507967375,
        "66": 279985567,
        "54": 291611429,
        "87": 657754973,
        "39": 296672084,
        "89": 747973994,
        "26": 179628225,
        "45": 441047510,
        "64": 319547728,
        "76": 337730046,
        "57": 415519600,
        "53": 346555733,
        "75": 319730996,
        "21": 239475626,
        "67": 277544884,
        "10": 102493868,
        "68": 348155455,
        "59": 344897755,
        "62": 326551051,
        "22": 223000489,
        "88": 722070344,
        "52": 295881819,
        "84": 613535675,
        "55": 487356947,
        "17": 226423150,
        "69": 349626770,
        "20": 163869592,
        "16": 452282480,
        "70": 390584359,
        "73": 394778904,
        "28": 197047765,
        "36": 230817595,
        "44": 618669127,
        "29": 180518021,
        "77": 429496570,
        "25": 140344588,
        "14": 212064682,
        "41": 428759750,
        "15": 147268059,
        "00": 136048949,
        "31": 325178167,
        "35": 213448884,
        "79": 394056890,
        "24": 359444850,
        "30": 178934263,
        "61": 336060420,
        "23": 378045294,
        "46": 417319179,
        "33": 239167872,
        "27": 111503187,
        "19": 125085842,
    }

    # Sort by dump id so bars appear in dump order on the x axis.
    dup_docs_count_for_graph = pd.DataFrame(
        sorted(dup_docs_count.items()), columns=["CC_dump", "num_duplicates"]
    )

    fig = px.bar(
        dup_docs_count_for_graph,
        x="CC_dump",
        y="num_duplicates",
        labels={
            "CC_dump": "CommonCrawl Dump",
            "num_duplicates": "Number of duplicates",
        },
    )

    # Transparent background so the chart blends into the page theme.
    fig.update_layout(
        template="none",
        showlegend=False,
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
    )

    return fig


# Before/after examples of Unicode NFC normalization, rendered as an HTML
# table for the "NFC Examples" section of the page.
# NOTE(review): the two "für" literals presumably differ at the byte level
# (decomposed vs composed umlaut) even though they render identically —
# confirm against the source bytes before editing these strings.
nfc_examples = pd.DataFrame(
    {
        "Original Text": [
            "für",
            "the problem \ ud83d \ ude42",
            "peptidoglycan`s",
        ],
        "NFC Text": [
            "für",
            "the problem 🙂",
            "peptidoglycan's",
        ],
    }
)

table_html_nfc = nfc_examples.to_html(index=False, border=0)
table_div_nfc_examples = Div(NotStr(table_html_nfc), style="margin: 40px;")


# Code snippets displayed verbatim inside D_code blocks on the page.
# The Dask expression used to group documents by MinHash signature and emit
# candidate duplicate pairs.
dask_algo = """
dask.bag.from_sequence(doc_file_paths)
.map_partitions(stream_docs)
.groupby(lambda doc: doc["hash"])
.map_partitions(make_doc_pairs)
.compute()
"""
# Raw strings below: these snippets contain regex escapes such as \. and \[
# which are invalid escape sequences in a normal string literal and raise
# SyntaxWarning on Python 3.12+. Raw-prefixing preserves the exact same
# runtime value while silencing the warning.
email_code = r"""
r"[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9]
(?:["r"A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25
[0-5]|2[0-4][0-9]|[&quot r&quot01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
|[A-Za-z0-9-]*[A-Za-z0-9]:)])
"""
ip_address_code = r"""
r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
"""

# The ftfy one-liner used to apply NFC normalization one row at a time.
nfc_code = """
ftfy.fix_text(text, normalization="NFC")
"""

# Example PII substitutions shown in the article: each detected value is
# replaced by a generic placeholder of the matching type.
pii_table = pd.DataFrame(
    {
        "PII Type": ["Email", "IP Address"],
        "Examples": ["[email protected]", "172.217.164.110"],
        "Target": ["[email protected]", "[22.214.171.124 , ...]"],
    }
)

table_html_pii = pii_table.to_html(index=False, border=0)
table_div_pii = Div(NotStr(table_html_pii), style="margin: 40px;")

# Full page body for the "common steps" chapter: one Section per topic
# (overview, dedup motivation, pipeline stages, PII removal, NFC), each with
# an id used for in-page navigation.
# Fixes in this revision (user-facing copy only): "filterings" -> "filtering",
# "Normailzation" -> "Normalization", "Identifable" -> "Identifiable",
# "caluclating" -> "calculating", "documented ID" -> "document ID", and two
# duplicated/stray words removed.
global_div = Div(
    Section(
        H2("Overview"),
        H3("What This Section Contains"),
        P(
            "This section discusses all details related to deduplication and filtering steps that were uniformly applied to all data. The section is split into the following topic areas: "
        ),
        Ul(
            Li("Motivation Behind Global Deduplication", style="margin-bottom: 5px"),
            Li(
                "TxT360 Deduplication Process and Implementation",
                style="margin-bottom: 5px",
            ),
            Li(
                "Personally Identifiable Information Removal",
                style="margin-bottom: 5px",
            ),
            Li("Normalization Form C Discussion", style="margin-bottom: 5px"),
        ),
        id="section1",
    ),
    Section(
        H2("Motivation Behind Global Deduplication"),
        P(
            "Deduplication is beneficial for LM pretraining in several ways, with the most important being controllable upsampling. With unique data, teams gain fine-grained control over the training data. Other benefits of deduplication include avoiding train-test overlap which prevents evaluation contamination."
        ),
        P(
            "Duplicate data can lead to a strong double descent phenomenon, where repeated data causes test loss to increase midway through training [2]. Additionally, it reduces the risk of memorization [1]. By implementing deduplication and selective upsampling, we gain control over the pretraining data distribution, rather than relying on the inherent distribution of the source."
        ),
        P(
            "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
        ),
        plotly2fasthtml(dup_cluster_graph()),
        P(
            "The example below is from one such cluster. Here most of the text is repeated with just specifics changed."
        ),
        Img(src="images/100k.png", style="max-width: 100%;"),
        P(
            "We started deduplication with 61.8 TB of filtered and compressed documents. The initial dataset had roughly 48.83 billion documents. First, we performed exact deduplication using a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001. This reduced the documents from 48.83 billion to 40.21 billion, removing about 17% as exact duplicates. This step used constant memory for the Bloom filter and lessened the workload for subsequent near-deduplication."
        ),
        P(
            "For the global near-deduplication, we employed a methodology used by prior works like SlimPajama [3] but scaled it to the entire dataset which includes 99 Common Crawl dumps (also called “crawls”) and the curated data. The near-deduplication process involved generating signatures for every document, matching these signatures to identify near-duplicates, and then clustering the near-duplicate documents to select all but one for deletion."
        ),
        P("We applied the following inclusion criteria for all documents:"),
        Ul(
            Li("Curated Document > Common Crawl Document", style="margin-bottom: 5px"),
            Li("Most Recent > Less Recent", style="margin-bottom: 5px"),
        ),
        P(
            "Additionally, we maintained statistics about each matching cluster as they were formed during the final stage of deduplication. Below are the details of all four stages of our deduplication pipeline. We use Dask extensively throughout all stages of the deduplication. We have included the size of results of each stage on disk to give an idea about the scale:"
        ),
        id="section2",
    ),
    Section(
        H3("MinHash Generation"),
        P(
            "We use the datasketch library to generate MinHash signatures with the number of permutations to 128. Each signature is represented as a MinHash object for each document. Before calculating the signature, the text is cleaned by stripping whitespace, converting to lowercase, and removing punctuation, consecutive spaces, newlines, and tabs. Next, a list of 13-grams is generated to use as features for creating a document signature. The globally-unique document IDs and signatures are then saved to disk. The document ID is designed by an encoding scheme which converts file names and line numbers (there is one document per line) to unique document IDs. This also helped a lot in saving disk and memory for this stage."
        ),
        P(B("This step produced 20 TB of hashes.")),
        id="section3",
    ),
    Section(
        H3("Matching Pairs Generation"),
        P(
            "We are using a Jaccard similarity threshold of 0.8 to identify near-duplicate documents. To do this, we divide the MinHashes into 9 bands, each with 13 hashes (also known as the range). To save memory during matching, we first store each band of MinHashes separately on disk. We then process each band individually. Within each band, documents are matched based on their hashes, and the matches are saved as document pairs. A document is considered a match if it matches another document in any of the 9 bands. Since we are looking for near-duplicates, a document may match multiple documents across different bands."
        ),
        P(
            "For partitioning and matching the hashes, we utilize Dask's bag data structure to load the document ids and MinHashes. The matching process is simply a group by operation on this bag data structure. This approach allows us to group matches efficiently and distribute the operation to multiple machines. The group by produces full components (documents that share the same signature) within a band which simplifies the later stages. The algorithm can be expressed using the Dask expression below:"
        ),
        D_code(dask_algo, block="block", language="python"),
        P(B("This step produced 9.2 TB of matching pairs from all bands.")),
        id="section4",
    ),
    Section(
        H3("Finding Duplicate Pairs"),
        P(
            "Multiple bands can create the same document pairs, leading to duplicates. The simplest way to eliminate these duplicate pairs is to call distinct() before the compute(). However, we found that Dask is not very efficient when it comes to distributed distinct execution. Additionally, since we process each band separately, this approach wouldn’t remove duplicates across different bands."
        ),
        P(
            "To address this, we use a Bloom filter with a capacity of 64 billion and a false positive rate of 0.001 to remove duplicates. We parallelize the Bloom filter execution by partitioning pairs horizontally and running one filter per partition, as shown in the table below. Note: this step was completed in ~5 days by parallelizing the Bloom filter versus ~25 days if the filter was serialized."
        ),
        P(
            "There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. Performing the Bloom filter step reduces the number of pairs by nearly ninefold."
        ),
        Div(NotStr(dedup_pairs_bands()), style="margin: 40px;"),
        P(
            "The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches."
        ),
        P(B("This step produced 1.9 TB of unique pairs.")),
        id="section5",
    ),
    Section(
        H3("Finding Connected Components using MapReduce"),
        Img(src="images/findcc.svg", style="max-width: 100%;"),
        P(
            "The purpose of this step is to create a set of clusters of matching pairs. For example, a list of pairs (A, B), (B, C), (D, E) is merged into a list of components (A, B, C) and (D, E). Using a third-party library like NetworkX to find connected components would require all pairs to fit into the memory of a single machine, which is not feasible. Instead, we implemented a distributed connected component finder [4] using the Dask framework, which can scale across multiple machines. The algorithm works by mapping edges by both the source and destination of pairs and reducing only edges where the source is greater than the destination. It performs successive iterations of this MapReduce computation until convergence, meaning the number of new edges produced becomes zero. In the end, every document in a cluster points to the smallest document within the cluster. Later, we compile a list of duplicate documents that need deletion and gather statistics about each component."
        ),
        P(
            "We needed to partition the duplicate pairs generated in the third stage into three groups to reduce memory pressure on the final stage. We observed that the second stage itself generates partial components which have some overlap. These overlapping clusters cause some documents to appear in the delete set multiple times. However, our deletion code handled this overlap."
        ),
        P(
            "Below is the distribution of duplicate documents found across different dumps of CommonCrawl. The distribution is skewed to the right because the documents are bucketed by the dump ID of the document we retain, and we prefer documents from higher dump IDs."
        ),
        plotly2fasthtml(dup_docs_count_graph()),
        id="section6",
    ),
    Section(
        H3("Analysis of Near-Duplicate Clusters"),
        P(
            "Smaller components tend to have more overlap in their MinHash bands. The smallest components are almost exact pairs but due to small differences, were not included in the local exact deduplication."
        ),
        Img(src="images/image3.png", style="max-width: 100%;"),
        P(
            "Changes in text are incremental from buckets of 3 or more documents onwards. The example below shows a personnel list that has grown over the years."
        ),
        Img(src="images/image7.png", style="max-width: 100%;"),
        P(
            "In sizable clusters comprising 1000 or more documents, we observe a trend towards templatization. This involves the recurrent use of standardized language to convey general topics such as terms and conditions, warnings, and disclaimers. Such language is prevalent on commercial websites, offering a consistent and efficient way to communicate commonly encountered information."
        ),
        Img(src="images/image9.png", style="max-width: 100%;"),
    ),
    Section(
        H2("Personally Identifiable Information Removal"),
        H3("Motivation Behind Personally Identifiable Information Removal"),
        P(
            "Personally Identifiable Information (PII) refers to any information that can be used to identify an individual, such as names, addresses, phone numbers, email addresses, and social security numbers. PII removal is essential for data privacy and security, as well as for compliance with global regulations. By removing PII from the training data, we can reduce the risk of data breaches and unauthorized access to sensitive information. Additionally, removing PII from training data prevents the models generating that specific PII during inference time."
        ),
        table_div_pii,
    ),
    Section(
        H3("Removing PII"),
        P(
            "We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"
        ),
        P(
            "We have used the following regular expressions to identify and replace PII:"
        ),
        Ul(
            Li("Email:"),
            Li(
                D_code(email_code, block="block", language="python"),
                style="list-style-type: none",
            ),
            Li("IP Address:"),
            Li(
                D_code(ip_address_code, block="block", language="python"),
                style="list-style-type: none",
            ),
        ),
        id="section7",
    ),
    Section(
        H2("Normalization Form C"),
        H3("Normalization Form C Defined"),
        P(
            "Normalization Form C (NFC) is a Unicode normalization form that combines characters with diacritics into a single code point. This is important for text processing tasks as it ensures that the text is consistently represented across different languages and scripts. By normalizing the text to NFC, we can avoid issues related to character encoding, such as duplicate tokens and incorrect tokenization."
        ),
    ),
    Section(
        H3("NFC Implementation"),
        P(
            "We have used the ftfy library to normalize the text to NFC. The library provides a simple API for normalizing text to NFC, which can be applied to the entire dataset one row at a time. Below is the code snippet about how we normalized text to NFC:"
        ),
        Ul(
            Li(
                D_code(nfc_code, block="block", language="python"),
                style="list-style-type: none",
            )
        ),  # "background-color= gray" "color= blue" maybe add this later
        id="section8",
    ),
    Section(
        H3("NFC Examples"),
        table_div_nfc_examples,
    ),
    Section(H3("Conclusion"), P("NEED TO UPDATE")),
)


def common_steps():
    """Return the assembled chapter body wrapped in the standard inner-text container."""
    inner = Section(global_div, id="inner-text")
    return Div(inner)