victormiller committed
Commit d95e4d8
Parent(s): 0b63499
Update common.py
common.py
CHANGED
@@ -29,7 +29,109 @@ fig = px.bar(
 
 dup_cluster_graph = fig.update_layout(showlegend=False)
 
+dup_docs_count = {
+    "80": 382164413,
+    "90": 660766607,
+    "94": 2004544307,
+    "49": 1249363963,
+    "96": 6378899946,
+    "91": 796400555,
+    "13": 170737436,
+    "34": 390565222,
+    "37": 243097281,
+    "78": 333786871,
+    "40": 331019592,
+    "47": 633983288,
+    "74": 443143441,
+    "12": 115630971,
+    "82": 491144800,
+    "63": 335567006,
+    "60": 361001039,
+    "42": 369986102,
+    "43": 344094214,
+    "95": 3297371929,
+    "56": 450449769,
+    "58": 394889638,
+    "48": 821491815,
+    "18": 192658724,
+    "86": 621122463,
+    "50": 917219351,
+    "83": 468165632,
+    "38": 281883697,
+    "51": 244891366,
+    "93": 1236979939,
+    "65": 396080116,
+    "71": 403250107,
+    "11": 101639319,
+    "81": 367154215,
+    "72": 458795954,
+    "32": 218765954,
+    "92": 943046601,
+    "85": 507967375,
+    "66": 279985567,
+    "54": 291611429,
+    "87": 657754973,
+    "39": 296672084,
+    "89": 747973994,
+    "26": 179628225,
+    "45": 441047510,
+    "64": 319547728,
+    "76": 337730046,
+    "57": 415519600,
+    "53": 346555733,
+    "75": 319730996,
+    "21": 239475626,
+    "67": 277544884,
+    "10": 102493868,
+    "68": 348155455,
+    "59": 344897755,
+    "62": 326551051,
+    "22": 223000489,
+    "88": 722070344,
+    "52": 295881819,
+    "84": 613535675,
+    "55": 487356947,
+    "17": 226423150,
+    "69": 349626770,
+    "20": 163869592,
+    "16": 452282480,
+    "70": 390584359,
+    "73": 394778904,
+    "28": 197047765,
+    "36": 230817595,
+    "44": 618669127,
+    "29": 180518021,
+    "77": 429496570,
+    "25": 140344588,
+    "14": 212064682,
+    "41": 428759750,
+    "15": 147268059,
+    "00": 136048949,
+    "31": 325178167,
+    "35": 213448884,
+    "79": 394056890,
+    "24": 359444850,
+    "30": 178934263,
+    "61": 336060420,
+    "23": 378045294,
+    "46": 417319179,
+    "33": 239167872,
+    "27": 111503187,
+    "19": 125085842,
+}
 
+dup_docs_count_for_graph = pd.DataFrame(
+    sorted(dup_docs_count.items()), columns=["CC_dump", "num_duplicates"]
+)
+
+fig = px.bar(
+    dup_docs_count_for_graph,
+    x="CC_dump",
+    y="num_duplicates",
+    labels={"CC_dump": "CommonCrawl Dump", "num_duplicates": "Number of duplicates"},
+)
+
+dup_docs_count_graph = fig
 
 nfc_examples = pd.DataFrame(
     {
@@ -131,7 +233,7 @@ global_div = Div(
         P("The purpose of this step is to create a set of clusters of matching pairs. For example, a list of pairs (A, B), (B, C), (D, E) is merged into a list of components (A, B, C) and (D, E). Using a third-party library like NetworkX to find connected components would require all pairs to fit into the memory of a single machine, which is not feasible. Instead, we implemented a distributed connected component finder [4] using the Dask framework, which can scale across multiple machines. The algorithm works by mapping edges by both the source and destination of pairs and reducing only edges where the source is greater than the destination. It performs successive iterations of this MapReduce computation until convergence, meaning the number of new edges produced becomes zero. In the end, every document in a cluster points to the smallest document within the cluster. Later, we compile a list of duplicate documents that need deletion and gather statistics about each component."),
         P("We needed to partition the duplicate pairs generated in the third stage into three groups to reduce memory pressure on the final stage. We observed that the second stage itself generates partial components which have some overlap. These overlapping clusters cause some documents to appear in the delete set multiple times. However, our deletion code handled this overlap."),
         P("Below is the distribution of duplicate documents found across different dumps of CommonCrawl. The distribution is skewed to the right because the documents are bucketed by the dump ID of the document we retain, and we prefer documents from higher dump IDs."),
-
+        plotly2fasthtml(dup_docs_count_graph),
     ),
     Section(
         H3("Analysis of Near-Duplicate Clusters"),
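
The connected-components paragraph above describes the clustering step only at a high level. The snippet below is a minimal single-process sketch of the same min-label propagation idea; the pairs input and the find_components helper are illustrative names that do not appear in common.py, and the production version runs as iterative MapReduce rounds on Dask rather than a Python loop.

# Minimal single-process sketch of min-label propagation (illustrative only;
# the real pipeline distributes this over Dask).
def find_components(pairs):
    label = {}  # every document starts as its own representative
    for a, b in pairs:
        label.setdefault(a, a)
        label.setdefault(b, b)
    changed = True
    while changed:  # repeat until no label changes, i.e. convergence
        changed = False
        for a, b in pairs:
            smallest = min(label[a], label[b])
            for node in (a, b):
                if label[node] > smallest:
                    label[node] = smallest
                    changed = True
    return label  # each document maps to the smallest document in its cluster

# (A, B), (B, C), (D, E) collapses into {A, B, C} -> A and {D, E} -> D
print(find_components([("A", "B"), ("B", "C"), ("D", "E")]))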
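The overlap issue noted in the second paragraph (a document listed in several partial components and therefore appearing in the delete set more than once) is harmless if the delete set is built as a set union, roughly as in the sketch below; the document ids and the partial_components structure are made up for illustration.

# Hypothetical data showing overlapping partial components; duplicates in the
# delete list are absorbed by the set union.
partial_components = [
    {"keep": "doc_00", "dups": ["doc_12", "doc_45"]},
    {"keep": "doc_00", "dups": ["doc_45", "doc_78"]},  # overlaps with the first
]

to_delete = set()
for component in partial_components:
    to_delete.update(component["dups"])

print(sorted(to_delete))  # ['doc_12', 'doc_45', 'doc_78'] -- doc_45 counted once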
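The commit embeds dup_docs_count_graph into the page through plotly2fasthtml. While iterating on the chart, the same figure can also be previewed on its own with Plotly's standard export method, for example:

# Optional local preview of the bar chart before it is embedded in the page.
dup_docs_count_graph.write_html("dup_docs_count.html")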