omkarenator committed
Commit 4a437aa
1 Parent(s): 9f2a4f7

fix bf example

Files changed (1):
  1. common.py +62 -45
common.py CHANGED
@@ -1,6 +1,13 @@
 from fasthtml.common import *
 from fasthtml.components import *
-from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline, D_cite
+from fasthtml.components import (
+    D_title,
+    D_article,
+    D_front_matter,
+    D_contents,
+    D_byline,
+    D_cite,
+)
 from fh_plotly import plotly2fasthtml
 import pandas as pd
 import json
@@ -46,48 +53,56 @@ def dup_cluster_graph():
     return fig


-bloom_filter_table_info = pd.DataFrame(
-    {
-        "Bloom Filter": [
-            "BF 0",
-            "BF 8 ",
-        ],
-        "Band 0": [
-            """
-            (A,B)
-            (C,D)
-            (E,K)
-            """,
-            "(B,K)",
-        ],
-        "Band 1": [
-            """
-            (A,B)
-            (C,D)
-            (F,K)
-            """,
-            "(B,K)",
-        ],
-        "....": [
-            "...",
-            "...",
-        ],
-        "Band 8": [
-            """
-            (A,B)
-            (C,D)
-            (D,E)
-            """,
-            """
-            (E,K)
-            (B,K)
-            """,
-        ],
-    }
-)
-
-table_html_bloom_filter = bloom_filter_table_info.to_html(index=False, border=0)
-table_div_bloom_examples = Div(NotStr(table_html_bloom_filter), style="margin: 40px;")
+def dedup_pairs_bands():
+    return pd.DataFrame(
+        {
+            "Bloom Filter": [
+                "BF 0",
+                "",
+                "",
+                "",
+                "BF 1",
+                "",
+                "BF 8",
+            ],
+            "Band 0": [
+                "(A,B)",
+                "(C,D)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 1": [
+                "(A,B)",
+                "(C,D)",
+                "(F,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "....": [
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 8": [
+                "(A,B)",
+                "(C,D)",
+                "(D,E)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+            ],
+        }
+    ).to_html(index=False, border=0)


 def dup_docs_count_graph():
@@ -298,7 +313,9 @@ global_div = Div(
         "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
     ),
     plotly2fasthtml(dup_cluster_graph()),
-    P("The example below is from one such cluster. Here most of the text is repeated with just specifics changed."),
+    P(
+        "The example below is from one such cluster. Here most of the text is repeated with just specifics changed."
+    ),
     Img(src="images/100k.png", style="max-width: 100%;"),
     P(
         "We started deduplication with 61.8 TB of filtered and compressed documents. The initial dataset had roughly 48.83 billion documents. First, we performed exact deduplication using a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001. This reduced the documents from 48.83 billion to 40.21 billion, removing about 17% as exact duplicates. This step used constant memory for the Bloom filter and lessened the workload for subsequent near-deduplication."
@@ -344,7 +361,7 @@ global_div = Div(
     P(
         "There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. Performing the Bloom filter step reduces the number of pairs by nearly ninefold."
     ),
-    table_div_bloom_examples,
+    Div(NotStr(dedup_pairs_bands()), style="margin: 40px;"),
     P(
         "The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches."
    ),
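
Note on the exact-deduplication step described in the diff text: a minimal Bloom filter sketch is shown below. Only the capacity and false positive rate come from the text; the class, hashing scheme, and the toy capacity used so the snippet runs cheaply are illustrative assumptions, not the pipeline's implementation.

# Minimal Bloom filter sketch for exact deduplication (illustrative, not the
# pipeline's code). The actual run used capacity=1_000_000_000, error_rate=0.001.
import hashlib
import math


class BloomFilter:
    def __init__(self, capacity, error_rate):
        # Standard sizing: m = -n * ln(p) / (ln 2)^2 bits, k = (m / n) * ln 2 hashes.
        self.num_bits = math.ceil(-capacity * math.log(error_rate) / math.log(2) ** 2)
        self.num_hashes = max(1, round(self.num_bits / capacity * math.log(2)))
        self.bits = bytearray((self.num_bits + 7) // 8)

    def _positions(self, item):
        # Double hashing: derive all probe positions from one SHA-256 digest.
        digest = hashlib.sha256(item.encode("utf-8")).digest()
        h1 = int.from_bytes(digest[:8], "big")
        h2 = int.from_bytes(digest[8:16], "big")
        return [(h1 + i * h2) % self.num_bits for i in range(self.num_hashes)]

    def add(self, item):
        # Insert the item and report whether it was (probably) seen before.
        seen = True
        for pos in self._positions(item):
            byte, bit = divmod(pos, 8)
            if not (self.bits[byte] >> bit) & 1:
                seen = False
                self.bits[byte] |= 1 << bit
        return seen


# Toy capacity so the example allocates only a few MB of bits.
bf = BloomFilter(capacity=1_000_000, error_rate=0.001)
docs = ["same text", "same text", "different text"]
unique_docs = [doc for doc in docs if not bf.add(doc)]
print(unique_docs)  # ['same text', 'different text']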
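The dedup_pairs_bands() table added by this commit illustrates how the same candidate pair can surface in several bands of the same horizontal partition. The sketch below collapses those repeats; it uses an exact set in place of the Bloom filter partitions (BF 0 ... BF 8) the text describes, and the pairs are the illustrative ones from the table, not real documents.

# Drop candidate pairs already emitted by an earlier band (set used for clarity;
# the pipeline uses a Bloom filter per partition for constant memory).
band_pairs = {
    "Band 0": [("A", "B"), ("C", "D"), ("E", "K"), ("B", "K")],
    "Band 1": [("A", "B"), ("C", "D"), ("F", "K"), ("B", "K")],
    "Band 8": [("A", "B"), ("C", "D"), ("D", "E"), ("E", "K"), ("B", "K")],
}

seen_pairs = set()  # stand-in for one Bloom filter partition
unique_pairs = []
for band, pairs in band_pairs.items():
    for pair in pairs:
        key = tuple(sorted(pair))  # (A, B) and (B, A) are the same match
        if key not in seen_pairs:
            seen_pairs.add(key)
            unique_pairs.append(key)

print(unique_pairs)
# [('A', 'B'), ('C', 'D'), ('E', 'K'), ('B', 'K'), ('F', 'K'), ('D', 'E')]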
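The last paragraph in the diff describes grouping the surviving pairs into near-duplicate clusters via connected components. A small union-find sketch, again using the illustrative letter IDs rather than real documents:

# Cluster documents by connected components over the unique match pairs.
from collections import defaultdict

pairs = [("A", "B"), ("C", "D"), ("E", "K"), ("B", "K"), ("F", "K"), ("D", "E")]
parent = {}


def find(x):
    # Return the component root, compressing the path as we go.
    parent.setdefault(x, x)
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x


def union(a, b):
    root_a, root_b = find(a), find(b)
    if root_a != root_b:
        parent[root_a] = root_b


for a, b in pairs:
    union(a, b)

clusters = defaultdict(set)
for doc in parent:
    clusters[find(doc)].add(doc)

print(list(clusters.values()))
# [{'A', 'B', 'C', 'D', 'E', 'F', 'K'}] -- these example pairs all link into one cluster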