hector.liu committed • Commit a1ddc25 • Parent(s): c206db1

add sankey

web.py CHANGED
@@ -10,6 +10,8 @@ from data.non_web_urls import non_web_urls
 from data_viewer import DV, DV2, DVS
 from fasthtml.components import D_code, D_bibliography, D_appendix, D_cite
 import pandas as pd
+from plotly import graph_objects as go
+from fh_plotly import plotly2fasthtml


 data_filtering_table_data = pd.DataFrame(
@@ -243,6 +245,79 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
 ) / max(character_count, 1)
 """

+# Plot the distribution sankey.
+
+# The percentage of documents retained after each filtering step
+web_filtering_percentages = [
+    100,
+    96.98,
+    43.84,
+    43.59,
+    32.43,
+    24.77,
+    20.34,
+    16.75,
+    2.35,
+]
+
+# The step names
+web_filtering_steps = [
+    "Common Crawl",
+    "Text Extraction",
+    "Language Identification",
+    "URL Filtering",
+    "Repetition Removal",
+    "Document-wise Filtering",
+    "Line-wise Corrections",
+    "Local Exact Deduplication",
+    "Global Fuzzy Deduplication",
+]
+
+step_colors = [
+    '#ff8000',  # Most orange
+    '#f88d52',
+    '#fed380',
+    '#ffffbf',
+    '#d3e8a3',  # Lighter green version of #ccea83
+    '#a3d992',  # Lighter green version of #86cb66
+    '#57b86b',  # Lighter green version of #2da155
+    '#33a352',  # Lighter green version of #006837
+    '#1f773c',  # Darkest green at the end
+]
+
+def add_opacity(hex_color, opacity):
+    # Remove '#' if present
+    hex_color = hex_color.lstrip('#')
+    # Convert hex to RGB
+    rgb = tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
+    # Add the opacity value
+    return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
+
+
+# Concatenate the percentage to each label
+labels_with_percentages = [f"{label} ({percentage}%)" for label, percentage in zip(web_filtering_steps, web_filtering_percentages)]
+
+filtering_sankey_fig = go.Figure(go.Sankey(
+    node=dict(
+        label=labels_with_percentages,
+        color=[add_opacity(c, 0.8) for c in step_colors],
+        pad=15,  # Adjust padding between nodes
+        thickness=30,
+    ),
+    link=dict(
+        source=list(range(0, 8)),  # Each source is the previous step
+        target=list(range(1, 9)),  # Each target is the next step
+        # Eight links take the eight post-step percentages, not all nine
+        value=web_filtering_percentages[1:],
+        color=[add_opacity(c, 0.5) for c in step_colors[:8]],  # Match the link colors to the source node
+    ),
+))
+
+filtering_sankey_fig.update_layout(
+    title_text="Web Data Filtering Process",
+    font_size=10,
+    margin=dict(l=0, r=0, t=40, b=0),
+)

 def web_data():
     return Div(
@@ -271,6 +346,8 @@ def web_data():
             table_div_qf_filter_data,
             P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than in previous works, indicating a large number of duplicates across snapshots."),
             Img(src="images/filter_rate.jpg", height="300", width="600"),
+            # The sankey diagram of the filtering percentages
+            plotly2fasthtml(filtering_sankey_fig),
             P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
             id="section2",),
         Section(
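
Note on the add_opacity helper added above: it re-encodes a hex color as a CSS rgba() string so the same palette can be reused at two alphas, 0.8 for node fills and 0.5 for link fills. A quick standalone check of the conversion (the function is repeated from the diff so the snippet runs on its own):

    def add_opacity(hex_color, opacity):
        hex_color = hex_color.lstrip('#')
        rgb = tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
        return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"

    print(add_opacity('#ff8000', 0.8))  # rgba(255, 128, 0, 0.8) -- node fill for "Common Crawl"
    print(add_opacity('#33a352', 0.5))  # rgba(51, 163, 82, 0.5) -- link fill out of "Local Exact Deduplication"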
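
On the link wiring: go.Sankey keys links by parallel source, target, and value arrays, so the eight step-to-step links carry web_filtering_percentages[1:], the share of documents surviving each target step. A self-contained sanity check, which also recomputes the global-deduplication removal rate quoted in the page text (the small gap to the quoted 85.89% presumably comes from the rounded percentages used here):

    percentages = [100, 96.98, 43.84, 43.59, 32.43, 24.77, 20.34, 16.75, 2.35]

    source = list(range(0, 8))  # Common Crawl ... Local Exact Deduplication
    target = list(range(1, 9))  # Text Extraction ... Global Fuzzy Deduplication
    value = percentages[1:]     # one flow value per link

    # Parallel arrays: exactly one entry per link.
    assert len(source) == len(target) == len(value) == 8

    # Share removed by global fuzzy deduplication, from the retained percentages.
    removed = 1 - percentages[-1] / percentages[-2]
    print(f"{removed:.2%}")  # 85.97%, in line with the ~85.89% quoted above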
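
For context on how the pieces fit together: plotly2fasthtml converts the Plotly figure into a FastHTML component, which is why it can sit directly alongside the P(...) and Img(...) elements inside web_data(). A minimal sketch of the pattern, assuming the stock fast_app/serve helpers from fasthtml.common; the actual app wiring around web.py is not shown in this diff, so the route and figure below are illustrative only:

    from fasthtml.common import Div, P, fast_app, serve
    from fh_plotly import plotly2fasthtml
    from plotly import graph_objects as go

    # A two-node stand-in for filtering_sankey_fig.
    fig = go.Figure(go.Sankey(
        node=dict(label=["Common Crawl (100%)", "Text Extraction (96.98%)"]),
        link=dict(source=[0], target=[1], value=[96.98]),
    ))

    app, rt = fast_app()

    @rt("/")
    def get():
        # plotly2fasthtml emits the figure's HTML/JS so it renders client-side.
        return Div(P("Web Data Filtering Process"), plotly2fasthtml(fig))

    serve()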