hector.liu committed on
Commit
a1ddc25
1 Parent(s): c206db1

add sankey

Browse files
Files changed (1) hide show
  1. web.py +77 -0
web.py CHANGED
@@ -10,6 +10,8 @@ from data.non_web_urls import non_web_urls
10
  from data_viewer import DV, DV2, DVS
11
  from fasthtml.components import D_code, D_bibliography, D_appendix, D_cite
12
  import pandas as pd
 
 
13
 
14
 
15
  data_filtering_table_data = pd.DataFrame(
@@ -243,6 +245,79 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
243
  ) / max(character_count, 1)
244
  """
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  def web_data():
248
  return Div(
@@ -271,6 +346,8 @@ def web_data():
271
  table_div_qf_filter_data,
272
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
273
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
 
 
274
  P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
275
  id="section2",),
276
  Section(
 
10
  from data_viewer import DV, DV2, DVS
11
  from fasthtml.components import D_code, D_bibliography, D_appendix, D_cite
12
  import pandas as pd
13
+ from plotly import graph_objects as go
14
+ from fh_plotly import plotly2fasthtml
15
 
16
 
17
  data_filtering_table_data = pd.DataFrame(
 
245
  ) / max(character_count, 1)
246
  """
247
 
248
+ # Plot the distribution sankey.
249
+
250
+ # The filtering percentages
251
+ web_filtering_percentages = [
252
+ 100,
253
+ 96.98,
254
+ 43.84,
255
+ 43.59,
256
+ 32.43,
257
+ 24.77,
258
+ 20.34,
259
+ 16.75,
260
+ 2.35,
261
+ ]
262
+
263
+ # The step names
264
+ web_filtering_steps = [
265
+ "Common Crawl",
266
+ "Text Extraction",
267
+ "Language Identification",
268
+ "URL Filtering",
269
+ "Repetition Removal",
270
+ "Document-wise Filtering",
271
+ "Line-wise Corrections",
272
+ "Local Exact Deduplication",
273
+ "Global Fuzzy Deduplication",
274
+ ]
275
+
276
+ step_colors = [
277
+ '#ff8000', # Most orange
278
+ '#f88d52',
279
+ '#fed380',
280
+ '#ffffbf',
281
+ '#d3e8a3', # Lighter green version of #ccea83
282
+ '#a3d992', # Lighter green version of #86cb66
283
+ '#57b86b', # Lighter green version of #2da155
284
+ '#33a352', # Lighter green version of #006837
285
+ '#1f773c', # Lightest green added at the end
286
+ ]
287
+
288
+ def add_opacity(hex_color, opacity):
289
+ # Remove '#' if present
290
+ hex_color = hex_color.lstrip('#')
291
+ # Convert hex to RGB
292
+ rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
293
+ # Add the opacity value
294
+ return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
295
+
296
+
297
+ # Concatenate the percentage to each label
298
+ labels_with_percentages = [f"{label} ({percentage}%)" for label, percentage in zip(web_filtering_steps, web_filtering_percentages)]
299
+
300
+ filtering_sankey_fig = go.Figure(go.Sankey(
301
+ node=dict(
302
+ label=labels_with_percentages,
303
+ color=[add_opacity(c, 0.8) for c in step_colors[:9]] ,
304
+ pad=15, # Adjust padding between nodes
305
+ thickness=30,
306
+ ),
307
+ link=dict(
308
+ source=list(range(0,8)), # Each source is the previous step
309
+ target=list(range(1,9)), # Each target is the next step
310
+ value=web_filtering_percentages,
311
+ color=[add_opacity(c, 0.5) for c in step_colors[:8]] # Match the link colors to the source node
312
+
313
+ )
314
+ ))
315
+
316
+ filtering_sankey_fig.update_layout(
317
+ title_text="Web Data Filtering Process",
318
+ font_size=10,
319
+ margin=dict(l=0, r=0, t=40, b=0)
320
+ )
321
 
322
  def web_data():
323
  return Div(
 
346
  table_div_qf_filter_data,
347
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
348
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
349
+ # The sankey diagram of the filtering percentage
350
+ plotly2fasthtml(filtering_sankey_fig),
351
  P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
352
  id="section2",),
353
  Section(