Commit a1ddc25 · hector.liu · Parent(s): c206db1
add sankey

web.py CHANGED
@@ -10,6 +10,8 @@ from data.non_web_urls import non_web_urls
 from data_viewer import DV, DV2, DVS
 from fasthtml.components import D_code, D_bibliography, D_appendix, D_cite
 import pandas as pd
+from plotly import graph_objects as go
+from fh_plotly import plotly2fasthtml
 
 
 data_filtering_table_data = pd.DataFrame(
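The two new imports split the work between figure construction and page rendering: graph_objects builds the Sankey figure at module load, and fh_plotly's plotly2fasthtml turns it into a component a FastHTML route can return. A minimal sketch of that pattern, runnable on its own; the app scaffolding and route name are illustrative assumptions, not part of this commit, and the page would additionally need Plotly's JS loaded for the chart to render:

# Minimal sketch: returning a Plotly figure from a FastHTML route via
# fh_plotly. Only plotly2fasthtml and graph_objects come from the commit
# above; the app, route, and toy figure are assumptions for illustration.
from fasthtml.common import fast_app, serve
from fh_plotly import plotly2fasthtml
from plotly import graph_objects as go

app, rt = fast_app()

@rt("/sankey")
def get():
    # A two-node toy Sankey, standing in for filtering_sankey_fig.
    fig = go.Figure(go.Sankey(
        node=dict(label=["kept", "removed"]),
        link=dict(source=[0], target=[1], value=[1]),
    ))
    # Note: the page must load Plotly's JS for the chart to render;
    # that wiring is omitted here.
    return plotly2fasthtml(fig)

serve()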
@@ -243,6 +245,79 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
 ) / max(character_count, 1)
 """
 
+# Plot the distribution sankey.
+
+# The filtering percentages
+web_filtering_percentages = [
+    100,
+    96.98,
+    43.84,
+    43.59,
+    32.43,
+    24.77,
+    20.34,
+    16.75,
+    2.35,
+]
+
+# The step names
+web_filtering_steps = [
+    "Common Crawl",
+    "Text Extraction",
+    "Language Identification",
+    "URL Filtering",
+    "Repetition Removal",
+    "Document-wise Filtering",
+    "Line-wise Corrections",
+    "Local Exact Deduplication",
+    "Global Fuzzy Deduplication",
+]
+
+step_colors = [
+    '#ff8000',  # Most orange
+    '#f88d52',
+    '#fed380',
+    '#ffffbf',
+    '#d3e8a3',  # Lighter green version of #ccea83
+    '#a3d992',  # Lighter green version of #86cb66
+    '#57b86b',  # Lighter green version of #2da155
+    '#33a352',  # Lighter green version of #006837
+    '#1f773c',  # Darkest green at the end
+]
+
+def add_opacity(hex_color, opacity):
+    # Remove '#' if present
+    hex_color = hex_color.lstrip('#')
+    # Convert hex to RGB
+    rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
+    # Add the opacity value
+    return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
+
+
+# Concatenate the percentage to each label
+labels_with_percentages = [f"{label} ({percentage}%)" for label, percentage in zip(web_filtering_steps, web_filtering_percentages)]
+
+filtering_sankey_fig = go.Figure(go.Sankey(
+    node=dict(
+        label=labels_with_percentages,
+        color=[add_opacity(c, 0.8) for c in step_colors[:9]],
+        pad=15,  # Adjust padding between nodes
+        thickness=30,
+    ),
+    link=dict(
+        source=list(range(0, 8)),  # Each source is the previous step
+        target=list(range(1, 9)),  # Each target is the next step
+        value=web_filtering_percentages[1:],  # 9 nodes give 8 links; each link carries the share retained at its target step
+        color=[add_opacity(c, 0.5) for c in step_colors[:8]]  # Match the link colors to the source node
+
+    )
+))
+
+filtering_sankey_fig.update_layout(
+    title_text="Web Data Filtering Process",
+    font_size=10,
+    margin=dict(l=0, r=0, t=40, b=0)
+)
 
 def web_data():
     return Div(
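Two details in this hunk are easy to get wrong: add_opacity must emit a CSS rgba() string, and nine chained nodes yield only eight links, which is why the link values are the retention percentages of each target step rather than the full nine-element list. A quick standalone check, assuming the definitions from this hunk are in scope (the expected values are read off the lists above):

# Sanity checks for the helpers defined in this commit.
assert add_opacity('#ff8000', 0.5) == 'rgba(255, 128, 0, 0.5)'

# 9 nodes -> 8 links; each link carries the percentage retained at its target.
links = list(zip(range(0, 8), range(1, 9), web_filtering_percentages[1:]))
assert len(links) == 8
assert links[0] == (0, 1, 96.98)   # Common Crawl -> Text Extraction
assert links[-1] == (7, 8, 2.35)   # only 2.35% survives global fuzzy dedup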
@@ -271,6 +346,8 @@ def web_data():
         table_div_qf_filter_data,
         P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to that of RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly more than in previous works, indicating a large number of duplicates across snapshots."),
         Img(src="images/filter_rate.jpg", height="300", width="600"),
+        # The sankey diagram of the filtering percentages
+        plotly2fasthtml(filtering_sankey_fig),
         P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
         id="section2",),
     Section(
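Because filtering_sankey_fig is built at module import time, it can also be previewed without starting the FastHTML app, using Plotly's standard HTML export. A one-line sketch, again assuming the commit's definitions are in scope; the output filename is an arbitrary choice:

# Preview the figure outside the app; write_html is standard Plotly API.
filtering_sankey_fig.write_html("sankey_preview.html", include_plotlyjs="cdn")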