fh-new-vm1

Sleeping

App Files Files Community

victormiller committed on Sep 24

Commit

1c66024

•

1 Parent(s): 6afc890

Update main.py

Browse files

Files changed (1) hide show

main.py +3 -147

main.py CHANGED Viewed

@@ -65,7 +65,7 @@ def main():
                             ),
                             Li(
                                 A(
-                                    "Background",
                                     href="/intro#section2",
                                     hx_get="/intro#section2",
                                     hx_target="#inner-text",
@@ -73,7 +73,7 @@ def main():
                             ),
                             Li(
                                 A(
-                                    "Main Content",
                                     href="/intro#section3",
                                     hx_get="/intro#section3",
                                     hx_target="#inner-text",
@@ -81,7 +81,7 @@ def main():
                             ),
                             Li(
                                 A(
-                                    "Conclusion",
                                     href="/intro#section4",
                                     hx_get="/intro#section4",
                                     hx_target="#inner-text",
@@ -210,143 +210,6 @@ previous_content =  P("""The performance of a large language model (LLM)
                     and the process followed to create its 📚
                     FineWeb-Edu subset.""")
-dataset_comparison = pd.DataFrame(
-        {
-            "Dataset": [
-                "TxT360",
-                "FineWeb",
-                "RefinedWeb",
-                "RedPajama-v2",
-                "C4",
-                "Dolma",
-                "RedPajama-v1",
-                "The Pile",
-            ],
-            "CommonCrawl": [
-                "99 Snapshots",
-                "96 Snapshots",
-                "90 Snapshots",
-                "84 Snapshots",
-                "1 Snapshots",
-                "24 Snapshots",
-                "5 Snapshots",
-                "0.6% of 74 Snapshots",
-            ],
-            "Papers": [
-                "5 Sources",
-                "-",
-                "-",
-                "-",
-                "-",
-                "1 Source",
-                "1 Source",
-                "4 Sources",
-            ],
-            "Wikipedia": [
-                "Improves data quality by removing irrelevant documents",
-                "Filters out low-quality or incomplete documents",
-                "Provides additional information for analysis",
-                "Enables language-specific analysis and insights",
-                "Helps understand the complexity and content of documents",
-                "Identifies important terms and topics in the dataset",
-                "Quantifies the importance of individual words",
-                "RedPajama-v1",
-            ],
-            "FreeLaw": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "DM Math": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "USPTO": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "PG-19": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "HackerNews": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "Ubuntu IRC": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "EuroParl": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "StackExchange": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-            "Code": [
-                "May exclude documents in less common languages",
-                "May remove documents with valuable information",
-                "May introduce bias in the analysis",
-                "May not accurately represent the language distribution",
-                "May not capture the complexity of document structure",
-                "May be sensitive to noise and outliers",
-                "May not capture the semantic meaning of words",
-                "RedPajama-v1",
-            ],
-        }
-    )
-table_html = dataset_comparison.to_html(index=False, border=0)
-table_div = Div(NotStr(table_html), style="margin: 40px;")
@@ -386,13 +249,6 @@ def intro():
             P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
             id="section4",
         ),
-        Section(
-            H2("Combining the Best of Web and Curated Sources"),
-            H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
-            P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
-            table_div,
-            id="section5",
-        ),
         id="inner-text",
     )

                             ),
                             Li(
                                 A(
+                                    "Global Deduplication",
                                     href="/intro#section2",
                                     hx_get="/intro#section2",
                                     hx_target="#inner-text",
                             ),
                             Li(
                                 A(
+                                    "Controllable Upweighting",
                                     href="/intro#section3",
                                     hx_get="/intro#section3",
                                     hx_target="#inner-text",
                             ),
                             Li(
                                 A(
+                                    "Full Documentation",
                                     href="/intro#section4",
                                     hx_get="/intro#section4",
                                     hx_target="#inner-text",
                     and the process followed to create its 📚
                     FineWeb-Edu subset.""")
             P("In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"),
             id="section4",
         ),
         id="inner-text",
     )