victormiller committed on
Commit 081c95c
1 Parent(s): fe91749

Update web.py

Files changed (1)
  1. web.py +2 -32
web.py CHANGED
@@ -268,36 +268,6 @@ def web_data():
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
  Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
  P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
- # H3("TxT360 Filter Summary"),
- # P("This section provides highlevel details into the filtering that is applied to CommonCrawl in TxT360. Each decision listed is discussed in detail further on in this section."),
- # P("We adopt rules from RefinedWeb [1] to remove lines if they satisfy any of the following criteria:"),
- # Ul(
- # Li("the line is only composed of uppercase characters", style = "margin-bottom: 5px"),
- # Li("the line is only composed of numerical characters", style = "margin-bottom: 5px"),
- # Li("the line matches the pattern “r'^\d+\s+likes$", style = "margin-bottom: 5px"),
- # Li("the line only contains one word.", style = "margin-bottom: 5px"),
- # ),
- # P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics can be used include:"),
- # Ul(
- # Li("the word count in the document", style = "margin-bottom: 5px"),
- # Li("the mean word length", style = "margin-bottom: 5px"),
- # Li("the number of sentences", style = "margin-bottom: 5px"),
- # Li("the symbol-to-word ratio", style = "margin-bottom: 5px"),
- # Li("the fraction of alphabetic words", style = "margin-bottom: 5px"),
- # Li("and the number of stop words", style = "margin-bottom: 5px"),
- # ),
- # P("Specifically, we remove any document which satisfies any of the following criteria:"),
- # Ul(
- # Li("it contains less than 50 words or more than 100,000 words", style = "margin-bottom: 5px"),
- # Li("its mean word length is outside the range of 3 to 10", style = "margin-bottom: 5px"),
- # Li("it contains less than 3 sentences", style = "margin-bottom: 5px"),
- # Li("its symbol-to-word ratio is greater than 0.1", style = "margin-bottom: 5px"),
- # Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
- # Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
- # ),
-
- # P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
-
  id="section2",),
  Section(
  H2("Document Preparation"),
@@ -306,7 +276,7 @@ def web_data():
  P(B("Text Extraction: "), """
  Common Crawl provides webpage texts via two formats: WARC (Web ARChive format) and WET (WARC Encapsulated Text).
  WARC files contain the raw data from the crawl, which store the full HTTP response and request metadata.
- WET files contain plaintexts extracted by Common Crawl. In line with previous works ([1], [2], [3], [4]),
+ WET files contain plaintexts extracted by Common Crawl. In line with previous works""",D_cite(bibtex_key="thepile"),D_cite(bibtex_key="refinedweb"),D_cite(bibtex_key="gopher"),D_cite(bibtex_key="fineweb") ,""" ,
  we found WET files to include boilerplate content like navigation menus, ads, and other irrelevant texts.
  Accordingly, our pipeline starts from raw WARC files, reads with the warcio library, and extracts texts using trafilatura.
  """),
@@ -409,7 +379,7 @@ def web_data():
  ),

  Details(
- Summary("Blocked Document Examples from the URL Blocklist"),
+ Summary("Blocked Document Examples from the URL Blocklist (WARNING: MAY CONTAIN OFFENSIVE MATERIAL)"),
  Div(
  DV(
  "data/bad_url_doc.jsonl",
 