victormiller committed on
Commit
dac95a4
1 Parent(s): 5672cf7

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +4 -3
curated.py CHANGED
@@ -599,8 +599,8 @@ filtering_process = Div(
599
  Section(
600
  H3("FreeLaw"),
601
  H4("Download and Extraction"),
602
- P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function.",
603
- D_code("""
604
  ("html", html2text),
605
  ("html_lawbox", html2text),
606
  ("html_columbia", html2text),
@@ -608,7 +608,8 @@ filtering_process = Div(
608
  ("html_with_citations", html2text),
609
  ("xml_harvard", html2text),
610
  plain_text
611
- """, language ="SQL")," All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."),
 
612
  H4("Filtering"),
613
  Ol(
614
  Li("Language Filter: English"),
 
599
  Section(
600
  H3("FreeLaw"),
601
  H4("Download and Extraction"),
602
+ P("The dataset was downloaded from:" A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"),". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
603
+ D_code("""
604
  ("html", html2text),
605
  ("html_lawbox", html2text),
606
  ("html_columbia", html2text),
 
608
  ("html_with_citations", html2text),
609
  ("xml_harvard", html2text),
610
  plain_text
611
+ """, language ="SQL"),
612
+ P("All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priorty was given to plain_text first, followed by the columns in the table in reverse order."),
613
  H4("Filtering"),
614
  Ol(
615
  Li("Language Filter: English"),