victormiller committed on
Commit
db5d55c
1 Parent(s): 9c1b63e

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +1 -0
curated.py CHANGED
@@ -551,6 +551,7 @@ filtering_process = Div(
551
  H3("ArXiv"),
552
  P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
553
  P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
 
554
  Ul(
555
  Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
556
  ),
 
551
  H3("ArXiv"),
552
  P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
553
  P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
554
+ P(B("Unique Data Preparation Challenges: ")),
555
  Ul(
556
  Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
557
  ),