TxT360

Sleeping

victormiller committed on Oct 4

Commit

db5d55c

•

1 Parent(s): 9c1b63e

Update curated.py

Files changed (1) hide show

curated.py CHANGED Viewed

@@ -551,6 +551,7 @@ filtering_process = Div(
         H3("ArXiv"),
         P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
         P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format",  D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
         Ul(
             Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
         ),

         H3("ArXiv"),
         P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
         P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format",  D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
+        P(B("Unique Data Preparation Challenges: ")),
         Ul(
             Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
         ),