victormiller
committed on
Commit
•
db5d55c
1
Parent(s):
9c1b63e
Update curated.py
Browse files- curated.py +1 -0
curated.py
CHANGED
@@ -551,6 +551,7 @@ filtering_process = Div(
|
|
551 |
H3("ArXiv"),
|
552 |
P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
|
553 |
P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
|
|
554 |
Ul(
|
555 |
Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
|
556 |
),
|
|
|
551 |
H3("ArXiv"),
|
552 |
P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
|
553 |
P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
554 |
+
P(B("Unique Data Preparation Challenges: ")),
|
555 |
Ul(
|
556 |
Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
|
557 |
),
|