File size: 1,859 Bytes
ac3b3f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import shutil
import tempfile
from subprocess import call

import requests
from pylatexenc.latex2text import LatexNodes2Text

def get_paper(paper_url):
    """Fetch the LaTeX source of an arXiv paper and return it as plain text.

    The source archive is downloaded from the arXiv e-print endpoint and
    unpacked into ``paper-dir/<paper id>``. If that directory already exists
    the download is skipped. The single ``.tex`` file found there
    (``math_commands.tex`` is ignored) is converted with ``LatexNodes2Text``
    and the result is also written to ``main.txt`` in the same directory.

    Args:
        paper_url: A URL starting with ``https://arxiv.org/abs/`` or
            ``https://arxiv.org/pdf/``. A trailing ``.pdf`` on pdf links is
            dropped.

    Returns:
        The plain-text rendering of the paper's LaTeX source.

    Raises:
        ValueError: If the URL is not a recognised arXiv URL, the download
            returns an HTTP error status, the archive cannot be extracted,
            or the directory holds zero or more than one candidate ``.tex``
            file.
    """
    abs_prefix = "https://arxiv.org/abs/"
    pdf_prefix = "https://arxiv.org/pdf/"

    # Match on the full prefix: a substring test would accept any URL that
    # merely contains "abs" or "pdf" and then fetch/cache something bogus.
    if paper_url.startswith(abs_prefix):
        paper_id = paper_url[len(abs_prefix):]
    elif paper_url.startswith(pdf_prefix):
        paper_id = paper_url[len(pdf_prefix):]
        # The ".pdf" extension is not part of the paper identifier.
        if paper_id.endswith(".pdf"):
            paper_id = paper_id[:-len(".pdf")]
    else:
        raise ValueError("Invalid arXiv URL")
    if not paper_id:
        raise ValueError("Invalid arXiv URL")

    eprint_url = "https://arxiv.org/e-print/" + paper_id
    suffix = 'paper-dir/' + paper_id

    # The extraction directory doubles as the download cache.
    if os.path.exists(suffix):
        print("Paper already downloaded, skipping download")
    else:
        print("Downloading paper")
        r = requests.get(eprint_url, timeout=60)
        if not r.ok:
            raise ValueError(
                f"Failed to download {eprint_url} (HTTP {r.status_code})"
            )

        # makedirs also creates 'paper-dir' itself (and any intermediate
        # directories for ids containing '/'), which a plain mkdir cannot.
        os.makedirs(suffix, exist_ok=True)
        extracted = False
        try:
            # Keep the archive in a scratch directory so nothing is left
            # behind in (or clobbered in) the current working directory.
            with tempfile.TemporaryDirectory() as scratch:
                archive_path = os.path.join(scratch, "paper")
                with open(archive_path, "wb") as f:
                    f.write(r.content)
                # unzip gzipped tar file to new directory
                status = call(["tar", "-xzf", archive_path, "-C", suffix])
                extracted = status == 0
        finally:
            # Never leave a partial directory: it would be mistaken for a
            # valid cached download on the next call.
            if not extracted:
                shutil.rmtree(suffix, ignore_errors=True)
        if not extracted:
            raise ValueError("Failed to extract the downloaded paper archive")

    # get the list of all .tex files in the directory, ignoring the
    # math_commands.tex macro file if present
    tex_files = [
        name for name in os.listdir(suffix)
        if name.endswith('.tex') and name != 'math_commands.tex'
    ]
    if not tex_files:
        raise ValueError("No .tex files found in the paper")
    if len(tex_files) > 1:
        raise ValueError("More than one .tex file found in the paper")

    # Explicit encoding so the result does not depend on the platform's
    # locale; undecodable bytes are replaced rather than aborting the run.
    main_tex = os.path.join(suffix, tex_files[0])
    with open(main_tex, 'r', encoding='utf-8', errors='replace') as f:
        paper_tex = f.read()

    # convert latex to text
    paper_text = LatexNodes2Text().latex_to_text(paper_tex)

    with open(f"{suffix}/main.txt", 'w', encoding='utf-8') as f:
        f.write(paper_text)

    return paper_text

if __name__ == "__main__":
    # Script entry point: fetch one fixed example paper and print its text.
    print(get_paper("https://arxiv.org/abs/2206.08896"))