File size: 1,800 Bytes
fde33b3
5042efb
 
 
fde33b3
 
 
 
5042efb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fde33b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5042efb
fde33b3
 
 
45b3858
fde33b3
5042efb
fde33b3
5042efb
45b3858
fde33b3
5042efb
 
fde33b3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
import os
import re

from tqdm import tqdm
from bs4 import BeautifulSoup
from markdown import markdown
from pathlib import Path


def markdown_to_text(markdown_string):
    """Convert a markdown string to plaintext.

    The markdown is rendered to HTML first, because BeautifulSoup can then
    extract the text nodes cleanly. Leftover code-fence markers and filler
    are stripped from the extracted text afterwards.

    Args:
        markdown_string: Markdown source to convert.

    Returns:
        The plaintext content, with every run of two or more newlines
        collapsed to exactly two.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Strip each HTML comment individually. The match is non-greedy so that
    # content sitting between two separate comments is preserved; DOTALL lets
    # a single comment span several lines.
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    # Drop the "bash" language tag that ends up glued to the code content.
    html = re.sub('<code>bash', '<code>', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))

    # Remove opening code fences that survived rendering. "python" must be
    # tried before "py": regex alternation is ordered, so the shorter branch
    # would otherwise win and leave a stray "thon" behind.
    text = re.sub('```(python|py|diff)', '', text)
    # Remove closing code fences, keeping the line break.
    text = re.sub('```\n', '\n', text)
    # Drop everything from a hyphen followed by a run of spaces to the end of
    # the line (presumably removed lines in diff snippets -- TODO confirm).
    text = re.sub('-         .*', '', text)
    # Remove literal "..." sequences.
    text = text.replace('...', '')
    # Collapse runs of blank lines into a single blank line.
    text = re.sub('\n(\n)+', '\n\n', text)

    return text


def main():
    """Convert every file under the input directory into a plaintext file.

    Command-line options:
        --input-dir: directory searched recursively for files
            (default "transformers/docs/source/en/").
        --output-dir: directory that receives the ``.txt`` files; it is
            created if missing (default "docs").

    Each output file is named ``{parent}_{stem}.txt``, where ``parent`` is the
    stem of the file's parent directory, or an empty string when that stem
    equals the stem of the input directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-dir", help="input directory with markdown", type=str,
                        default="transformers/docs/source/en/")
    parser.add_argument("--output-dir", help="output directory to store raw texts", type=str,
                        default="docs")

    args = parser.parse_args()
    input_dir = Path(args.input_dir)
    output_dir = Path(args.output_dir)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not input_dir.is_dir():
        parser.error("Input directory doesn't exist")

    output_dir.mkdir(parents=True, exist_ok=True)

    for file in tqdm(input_dir.rglob("*")):
        # rglob("*") also yields directories; only regular files are converted.
        if not file.is_file():
            continue

        try:
            md = file.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # The recursive glob matches every file, including binary assets;
            # skip those instead of aborting the whole run.
            tqdm.write(f"Skipping non-UTF-8 file: {file}")
            continue

        # Files directly inside the input directory get an empty prefix.
        parent = file.parent.stem if file.parent.stem != input_dir.stem else ""
        out_path = output_dir / f"{parent}_{file.stem}.txt"
        out_path.write_text(markdown_to_text(md), encoding="utf-8")


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()