File size: 1,166 Bytes
70c6845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# This module pulls the MPEP data from the internet
# and saves each section as a separate document, grouped into one directory per chapter

import requests
from bs4 import BeautifulSoup
import os
import markdownify
from tqdm import tqdm

# Root of the MPEP pages on the USPTO site; chapter and section pages both
# live directly under it.
BASE_URL = "https://www.uspto.gov/web/offices/pac/mpep"

# Seconds to wait for the server before giving up on a request. Without a
# timeout, a single stalled connection would hang the whole scrape forever.
REQUEST_TIMEOUT = 30

# Chapter numbers to fetch: 100, 200, ..., 2900.
chapter_numbers = list(range(100, 3000, 100))


def _fetch_soup(url):
    """Download *url* and return its content parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status, so error pages are never parsed as
            if they were real content.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


for chapter in tqdm(chapter_numbers, desc=" outer", position=0):
    # Chapter numbers are zero-padded to four digits in the page name.
    chapter_url = f"{BASE_URL}/mpep-{chapter:04}.html"
    try:
        chapter_soup = _fetch_soup(chapter_url)
    except requests.RequestException as err:
        # tqdm.write prints without corrupting the progress bars.
        tqdm.write(f"Skipping chapter {chapter:04}: {err}")
        continue

    article = chapter_soup.find("div", {"id": "article"})
    if article is None:
        tqdm.write(f"Skipping chapter {chapter:04}: no 'article' div in {chapter_url}")
        continue

    # makedirs with exist_ok creates the parent "data" directory when it is
    # missing and lets the script be re-run without failing on directories
    # left over from a previous run.
    chapter_dir = f"data/{chapter:04}"
    os.makedirs(chapter_dir, exist_ok=True)

    sections = article.find_all("li")
    for section in tqdm(sections, desc=" inner loop", position=1, leave=False):
        link = section.find("a", href=True)
        if link is None:
            # List item without a link: nothing to download.
            continue
        section_number = link["href"]

        section_url = f"{BASE_URL}/{section_number}"
        try:
            section_soup = _fetch_soup(section_url)
        except requests.RequestException as err:
            tqdm.write(f"Skipping section {section_number}: {err}")
            continue

        # find_all returns a list-like ResultSet; stringify each element and
        # join them so the list's own "[", ", " and "]" do not end up in the
        # converted Markdown.
        section_divs = section_soup.find_all("div", class_="Section")
        section_html = "".join(str(section_div) for section_div in section_divs)
        markdown_text = markdownify.markdownify(section_html, heading_style="ATX")

        filename = f"{chapter_dir}/{section_number}.txt"
        # Explicit UTF-8 so non-ASCII characters in the text do not depend on
        # the platform's default encoding.
        with open(filename, "w", encoding="utf-8") as file:
            file.write(markdown_text)