Spaces:
Runtime error
Runtime error
File size: 1,166 Bytes
70c6845 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# This module pulls the MPEP data from the internet
# Saves each chapter as a separate document
import requests
from bs4 import BeautifulSoup
import os
import markdownify
from tqdm import tqdm
# Scrape every MPEP chapter index page (chapters 0100 through 2900), follow
# each section link listed in the chapter's "article" div, convert the
# section's HTML to Markdown, and save it as data/<chapter>/<section href>.txt.
BASE_URL = "https://www.uspto.gov/web/offices/pac/mpep"
# requests applies no timeout by default, so a stalled connection would hang
# the whole scrape; bound every request explicitly.
REQUEST_TIMEOUT = 30  # seconds

chapter_numbers = list(range(100, 3000, 100))
for chapter in tqdm(chapter_numbers, desc=" outer", position=0):
    chapter_url = f"{BASE_URL}/mpep-{chapter:04}.html"  # zero-pad to four digits
    chapter_page = requests.get(chapter_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    chapter_page.raise_for_status()

    # makedirs creates the parent "data" directory when it is missing and
    # does not fail when the chapter directory already exists (re-runs).
    chapter_dir = f"data/{chapter:04}"
    os.makedirs(chapter_dir, exist_ok=True)

    chapter_soup = BeautifulSoup(chapter_page.content, "html.parser")
    article = chapter_soup.find("div", {"id": "article"})
    if article is None:
        # find() returns None when the expected container is absent.
        tqdm.write(f"No 'article' div found at {chapter_url}; skipping chapter")
        continue

    sections = article.find_all("li")
    for section in tqdm(sections, desc=" inner loop", position=1, leave=False):
        link = section.find("a")
        if link is None or not link.get("href"):
            # List items without a link carry no section to download.
            continue
        section_number = link["href"]

        section_url = f"{BASE_URL}/{section_number}"
        section_page = requests.get(section_url, timeout=REQUEST_TIMEOUT)
        section_page.raise_for_status()

        section_soup = BeautifulSoup(section_page.content, "html.parser")
        section_divs = section_soup.find_all("div", class_="Section")
        # find_all returns a list-like result; stringify each element and join
        # them so the list's "[", ", " and "]" do not leak into the Markdown.
        section_html = "".join(str(section_div) for section_div in section_divs)
        h = markdownify.markdownify(section_html, heading_style="ATX")

        filename = f"{chapter_dir}/{section_number}.txt"
        # Explicit UTF-8 so non-ASCII characters in the text are written
        # identically regardless of the platform's default encoding.
        with open(filename, "w", encoding="utf-8") as file:
            file.write(h)
|