Spaces:

vsarathy
/

mpep-qa

Runtime error

mpep-qa / scripts /get_mpep.py

Adding data (MPEP 2100) from USPTO

70c6845 over 1 year ago

1.17 kB

	# This module pulls the MPEP day from the internet
	# Saves each chapter as a separate document

	import requests
	from bs4 import BeautifulSoup
	import os
	import markdownify
	from tqdm import tqdm

	chapter_numbers = list(range(100,3000,100))

	for chapter in tqdm(chapter_numbers, desc=" outer", position=0):
	URL = f"https://www.uspto.gov/web/offices/pac/mpep/mpep-{chapter:04}.html" #cast into four digits
	page = requests.get(URL)
	os.mkdir(f"data/{chapter:04}")
	soup = BeautifulSoup(page.content, "html.parser")
	div = soup.find("div", {"id": "article"})
	sections = div.find_all("li")
	for section in tqdm(sections, desc=" inner loop", position=1, leave=False):
	section_number = section.find('a')['href']
	URL = f"https://www.uspto.gov/web/offices/pac/mpep/{section_number}"
	page = requests.get(URL)
	soup = BeautifulSoup(page.content, "html.parser")
	div = soup.find_all("div", class_="Section")

	h = markdownify.markdownify(str(div), heading_style="ATX")

	filename = f"data/{chapter:04}/{section_number}.txt"
	with open(filename, "w") as file:
	file.write(h)