import requests
from bs4 import BeautifulSoup as bs


def crawl_page(url):
    """Fetch a page and return the response, or None on failure."""
    response = requests.get(url)
    if response.status_code == 200:
        return response
    else:
        print(f"Failed to retrieve {url}")
        return None


def extract_category_links(html_content):
    """Collect links to subcategory pages, filtering out unwanted categories."""
    soup = bs(html_content, "html.parser")
    links = soup.find_all("a", href=True)
    category_links = [
        f"https://de.wikipedia.org{link['href']}"
        for link in links
        if "/wiki/Kategorie:" in link["href"]
        and "wikivoyage" not in link["href"]
        and "Region" not in link["href"]
        and "Kategorie:Portal" not in link["href"]
        and "Kategorie:Emsland" not in link["href"]
        and "(Deutschland)" not in link["href"]
        and "(Niedersachsen)" not in link["href"]
    ]
    return category_links


def extract_direct_links(html_content):
    """Collect links to article pages, skipping non-article namespaces."""
    soup = bs(html_content, "html.parser")
    links = soup.find_all("a", href=True)
    direct_links = [
        f"https://de.wikipedia.org{link['href']}"
        for link in links
        if link["href"].startswith("/wiki/")
        and "Wikipedia:" not in link["href"]
        and "Spezial:" not in link["href"]
        and "Hilfe:" not in link["href"]
        and "Portal:" not in link["href"]
        and "Kategorie:" not in link["href"]
    ]
    return direct_links


if __name__ == "__main__":
    start_url = "https://de.wikipedia.org/wiki/Kategorie:Emsland"
    links = []
    level = 0
    visited = {start_url}

    page = crawl_page(start_url)
    if page is None:
        raise SystemExit("Could not fetch the start category page")
    next_categories = extract_category_links(page.content)
    links += extract_direct_links(page.content)

    # Breadth-first traversal of the category tree, one level per iteration.
    while len(next_categories) > 0:
        level += 1  # was `level = +1`, which reset the counter to 1 each pass
        new_categories = []  # must live outside the for loop so results accumulate
        for category in next_categories:
            page = crawl_page(category)
            if page is None:
                continue  # skip categories that failed to load
            links += extract_direct_links(page.content)
            visited.add(category)
            # Track every visited category, not just the current one,
            # so back-links to parent categories cannot cause an endless loop.
            new_categories += [
                cat
                for cat in extract_category_links(page.content)
                if cat not in visited and cat not in new_categories
            ]
        next_categories = new_categories

    with open("links.txt", "w") as fp:
        for item in links:
            fp.write("%s\n" % item)
    print("done")