import requests
from bs4 import BeautifulSoup as bs


def crawl_page(url):
    """Fetch a page and return the response, or None on failure."""
    response = requests.get(url)
    if response.status_code == 200:
        return response
    else:
        print(f"Failed to retrieve {url}")
        return None


def extract_category_links(html_content):
    """Collect links to subcategory pages, filtering out unwanted categories."""
    soup = bs(html_content, "html.parser")
    links = soup.find_all("a", href=True)
    category_links = [
        f"https://de.wikipedia.org{link['href']}"
        for link in links
        if "/wiki/Kategorie:" in link["href"]
        and "wikivoyage" not in link["href"]
        and "Region" not in link["href"]
        and "Kategorie:Portal" not in link["href"]
        and "Kategorie:Emsland" not in link["href"]
        and "(Deutschland)" not in link["href"]
        and "(Niedersachsen)" not in link["href"]
    ]
    return category_links


def extract_direct_links(html_content):
    """Collect links to article pages, skipping non-article namespaces."""
    soup = bs(html_content, "html.parser")
    links = soup.find_all("a", href=True)
    direct_links = [
        f"https://de.wikipedia.org{link['href']}"
        for link in links
        if link["href"].startswith("/wiki/")
        and "Wikipedia:" not in link["href"]
        and "Spezial:" not in link["href"]
        and "Hilfe:" not in link["href"]
        and "Portal:" not in link["href"]
        and "Kategorie:" not in link["href"]
    ]
    return direct_links


if __name__ == "__main__":
    start_url = "https://de.wikipedia.org/wiki/Kategorie:Emsland"
    links = []
    level = 0
    visited = {start_url}

    page = crawl_page(start_url)
    if page is None:
        raise SystemExit("Could not fetch the start category page")
    next_categories = extract_category_links(page.content)
    links += extract_direct_links(page.content)

    # Breadth-first traversal of the category tree, one level per iteration.
    while len(next_categories) > 0:
        level += 1  # was `level = +1`, which reset the counter to 1 each pass
        new_categories = []  # must live outside the for loop so results accumulate
        for category in next_categories:
            page = crawl_page(category)
            if page is None:
                continue  # skip categories that failed to load
            links += extract_direct_links(page.content)
            visited.add(category)
            # Track every visited category, not just the current one,
            # so back-links to parent categories cannot cause an endless loop.
            new_categories += [
                cat
                for cat in extract_category_links(page.content)
                if cat not in visited and cat not in new_categories
            ]
        next_categories = new_categories

    with open("links.txt", "w") as fp:
        for item in links:
            fp.write("%s\n" % item)
    print("done")