# mpi_data_store/pages/file_web_source_collection.py
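"""Streamlit page for collecting web data sources.

Scans user-supplied URLs for linked pages, splits the results into internal
and external links, lets the user pick internal links via checkboxes, and
offers the selection as a CSV download.
"""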
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def find_linked_urls(url):
    """Fetch a page and return the set of raw href values from its <a> tags."""
    try:
        # A timeout keeps the app from hanging on an unresponsive host.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            return urls
        else:
            st.write(f"Failed to retrieve {url} (status {response.status_code})")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    return set()
def convert_to_absolute_urls(base_url, links):
    # Resolve relative links against base_url; links already starting with 'http' pass through.
    return {urljoin(base_url, link) if not link.startswith('http') else link for link in links}
def categorize_links(base_url, links):
    """Split links into internal (same netloc as base_url) and external sets."""
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc == urlparse(base_url).netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
def main():
    st.title("Data Source Configuration")

    # Persist scan results across Streamlit reruns.
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = {}

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]  # Split and clean input
    scan_button_clicked = st.button("Scan URLs")

    if scan_button_clicked or st.session_state['scanned_urls']:
        if scan_button_clicked:
            for url in url_list:
                unique_urls = find_linked_urls(url)
                absolute_urls = convert_to_absolute_urls(url, unique_urls)
                internal_links, external_links = categorize_links(url, absolute_urls)
                st.session_state['scanned_urls'][url] = {"internal": internal_links, "external": external_links}

        selected_urls = []
        for base_url, links in st.session_state['scanned_urls'].items():
            st.write(f"Base URL: {base_url}")
            include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")
            if include_all_internal:
                selected_urls.extend(links["internal"])
            else:
                # One checkbox per internal link; the link itself serves as a unique widget key.
                # sorted() keeps the checkbox order stable across reruns.
                selected_internal = [link for link in sorted(links["internal"]) if st.checkbox(link, key=link)]
                selected_urls.extend(selected_internal)
            if links["external"]:
                st.write("External links:")
                for link in sorted(links["external"]):
                    st.write(link)

        if selected_urls:
            df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
            st.write(df_selected_urls)
            st.session_state['selected_urls'] = df_selected_urls

            # Convert the DataFrame to CSV for download.
            csv = df_selected_urls.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download selected URLs as CSV",
                data=csv,
                file_name='selected_urls.csv',
                mime='text/csv',
            )
if __name__ == "__main__":
    main()
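
# To try this page on its own (assuming Streamlit is installed and the repo
# layout matches the path above):
#   streamlit run pages/file_web_source_collection.py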