# mpi_data_store/pages/01_data_collection.py
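"""Streamlit page for configuring data sources.

Scans one or more websites for linked URLs, tags each link as internal or
external to the scanned site, records the page title and scan time, and keeps
the resulting table in st.session_state for the follow-on data-organization
page.
"""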
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime
def find_linked_urls_and_title(url):
    """Fetch a page and return the set of raw href values plus the page title."""
    try:
        response = requests.get(url, timeout=10)  # timeout so one dead host cannot hang the scan
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            title_tag = soup.find('title')
            page_title = title_tag.text if title_tag else 'No Title Found'
            return urls, page_title
        else:
            st.write(f"Failed to retrieve {url} (status {response.status_code})")
            return set(), 'No Title Found'
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
        return set(), 'No Title Found'
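# Note: only href attributes present in the served HTML are collected; links
# injected client-side by JavaScript are invisible to requests + BeautifulSoup.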
def convert_to_absolute_urls(base_url, links):
    # urljoin resolves relative hrefs against base_url and passes URLs that
    # already carry a scheme through unchanged, so no startswith check is needed
    return {urljoin(base_url, link) for link in links}
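# Illustrative urljoin behavior from the standard library:
#   urljoin("https://fubarlabs.org", "/about")                 -> "https://fubarlabs.org/about"
#   urljoin("https://fubarlabs.org", "https://example.com/x")  -> "https://example.com/x"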
def categorize_links(base_url, links):
    """Split links into internal/external sets by comparing network locations."""
    internal_links, external_links = set(), set()
    base_netloc = urlparse(base_url).netloc
    for link in links:
        if urlparse(link).netloc == base_netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
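# Note: the check is an exact netloc match, so subdomains count as external
# (urlparse("https://blog.fubarlabs.org").netloc != urlparse("https://fubarlabs.org").netloc).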
def display_editable_table(df):
    # num_rows="dynamic" lets the user add and delete rows directly in the widget
    edited_df = st.data_editor(data=df, key="data_editor_key", num_rows="dynamic")
    return edited_df
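# st.data_editor returns the edited copy of the dataframe on every rerun, while
# the per-cell edit log (edited/added/deleted rows) is kept separately under
# st.session_state["data_editor_key"]; main() below displays that log directly.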
def prepare_dataframe(df):
    if "Ignore" not in df.columns:
        df["Ignore"] = False  # initialize every row as not ignored
    return df
def store_data(df):
    st.session_state['data'] = df
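# Note: st.session_state is shared across pages of a Streamlit multipage app,
# so the frame stored under 'data' here is also visible from
# pages/02_data_organization.py.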
def main():
    # menu()
    st.title("Data Source Configuration")

    # Initialize 'scanned_urls' with all columns, including 'Ignore'
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = pd.DataFrame(
            columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:", "https://fubarlabs.org")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]
    scan_button_clicked = st.button("Scan URLs")

    if scan_button_clicked:
        for url in url_list:
            unique_urls, page_title = find_linked_urls_and_title(url)
            scan_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            new_entries = pd.DataFrame(
                [(link, 'Internal', page_title, scan_datetime, False) for link in internal_links] +
                [(link, 'External', page_title, scan_datetime, False) for link in external_links],
                columns=['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore'])
            st.session_state['scanned_urls'] = (
                pd.concat([st.session_state['scanned_urls'], new_entries])
                .drop_duplicates()
                .reset_index(drop=True))
            store_data(st.session_state['scanned_urls'])

    if not st.session_state['scanned_urls'].empty:
        # Prepare the dataframe; this includes the 'Ignore' column from the start
        prepared_df = prepare_dataframe(st.session_state['scanned_urls'])
        # Display the editable table with an "Ignore" column
        edited_df = display_editable_table(prepared_df)
        if edited_df is not None:
            st.session_state['scanned_urls'] = edited_df

        # Surface the raw edit log Streamlit keeps under the editor's key
        if "data_editor_key" in st.session_state:
            edits = st.session_state["data_editor_key"]
            st.write("Edits made to the table:")
            st.write(edits)

    if st.button('Proceed to Data Organization'):
        st.switch_page('pages/02_data_organization.py')
if __name__ == "__main__":
    main()
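# To try the page locally, assuming the usual Streamlit multipage layout with a
# top-level entry script next to the pages/ directory (app.py is only a
# placeholder name here):
#
#   streamlit run app.py
#
# Streamlit then picks up pages/01_data_collection.py automatically and lists
# it in the sidebar.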