import streamlit as st | |
import pandas as pd | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin, urlparse | |
from datetime import datetime | |
def find_linked_urls_and_title(url, timeout=10):
    """Download *url* and return the anchor hrefs it contains plus its title.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host, which would freeze the whole Streamlit run.

    Returns:
        A ``(urls, page_title)`` tuple. ``urls`` is the set of raw ``href``
        values (possibly relative) of every ``<a>`` tag that has one.
        ``page_title`` is the text of the ``<title>`` tag, or
        ``'No Title Found'`` when the page has none. When the response status
        is not 200, or any exception is raised, a message is written to the
        Streamlit page and ``(set(), 'No Title Found')`` is returned.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            st.write(f"Failed to retrieve {url}")
            return set(), 'No Title Found'

        soup = BeautifulSoup(response.text, 'html.parser')
        # Look each href up once, then drop anchors without the attribute.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        urls = {href for href in hrefs if href is not None}

        title_tag = soup.find('title')
        page_title = title_tag.text if title_tag else 'No Title Found'
        return urls, page_title
    except Exception as e:
        # Deliberately broad: scanning is best-effort, so one bad URL (invalid
        # scheme, DNS failure, timeout, ...) is reported in the UI instead of
        # aborting the scan of the remaining URLs.
        st.write(f"An error occurred with {url}: {e}")
        return set(), 'No Title Found'
def convert_to_absolute_urls(base_url, links):
    """Resolve every link in *links* against *base_url*.

    ``urljoin`` returns a link that is already absolute as an absolute URL,
    so no prefix check is needed. A check such as ``startswith('http')``
    would wrongly treat relative paths like ``http-guide.html`` as absolute
    and leave them unresolved.

    Args:
        base_url: URL of the page the links were found on.
        links: Iterable of href strings, relative or absolute.

    Returns:
        A set of the resolved URLs.
    """
    return {urljoin(base_url, link) for link in links}
def categorize_links(base_url, links):
    """Split *links* into those on the same host as *base_url* and the rest.

    A link is internal when its network location (``urlparse(...).netloc``)
    equals that of *base_url*. Host names are case-insensitive, so the
    comparison is done on lower-cased values. Links without a network
    location (for example ``mailto:`` links) are classed as external unless
    *base_url* has none either.

    Args:
        base_url: URL whose network location defines "internal".
        links: Iterable of absolute URL strings.

    Returns:
        An ``(internal_links, external_links)`` tuple of sets holding the
        original, unmodified link strings.
    """
    # Parse the base once instead of on every loop iteration.
    base_netloc = urlparse(base_url).netloc.lower()
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc.lower() == base_netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
def display_editable_table(df):
    """Show *df* in a Streamlit data editor and return the editor's result.

    The widget is registered under the key ``"data_editor_key"`` and uses
    ``num_rows="dynamic"`` so that rows can be added or deleted.
    """
    return st.data_editor(data=df, key="data_editor_key", num_rows="dynamic")
def prepare_dataframe(df):
    """Ensure *df* has an ``"Ignore"`` column and return the same object.

    When the column is missing it is added in place with every value set to
    ``False``; an existing ``"Ignore"`` column is left untouched.
    """
    already_present = "Ignore" in df.columns
    if already_present:
        return df
    df["Ignore"] = False
    return df
def store_data(df):
    """Save *df* in Streamlit's session state under the ``'data'`` key."""
    session = st.session_state
    session['data'] = df
def main():
    """Render the "Data Source Configuration" page.

    Flow, top to bottom on every Streamlit rerun:

    1. Make sure ``st.session_state['scanned_urls']`` exists as a DataFrame
       with the columns URL / Type / Page Name / Scanned DateTime / Ignore.
    2. Read newline-separated URLs from a text area. When "Scan URLs" is
       pressed, fetch each page, resolve and classify the links found on it,
       append them to ``scanned_urls`` (dropping exact duplicate rows) and
       mirror the result via ``store_data``.
    3. If anything has been scanned, show it in the editable table, keep the
       edited frame in session state and echo the editor's raw widget state.
    4. Offer a button that switches to the next page.
    """
    # menu()
    st.title("Data Source Configuration")

    columns = ['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore']

    # Initialize 'scanned_urls' with all columns, including 'Ignore'.
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = pd.DataFrame(columns=columns)

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:", "")
    url_list = [line.strip() for line in url_input.strip().split('\n') if line.strip()]

    scan_button_clicked = st.button("Scan URLs")
    if scan_button_clicked:
        for page_url in url_list:
            unique_urls, page_title = find_linked_urls_and_title(page_url)
            # One timestamp per scanned page, shared by all links found on it.
            scan_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            absolute_urls = convert_to_absolute_urls(page_url, unique_urls)
            internal_links, external_links = categorize_links(page_url, absolute_urls)

            # 'Page Name' holds the title of the page that was scanned, not
            # the title of the linked page.
            rows = [(link, 'Internal', page_title, scan_datetime, False)
                    for link in internal_links]
            rows += [(link, 'External', page_title, scan_datetime, False)
                     for link in external_links]
            new_entries = pd.DataFrame(rows, columns=columns)

            combined = pd.concat([st.session_state['scanned_urls'], new_entries])
            st.session_state['scanned_urls'] = (
                combined.drop_duplicates().reset_index(drop=True)
            )
            store_data(st.session_state['scanned_urls'])

    if not st.session_state['scanned_urls'].empty:
        # Prepare the dataframe; it already includes the 'Ignore' column.
        prepared_df = prepare_dataframe(st.session_state['scanned_urls'])

        # Display the editable table with an "Ignore" column.
        edited_df = display_editable_table(prepared_df)
        if edited_df is not None:
            st.session_state['scanned_urls'] = edited_df

        # Access the edits made to the table.
        if "data_editor_key" in st.session_state:
            edits = st.session_state["data_editor_key"]
            st.write("Edits made to the table:")
            st.write(edits)

    if st.button('Proceed to Data Organization'):
        # NOTE(review): 'pages/' looks like a truncated path; st.switch_page
        # presumably needs the full page script path (e.g. 'pages/<name>.py').
        # TODO confirm the intended target page.
        st.switch_page('pages/')
if __name__ == "__main__":
    # Run the page only when this file is executed as the entry script.
    main()