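# Streamlit app: scan one or more websites, collect the URLs they link to,
# let the user select internal links, and export the selection as a CSV file.
# Requires: streamlit, pandas, requests, beautifulsoup4
# Run with: streamlit run app.py  (assuming the script is saved as app.py)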
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def find_linked_urls(url):
    """Fetch a page and return the set of href values found in its <a> tags."""
    try:
        # The timeout is an added safeguard so a slow or unresponsive site cannot hang the app.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            return urls
        else:
            st.write(f"Failed to retrieve {url}")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    return set()
def convert_to_absolute_urls(base_url, links):
    """Resolve relative hrefs against the base URL; absolute links pass through unchanged."""
    return {urljoin(base_url, link) if not link.startswith('http') else link for link in links}
def categorize_links(base_url, links):
    """Split links into internal (same domain as base_url) and external sets."""
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc == urlparse(base_url).netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
def main():
    st.title("Data Source Configuration")

    # Persist scan results across reruns so checkbox interactions do not discard them.
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = {}

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]  # split and clean input
    scan_button_clicked = st.button("Scan URLs")

    if scan_button_clicked or st.session_state['scanned_urls']:
        if scan_button_clicked:
            for url in url_list:
                unique_urls = find_linked_urls(url)
                absolute_urls = convert_to_absolute_urls(url, unique_urls)
                internal_links, external_links = categorize_links(url, absolute_urls)
                st.session_state['scanned_urls'][url] = {"internal": internal_links, "external": external_links}

        # Let the user pick internal links; external links are listed for reference only.
        selected_urls = []
        for base_url, links in st.session_state['scanned_urls'].items():
            st.write(f"Base URL: {base_url}")
            include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")
            if include_all_internal:
                selected_urls.extend(links["internal"])
            else:
                selected_internal = [link for link in links["internal"] if st.checkbox(link, key=link)]
                selected_urls.extend(selected_internal)
            if links["external"]:
                st.write("External links:")
                for link in links["external"]:
                    st.write(link)

        if selected_urls:
            df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
            st.write(df_selected_urls)
            st.session_state['selected_urls'] = df_selected_urls

            # Convert the DataFrame to CSV for download
            csv = df_selected_urls.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download selected URLs as CSV",
                data=csv,
                file_name='selected_urls.csv',
                mime='text/csv',
            )
if __name__ == "__main__":
    main()