"""SiteFocus Tool: a Streamlit app that scores how thematically focused a site is.

The app collects page URLs from a domain's sitemaps, turns each URL path into
plain text, embeds that text with the LaBSE sentence-transformer model, and
reports how far the pages sit from their common centroid.
"""

import re
from urllib.parse import unquote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from bs4 import BeautifulSoup
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Shared HTTP settings for every sitemap / robots.txt request.
REQUEST_HEADERS = {"User-Agent": "SiteFocusTool/1.0"}
REQUEST_TIMEOUT = 10

# URLs ending in one of these extensions are treated as images and skipped.
IMAGE_URL_PATTERN = re.compile(
    r"\.(jpg|jpeg|png|gif|svg|webp|bmp|tif|tiff)$", re.IGNORECASE
)


# Load the LaBSE model
@st.cache_resource
def load_model():
    """Return the LaBSE SentenceTransformer (cached by ``st.cache_resource``)."""
    return SentenceTransformer("sentence-transformers/LaBSE")


model = load_model()


def _normalize_domain(domain):
    """Return *domain* without scheme, surrounding whitespace, or slashes."""
    domain = re.sub(r"^https?://", "", domain.strip(), flags=re.IGNORECASE)
    return domain.strip("/")


def _sitemaps_from_robots(domain):
    """Return the sitemap URLs declared by ``Sitemap:`` lines in robots.txt."""
    try:
        response = requests.get(
            f"https://{domain}/robots.txt",
            headers=REQUEST_HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: a missing or unreachable robots.txt is not an error.
        return []

    declared = []
    for line in response.text.splitlines():
        # Split on the first colon only; the URL itself contains colons.
        key, sep, value = line.partition(":")
        value = value.strip()
        if sep and key.strip().lower() == "sitemap" and value:
            declared.append(value)
    return declared


def fetch_sitemap_urls(domain):
    """Fetch and parse URLs from sitemaps, excluding images and handling nested sitemaps.

    Tries ``/sitemap.xml``, ``/sitemap_index.xml`` and every sitemap declared
    in ``/robots.txt``. Each sitemap is downloaded at most once.

    Args:
        domain: Site domain, with or without an ``http(s)://`` prefix.

    Returns:
        A list of unique page URLs in first-seen order (empty if none found).
    """
    domain = _normalize_domain(domain)
    visited = set()  # shared across all entry points so nothing is fetched twice
    all_urls = []

    candidates = [
        f"https://{domain}/sitemap.xml",
        f"https://{domain}/sitemap_index.xml",
    ]
    candidates.extend(_sitemaps_from_robots(domain))

    for sitemap_url in candidates:
        all_urls.extend(
            fetch_sitemap_urls_from_xml(
                sitemap_url, domain, recursive=True, _visited=visited
            )
        )

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(all_urls))


def fetch_sitemap_urls_from_xml(sitemap_url, domain, recursive=False, _visited=None):
    """Fetch URLs from a sitemap XML file.

    Args:
        sitemap_url: URL of the sitemap (or sitemap index) to download.
        domain: Site domain; passed through to nested calls.
        recursive: When true, follow ``<sitemap>`` entries of a sitemap index.
            When false, a sitemap index yields no URLs.
        _visited: Internal set of sitemap URLs already processed; used to
            avoid duplicate downloads and endless loops on cyclic indexes.

    Returns:
        A list of non-image ``<loc>`` URLs. Request failures yield ``[]``.
    """
    if _visited is None:
        _visited = set()
    if sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    try:
        response = requests.get(
            sitemap_url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
    except requests.RequestException:
        return []

    soup = BeautifulSoup(response.content, "lxml-xml")  # Use lxml parser
    urls = []

    child_sitemaps = soup.find_all("sitemap")
    if child_sitemaps:
        if not recursive:
            return urls
        for sitemap in child_sitemaps:
            loc = sitemap.find("loc")
            # Skip malformed entries that have no <loc> or an empty one.
            child_url = loc.get_text(strip=True) if loc is not None else ""
            if child_url:
                urls.extend(
                    fetch_sitemap_urls_from_xml(
                        child_url, domain, recursive=True, _visited=_visited
                    )
                )
        return urls

    for loc in soup.find_all("loc"):
        # Strip whitespace so the "$"-anchored image filter sees the real end.
        url = loc.get_text(strip=True)
        if url and not IMAGE_URL_PATTERN.search(url):
            urls.append(url)
    return urls


def clean_text_from_url(url, domain):
    """Clean URL by removing root domain and extracting readable text.

    The scheme and host (with or without a ``www.`` prefix) are removed,
    percent-escapes are decoded, and every run of punctuation or underscores
    becomes a single space.

    Args:
        url: Full page URL.
        domain: Site domain, with or without an ``http(s)://`` prefix.

    Returns:
        The readable words of the URL path, separated by single spaces.
    """
    host = _normalize_domain(domain)
    if host.lower().startswith("www."):
        host = host[4:]

    # Anchor at the start and require the host to end at "/" or end-of-string
    # so that "example.com" does not also match "example.com.evil.org".
    prefix = re.compile(
        rf"^https?://(?:www\.)?{re.escape(host)}(?=/|$)", re.IGNORECASE
    )
    path = unquote(prefix.sub("", url.strip()))
    return re.sub(r"[\W_]+", " ", path).strip()


def _encode_normalized(texts):
    """Encode *texts* in a single batch and scale each vector to unit length."""
    texts = list(texts)
    if not texts:
        return np.empty((0, 0))
    vectors = np.asarray(model.encode(texts))
    lengths = norm(vectors, axis=1, keepdims=True)
    lengths[lengths == 0] = 1  # leave all-zero vectors untouched instead of NaN
    return vectors / lengths


def compute_embeddings(data):
    """Generate normalized embeddings for the cleaned text.

    Args:
        data: DataFrame with a ``"Cleaned Text"`` column.

    Returns:
        The same DataFrame, with an ``"Embedding"`` column added in place.
    """
    vectors = _encode_normalized(data["Cleaned Text"].tolist())
    data["Embedding"] = pd.Series(list(vectors), index=data.index, dtype=object)
    return data


def calculate_site_focus_and_radius(embeddings):
    """Calculate site focus score and site radius.

    Args:
        embeddings: Non-empty 2-D array-like, one embedding per row.

    Returns:
        A tuple ``(site_focus_score, site_radius, centroid_embedding,
        deviations)`` where ``deviations`` is a list holding each row's
        cosine distance to the centroid, ``site_radius`` is their mean and
        ``site_focus_score`` is ``max(0, 1 - site_radius)``.

    Raises:
        ValueError: If *embeddings* is empty or not two-dimensional.
    """
    embeddings = np.asarray(embeddings)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array")

    centroid_embedding = embeddings.mean(axis=0)
    similarities = cosine_similarity(
        embeddings, centroid_embedding.reshape(1, -1)
    ).ravel()
    deviations = (1 - similarities).tolist()
    site_radius = float(np.mean(deviations))
    site_focus_score = max(0.0, 1 - site_radius)
    return site_focus_score, site_radius, centroid_embedding, deviations


def plot_gradient_strip_with_indicator(score, title, cmap="RdYlGn_r"):
    """Visualize the score as a gradient strip with an indicator.

    Args:
        score: Value to mark; the indicator is clamped to the ``[0, 1]`` strip.
        title: Label shown above the strip together with the percentage.
        cmap: Matplotlib colormap name. The default runs green (0) to red (1),
            which suits "lower is better" metrics; pass ``"RdYlGn"`` for
            "higher is better" metrics.
    """
    resolution = 256
    gradient = np.tile(np.linspace(0, 1, resolution), (2, 1))
    # Pixel columns run 0..resolution-1, so scale by resolution-1 to keep a
    # score of 1.0 on the strip.
    position = min(max(float(score), 0.0), 1.0) * (resolution - 1)

    fig, ax = plt.subplots(figsize=(8, 1))
    ax.imshow(gradient, aspect="auto", cmap=cmap)
    ax.axvline(x=position, color="black", linestyle="--", linewidth=2)
    ax.set_axis_off()
    ax.set_title(f"{title}: {score * 100:.2f}%")
    st.pyplot(fig)
    plt.close(fig)  # release the figure; Streamlit reruns would pile them up


def plot_3d_tsne(embeddings, urls, centroid, deviations):
    """Interactive 3D t-SNE scatter plot with hover labels.

    Args:
        embeddings: 2-D array, one page embedding per row.
        urls: Page URLs, used as hover labels (same order as *embeddings*).
        centroid: Centroid embedding, projected together with the pages.
        deviations: Per-page deviation values used for colouring.
    """
    page_count = len(embeddings)
    if page_count < 2:
        # perplexity would be 0, which TSNE rejects.
        st.info("At least two pages are needed for the t-SNE projection.")
        return

    tsne = TSNE(
        n_components=3, random_state=42, perplexity=min(30, page_count - 1)
    )
    projected = tsne.fit_transform(np.vstack([embeddings, centroid]))
    centroid_tsne = projected[-1]  # Last point is the centroid
    page_points = projected[:-1]  # Remaining points are pages

    fig = px.scatter_3d(
        x=page_points[:, 0],
        y=page_points[:, 1],
        z=page_points[:, 2],
        color=deviations,
        color_continuous_scale="RdYlGn_r",
        hover_name=urls,
        labels={"color": "Deviation"},
        title="3D t-SNE Projection of Page Embeddings",
    )
    fig.add_scatter3d(
        x=[centroid_tsne[0]],
        y=[centroid_tsne[1]],
        z=[centroid_tsne[2]],
        mode="markers",
        marker=dict(size=15, color="green"),
        name="Centroid",
    )
    st.plotly_chart(fig)


def plot_spherical_distances_optimized(deviations, embeddings, urls):
    """Polar scatter plot of page deviations with evenly spread angles.

    Args:
        deviations: Per-page deviation values; used as radius and colour.
        embeddings: Unused; kept so existing callers keep working.
        urls: Page URLs, shown on hover (same order as *deviations*).
    """
    num_points = len(deviations)
    # Spread angles evenly
    angles = np.linspace(0, 2 * np.pi, num_points, endpoint=False)

    # Create polar scatter plot
    fig = px.scatter_polar(
        r=deviations,
        theta=np.degrees(angles),
        color=deviations,
        color_continuous_scale="RdYlGn_r",
        title="Optimized Spherical Plot of Page Distances from Centroid",
        labels={"color": "Deviation"},
    )

    # Update traces to show text (labels) only on hover
    fig.update_traces(
        mode="markers",  # Display only markers by default
        hovertemplate="%{text}",  # Show text on hover
        text=urls,  # Set URLs as hover labels
    )
    st.plotly_chart(fig)


def _run_analysis(domain):
    """Fetch the site's URLs, score them and render every result section."""
    if not domain or not domain.strip():
        st.warning("Please enter a domain.")
        return

    urls = fetch_sitemap_urls(domain)
    if not urls:
        st.error("No URLs found. Please check the domain and try again.")
        return

    cleaned_texts = [clean_text_from_url(url, domain) for url in urls]
    # One batched encode call; each text is embedded exactly once.
    embeddings = _encode_normalized(cleaned_texts)
    site_focus_score, site_radius, centroid, deviations = (
        calculate_site_focus_and_radius(embeddings)
    )

    # Visualize siteFocusScore
    st.subheader("siteFocusScore")
    st.markdown("**Description:** The siteFocusScore reflects how tightly aligned a site's content is to a single thematic area. A higher score indicates greater thematic focus, which can improve topical authority in SEO.")
    # Higher is better here, so use the non-reversed map (red = low, green = high).
    plot_gradient_strip_with_indicator(
        site_focus_score, "siteFocusScore", cmap="RdYlGn"
    )

    # Visualize siteRadius
    st.subheader("siteRadius")
    st.markdown("**Description:** The siteRadius measures how far individual pages deviate from the site's central theme. A smaller radius indicates higher consistency across the site, which is beneficial for SEO.")
    plot_gradient_strip_with_indicator(site_radius, "siteRadius")

    # Sorted dataframe by closeness to centroid
    st.subheader("Pages Closest to Centroid")
    distances = [1 - dev for dev in deviations]
    df = pd.DataFrame({"URL": urls, "Proximity to Centroid": distances})
    df_sorted = df.sort_values(by="Proximity to Centroid", ascending=False)
    st.dataframe(df_sorted)

    # Interactive 3D t-SNE plot
    st.subheader("3D t-SNE Projection")
    plot_3d_tsne(embeddings, urls, centroid, deviations)

    # Optimized spherical distance plot
    st.subheader("Spherical Distance Plot")
    plot_spherical_distances_optimized(deviations, embeddings, urls)


# Streamlit Interface
st.title("SiteFocus Tool")
domain = st.text_input("Enter domain:", placeholder="example.com")
if st.button("START"):
    _run_analysis(domain)