import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from advertools import sitemap_to_df, word_frequency

# Generic URL-path words (site sections such as "blog" or "category") that are
# dropped from slugs before word/phrase frequencies are computed.
# Membership is tested against the lower-cased word, so keep entries lower-case.
common_words = {
    "author",
    "category",
    "product",
    "authors",
    "categories",
    "products",
    "blog",
    "blogs",
}

# Sidebar help: usage instructions, credits, and a pointer to a sitemap finder.
st.sidebar.markdown("### How to use this Sitemap Analyzer")

# Blank lines inside the markdown string separate the rendered paragraphs.
st.sidebar.markdown("""
This sitemap analyzer shows you how many pages each domain has published over a period of time.

To use it, input the client's sitemap on "Input client sitemap here" and put up to 3 competitor sitemaps below it, pressing enter after every time you put the sitemap URL.

Credits to [Advertools](https://github.com/eliasdabbas/advertools) and [holisticseo.digital](https://www.holisticseo.digital/python-seo/content-analysis-with-sitemaps/)""")

st.sidebar.markdown("You can use this tool to detect or guess where the sitemap of each domain can be: [Free Sitemap Finder & Checker Tool](https://seomator.com/sitemap-finder)")

st.sidebar.markdown("## Tool uploaded and maintained by: [Blazing SEO](http://blazing-seo.com/)")

# Sidebar inputs: the client's sitemap first, then up to three competitor
# sitemaps. Each field defaults to an empty string.
sitemap_urls = [
    st.sidebar.text_input("Input client sitemap here:", ""),
    st.sidebar.text_input("Enter the competitor sitemap URL 1:", ""),
    st.sidebar.text_input("Enter the competitor sitemap URL 2:", ""),
    st.sidebar.text_input("Enter the competitor sitemap URL 3:", ""),
]

# Bar labels for the weekday chart, in pandas ``dayofweek`` order (0 = Monday).
_WEEKDAY_LABELS = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


def _slug_from_url(url):
    """Return the last path segment of *url* with hyphens turned into spaces.

    Trailing slashes are ignored, so ``https://site.com/blog/my-post/`` and
    ``https://site.com/blog/my-post`` both yield ``"my post"``.
    """
    return url.rstrip("/").rsplit("/", 1)[-1].replace("-", " ")


def _weekday_counts(lastmod):
    """Count entries per weekday in a datetime Series, Monday through Sunday.

    Missing dates are ignored, and weekdays with no entries are reported as 0
    so the result always has exactly seven rows matching ``_WEEKDAY_LABELS``.
    """
    counts = (
        lastmod.dropna()
        .dt.dayofweek
        .value_counts()
        .reindex(range(7), fill_value=0)
    )
    counts.index = _WEEKDAY_LABELS
    return counts


# Analyze every sitemap the user entered: word/phrase frequencies of the URL
# slugs, publishing trends based on ``lastmod``, and the total URL count.
for sitemap_url in sitemap_urls:
    # Ignore empty fields and stray whitespace from pasted URLs.
    sitemap_url = sitemap_url.strip()
    if not sitemap_url:
        continue

    try:
        sitemap_data = sitemap_to_df(sitemap_url)

        # Not every sitemap has a lastmod column; without one only the trend
        # charts are skipped instead of aborting the whole analysis.
        has_lastmod = 'lastmod' in sitemap_data.columns
        if has_lastmod:
            sitemap_data['lastmod'] = pd.to_datetime(sitemap_data['lastmod'])

        # Approximate article titles from URL slugs, minus generic section words.
        slugs = sitemap_data['loc'].apply(_slug_from_url)
        slugs_filtered = [
            ' '.join(word for word in slug.split() if word.lower() not in common_words)
            for slug in slugs
        ]

        for phrase_len, description in (
            (1, "words"),
            (2, "two-word phrases"),
            (3, "three-word phrases"),
        ):
            frequencies = word_frequency(slugs_filtered, phrase_len=phrase_len)
            st.subheader(f"Most-frequently used {description} in article titles for {sitemap_url} (excluding common words)")
            st.dataframe(frequencies.head(100))

        if has_lastmod:
            # A rule of None selects the weekday distribution instead of a resample.
            # NOTE(review): pandas >= 2.2 deprecates the "A"/"M" aliases in favour
            # of "YE"/"ME"; kept as-is for compatibility with older pandas.
            for trend_name, resample_rule in (("Yearly", "A"), ("Monthly", "M"), ("Weekly", None)):
                st.subheader(f"{trend_name} Trends for {sitemap_url}")

                if resample_rule is None:
                    trends = _weekday_counts(sitemap_data['lastmod'])
                else:
                    trends = sitemap_data.resample(resample_rule, on='lastmod').size()
                    trends.index = trends.index.strftime('%Y-%m-%d')

                fig, ax = plt.subplots(figsize=(10, 6))
                try:
                    ax.bar(trends.index, trends.values)
                    ax.set_ylabel("Count")
                    ax.set_title(f"{trend_name} Trends")
                    st.pyplot(fig)
                finally:
                    # pyplot keeps every figure alive until closed; without this
                    # they pile up on each Streamlit rerun.
                    plt.close(fig)
        else:
            st.write(f"No lastmod dates found for {sitemap_url}; trend charts are unavailable.")

        st.subheader(f"Total Number of URLs for {sitemap_url}")
        total_urls = len(sitemap_data)
        st.write(f"The total number of URLs in the sitemap is {total_urls}.")

    except Exception as e:
        # UI boundary: report the failure for this sitemap and move on to the next.
        st.write(f"An error occurred for {sitemap_url}:", str(e))