|
import streamlit as st
|
|
import re
|
|
import pandas as pd
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
st.title('Site Migration URL Mapper')

# Split on any line boundary and drop blank entries: splitting an empty
# text area on '\n' would otherwise yield [''], a truthy list containing
# an empty-string "URL" that defeats validation downstream.
old_site_urls = [u.strip() for u in st.text_area('Enter the old site URLs (one per line)').splitlines() if u.strip()]
new_site_urls = [u.strip() for u in st.text_area('Enter the new site URLs (one per line)').splitlines() if u.strip()]
|
|
|
|
def preprocess_url(url):
    """Extract keyword text from a URL for TF-IDF comparison.

    Strips the scheme/domain, any query string or fragment, and a trailing
    file extension, then lowercases and replaces every remaining
    non-alphanumeric character with a space so only path keywords remain.

    Args:
        url: The URL string to preprocess.

    Returns:
        A lowercase string of space-separated keywords from the URL path.
    """
    # Drop scheme and domain (e.g. "https://www.example.com").
    processed_url = re.sub(r'https?:\/\/(?:www\.)?[^\/]+', '', url)
    # Drop query string / fragment so the extension removal below still
    # works on URLs like "/page.html?id=1" or "/page.html#top".
    processed_url = re.sub(r'[?#].*$', '', processed_url)
    # Drop a trailing file extension such as ".html".
    processed_url = re.sub(r'\.\w+$', '', processed_url)
    # Keep only lowercase alphanumerics and whitespace; everything else
    # (slashes, hyphens, underscores) becomes a token separator.
    processed_url = re.sub(r'[^a-z0-9\s]', ' ', processed_url.lower())
    return processed_url
|
|
|
|
def map_urls(old_urls, new_urls):
    """Map old site URLs to new site URLs by TF-IDF cosine similarity.

    Each old URL is paired with the single new URL whose preprocessed
    keyword text is most similar.

    Args:
        old_urls: URLs from the site being migrated away from.
        new_urls: Candidate URLs on the new site.

    Returns:
        A list of (old_url, best_matching_new_url) tuples; empty when
        either input has no usable (non-blank) URLs.
    """
    # Ignore blank entries so stray newlines don't become empty documents.
    old_urls = [u for u in old_urls if u.strip()]
    new_urls = [u for u in new_urls if u.strip()]
    # Guard: argmax over zero new-URL columns would raise, and an empty
    # corpus cannot be vectorized.
    if not old_urls or not new_urls:
        return []

    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    # Fit a single vocabulary over both sets so the vectors are comparable.
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)
    except ValueError:
        # Raised when no URL yields any token ("empty vocabulary");
        # there is nothing meaningful to match in that case.
        return []

    old_vectors = tfidf_matrix[:len(old_urls)]
    new_vectors = tfidf_matrix[len(old_urls):]

    # Rows = old URLs, columns = new URLs.
    similarity_matrix = cosine_similarity(old_vectors, new_vectors)

    # Pair each old URL with the highest-scoring new URL.
    return [
        (old_url, new_urls[similarity_matrix[idx].argmax()])
        for idx, old_url in enumerate(old_urls)
    ]
|
|
|
|
if st.button('Generate Mappings'):
    # Filter out blank lines before validating: splitting an empty text
    # area yields [''], which is truthy and would silently bypass the
    # error message below with empty-string "URLs".
    old_urls = [u for u in old_site_urls if u.strip()]
    new_urls = [u for u in new_site_urls if u.strip()]
    if old_urls and new_urls:
        mappings = map_urls(old_urls, new_urls)
        df_mappings = pd.DataFrame(mappings, columns=['Old URL', 'New URL'])
        st.dataframe(df_mappings)

        # Offer the mapping table as a downloadable CSV file.
        csv = df_mappings.to_csv(index=False).encode('utf-8')
        st.download_button("Download Mappings", csv, "url_mappings.csv", "text/csv", key='download-csv')
    else:
        st.error("Please enter URLs for both old and new sites.")
|
|
|