migrator / app.py
dejanseo's picture
Upload app.py
b751684 verified
raw
history blame
2.1 kB
import streamlit as st
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Streamlit app interface
st.title('Site Migration URL Mapper')

# Text areas for old and new site URLs.
# NOTE: splitting an empty text area on '\n' yields [''] — a truthy,
# non-empty list — so strip each line and drop blank entries here;
# otherwise blank lines are fed into the TF-IDF mapper as empty documents.
old_site_urls = [u.strip() for u in st.text_area('Enter the old site URLs (one per line)').split('\n') if u.strip()]
new_site_urls = [u.strip() for u in st.text_area('Enter the new site URLs (one per line)').split('\n') if u.strip()]
def preprocess_url(url):
    """Reduce a URL to lowercase keyword text for TF-IDF comparison."""
    # Remove the scheme and host (optionally prefixed with "www.") so
    # only the path portion remains.
    path_only = re.sub(r'https?:\/\/(?:www\.)?[^\/]+', '', url)
    # Drop a trailing file extension such as ".html" or ".php".
    without_ext = re.sub(r'\.\w+$', '', path_only)
    # Lowercase, then collapse every non-alphanumeric character
    # (slashes, hyphens, query punctuation, ...) into a space.
    return re.sub(r'[^a-z0-9\s]', ' ', without_ext.lower())
def map_urls(old_urls, new_urls):
    """Map each old site URL to its best-matching new site URL.

    URLs are reduced to keyword text with preprocess_url(), vectorized
    with TF-IDF over the combined corpus, and paired by highest cosine
    similarity.

    Args:
        old_urls: list of URL strings from the old site.
        new_urls: list of URL strings from the new site.

    Returns:
        A list of (old_url, best_matching_new_url) tuples, one per
        non-blank old URL. Empty list if either input has no usable URLs.
    """
    # Drop blank/whitespace-only lines: a trailing newline in the text
    # area produces an empty entry whose all-zero similarity row would
    # otherwise be arbitrarily mapped to new_urls[0] by argmax.
    old_urls = [u for u in old_urls if u.strip()]
    new_urls = [u for u in new_urls if u.strip()]
    if not old_urls or not new_urls:
        return []

    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    # Fit a single vocabulary over both sites so old and new vectors
    # live in the same feature space and are directly comparable.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)
    old_vectors = tfidf_matrix[:len(old_urls)]
    new_vectors = tfidf_matrix[len(old_urls):]

    similarity_matrix = cosine_similarity(old_vectors, new_vectors)

    mappings = []
    for idx, old_url in enumerate(old_urls):
        # argmax picks the first best match on ties.
        best_match_idx = similarity_matrix[idx].argmax()
        mappings.append((old_url, new_urls[best_match_idx]))
    return mappings
# Build, display, and offer the mapping table for download on click.
if st.button('Generate Mappings'):
    # split('\n') on an empty text area yields [''], which is a truthy
    # list — so check for actual (non-whitespace) content instead of
    # list truthiness, otherwise the error branch is unreachable.
    if any(u.strip() for u in old_site_urls) and any(u.strip() for u in new_site_urls):
        mappings = map_urls(old_site_urls, new_site_urls)
        df_mappings = pd.DataFrame(mappings, columns=['Old URL', 'New URL'])
        st.dataframe(df_mappings)
        # Download button for the mappings (CSV, UTF-8 encoded)
        csv = df_mappings.to_csv(index=False).encode('utf-8')
        st.download_button("Download Mappings", csv, "url_mappings.csv", "text/csv", key='download-csv')
    else:
        st.error("Please enter URLs for both old and new sites.")