import re

import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Streamlit app interface
st.title('Site Migration URL Mapper')


def _parse_url_list(raw_text):
    """Split a text-area value into a list of non-empty, stripped URL lines.

    Filters out blank lines so empty input yields [] (not ['']) — this makes
    the "both lists provided" check below actually work, and keeps empty
    documents out of the TF-IDF corpus.
    """
    return [line.strip() for line in raw_text.splitlines() if line.strip()]


# Text areas for old and new site URLs
old_site_urls = _parse_url_list(st.text_area('Enter the old site URLs (one per line)'))
new_site_urls = _parse_url_list(st.text_area('Enter the new site URLs (one per line)'))


def preprocess_url(url):
    """Preprocess the URL to extract keywords.

    Strips the scheme and host, drops a trailing file extension, lowercases,
    and replaces every non-alphanumeric character with a space so the path
    segments become whitespace-separated tokens for TF-IDF.
    """
    # Remove scheme + (optional www.) host, keeping only the path.
    processed_url = re.sub(r'https?:\/\/(?:www\.)?[^\/]+', '', url)
    # Remove a trailing file extension such as ".html".
    processed_url = re.sub(r'\.\w+$', '', processed_url)
    # Lowercase first, then collapse anything non-alphanumeric to spaces.
    processed_url = re.sub(r'[^a-z0-9\s]', ' ', processed_url.lower())
    return processed_url


def map_urls(old_urls, new_urls):
    """Map old site URLs to new site URLs based on content similarity using TF-IDF.

    Returns a list of (old_url, best_matching_new_url) tuples. Raises
    ValueError if the combined corpus has no usable tokens (e.g. every URL
    preprocesses to an empty string).
    """
    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    vectorizer = TfidfVectorizer()
    # Fit on the combined corpus so both sides share one vocabulary.
    tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)

    old_vectors = tfidf_matrix[:len(old_urls)]
    new_vectors = tfidf_matrix[len(old_urls):]

    similarity_matrix = cosine_similarity(old_vectors, new_vectors)

    # For each old URL, pick the new URL with the highest cosine similarity.
    mappings = []
    for idx, old_url in enumerate(old_urls):
        best_match_idx = similarity_matrix[idx].argmax()
        best_match_url = new_urls[best_match_idx]
        mappings.append((old_url, best_match_url))
    return mappings


if st.button('Generate Mappings'):
    if old_site_urls and new_site_urls:
        try:
            mappings = map_urls(old_site_urls, new_site_urls)
        except ValueError:
            # TfidfVectorizer raises "empty vocabulary" when no URL yields
            # any tokens (e.g. only bare-domain URLs like "https://a.com/").
            st.error("Could not extract any keywords from the URLs provided.")
        else:
            df_mappings = pd.DataFrame(mappings, columns=['Old URL', 'New URL'])
            st.dataframe(df_mappings)

            # Download button for the mappings
            csv = df_mappings.to_csv(index=False).encode('utf-8')
            st.download_button("Download Mappings", csv, "url_mappings.csv",
                               "text/csv", key='download-csv')
    else:
        st.error("Please enter URLs for both old and new sites.")