migrator / app.py
dejanseo's picture
Upload app.py
b751684 verified
raw
history blame
2.1 kB
import streamlit as st
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Streamlit app interface
st.title('Site Migration URL Mapper')

# Text areas for old and new site URLs.
# NOTE: splitting an empty text area on '\n' yields [''] — a truthy,
# non-empty list — so strip each line and drop blank entries here;
# otherwise blank lines are fed into the TF-IDF mapper as empty documents.
old_site_urls = [u.strip() for u in st.text_area('Enter the old site URLs (one per line)').split('\n') if u.strip()]
new_site_urls = [u.strip() for u in st.text_area('Enter the new site URLs (one per line)').split('\n') if u.strip()]
def preprocess_url(url):
    """Reduce a URL to lowercase keyword text for TF-IDF comparison."""
    # Remove the scheme and host (optionally prefixed with "www.") so
    # only the path portion remains.
    path_only = re.sub(r'https?:\/\/(?:www\.)?[^\/]+', '', url)
    # Drop a trailing file extension such as ".html" or ".php".
    without_ext = re.sub(r'\.\w+$', '', path_only)
    # Lowercase, then collapse every non-alphanumeric character
    # (slashes, hyphens, query punctuation, ...) into a space.
    return re.sub(r'[^a-z0-9\s]', ' ', without_ext.lower())
def map_urls(old_urls, new_urls):
    """Map each old site URL to its best-matching new site URL.

    URLs are reduced to keyword text with preprocess_url(), vectorized
    with TF-IDF over the combined corpus, and paired by highest cosine
    similarity.

    Args:
        old_urls: list of URL strings from the old site.
        new_urls: list of URL strings from the new site.

    Returns:
        A list of (old_url, best_matching_new_url) tuples, one per
        non-blank old URL. Empty list if either input has no usable URLs.
    """
    # Drop blank/whitespace-only lines: a trailing newline in the text
    # area produces an empty entry whose all-zero similarity row would
    # otherwise be arbitrarily mapped to new_urls[0] by argmax.
    old_urls = [u for u in old_urls if u.strip()]
    new_urls = [u for u in new_urls if u.strip()]
    if not old_urls or not new_urls:
        return []

    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    # Fit a single vocabulary over both sites so old and new vectors
    # live in the same feature space and are directly comparable.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)
    old_vectors = tfidf_matrix[:len(old_urls)]
    new_vectors = tfidf_matrix[len(old_urls):]

    similarity_matrix = cosine_similarity(old_vectors, new_vectors)

    mappings = []
    for idx, old_url in enumerate(old_urls):
        # argmax picks the first best match on ties.
        best_match_idx = similarity_matrix[idx].argmax()
        mappings.append((old_url, new_urls[best_match_idx]))
    return mappings
# Build, display, and offer the mapping table for download on click.
if st.button('Generate Mappings'):
    # split('\n') on an empty text area yields [''], which is a truthy
    # list — so check for actual (non-whitespace) content instead of
    # list truthiness, otherwise the error branch is unreachable.
    if any(u.strip() for u in old_site_urls) and any(u.strip() for u in new_site_urls):
        mappings = map_urls(old_site_urls, new_site_urls)
        df_mappings = pd.DataFrame(mappings, columns=['Old URL', 'New URL'])
        st.dataframe(df_mappings)
        # Download button for the mappings (CSV, UTF-8 encoded)
        csv = df_mappings.to_csv(index=False).encode('utf-8')
        st.download_button("Download Mappings", csv, "url_mappings.csv", "text/csv", key='download-csv')
    else:
        st.error("Please enter URLs for both old and new sites.")