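# Streamlit app: map old-site URLs to new-site URLs by comparing path keywords
# with TF-IDF and cosine similarity.
# Run locally with (filename assumed): streamlit run url_mapper.py
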
import streamlit as st
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Streamlit app interface
st.title('Site Migration URL Mapper')

# Text areas for old and new site URLs; blank lines are stripped so empty
# strings never reach the vectorizer and the emptiness check below works.
old_site_urls = [u.strip() for u in st.text_area('Enter the old site URLs (one per line)').splitlines() if u.strip()]
new_site_urls = [u.strip() for u in st.text_area('Enter the new site URLs (one per line)').splitlines() if u.strip()]

def preprocess_url(url):
    """Preprocess the URL to extract keywords."""
    # Strip the scheme and host, e.g. "https://www.example.com".
    processed_url = re.sub(r'https?:\/\/(?:www\.)?[^\/]+', '', url)
    # Drop a trailing file extension such as ".html".
    processed_url = re.sub(r'\.\w+$', '', processed_url)
    # Lowercase and turn every remaining non-alphanumeric character into a
    # space, so path segments become whitespace-separated keywords.
    processed_url = re.sub(r'[^a-z0-9\s]', ' ', processed_url.lower())
    return processed_url
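
# For example, the hypothetical URL "https://www.example.com/blog/my-post.html"
# preprocesses to " blog my post": host stripped, ".html" dropped, and the
# "/" and "-" separators replaced with spaces.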

def map_urls(old_urls, new_urls):
    """Map old site URLs to new site URLs based on content similarity using TF-IDF."""
    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    # Fit one vectorizer over both URL sets so they share a single vocabulary.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)

    # Split the stacked matrix back into old-URL and new-URL vectors.
    old_vectors = tfidf_matrix[:len(old_urls)]
    new_vectors = tfidf_matrix[len(old_urls):]

    # similarity_matrix[i][j] is the cosine similarity between old URL i
    # and new URL j.
    similarity_matrix = cosine_similarity(old_vectors, new_vectors)
    mappings = []

    # Pair each old URL with its single best-scoring new URL. Note that
    # argmax always returns an index, so an old URL with no keyword overlap
    # still maps to new_urls[0] with a similarity of 0.
    for idx, old_url in enumerate(old_urls):
        best_match_idx = similarity_matrix[idx].argmax()
        best_match_url = new_urls[best_match_idx]
        mappings.append((old_url, best_match_url))

    return mappings
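
# A minimal variant (an illustrative sketch, not part of the original app)
# that also reports each cosine score, so weak matches can be reviewed by
# hand. The 0.1 threshold is an assumed value, not a tuned one.
def map_urls_with_scores(old_urls, new_urls, min_score=0.1):
    """Like map_urls, but returns (old_url, best_new_url_or_empty, score)."""
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(
        [preprocess_url(u) for u in old_urls + new_urls])
    similarity_matrix = cosine_similarity(
        tfidf_matrix[:len(old_urls)], tfidf_matrix[len(old_urls):])
    results = []
    for idx, old_url in enumerate(old_urls):
        best = similarity_matrix[idx].argmax()
        score = similarity_matrix[idx][best]
        # Leave the mapping blank when the best score is below the threshold
        # instead of trusting argmax blindly.
        results.append((old_url, new_urls[best] if score >= min_score else '', score))
    return results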

if st.button('Generate Mappings'):
    if old_site_urls and new_site_urls:  # both non-empty after blank-line filtering
        mappings = map_urls(old_site_urls, new_site_urls)
        df_mappings = pd.DataFrame(mappings, columns=['Old URL', 'New URL'])
        st.dataframe(df_mappings)

        # Download button for the mappings
        csv = df_mappings.to_csv(index=False).encode('utf-8')
        st.download_button("Download Mappings", csv, "url_mappings.csv", "text/csv", key='download-csv')
    else:
        st.error("Please enter URLs for both old and new sites.")