dejanseo committed on
Commit
b751684
1 Parent(s): 4711a0c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import re
3
+ import pandas as pd
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
# Streamlit app interface
st.title('Site Migration URL Mapper')


def _parse_url_lines(raw_text):
    """Split text-area content into a list of clean, non-empty URL strings.

    Each line is stripped of surrounding whitespace and blank lines are
    dropped. Without this, an empty text area would produce ``['']``, which
    is truthy and would therefore pass a simple "is the list empty?" check.
    """
    return [line.strip() for line in raw_text.splitlines() if line.strip()]


# Text areas for old and new site URLs (one URL per line)
old_site_urls = _parse_url_lines(
    st.text_area('Enter the old site URLs (one per line)')
)
new_site_urls = _parse_url_lines(
    st.text_area('Enter the new site URLs (one per line)')
)
13
+
14
# Patterns are compiled once because preprocess_url is called for every URL.
_SCHEME_HOST_RE = re.compile(r'https?:\/\/(?:www\.)?[^\/]+')
_TRAILING_EXTENSION_RE = re.compile(r'\.\w+$')
_NON_KEYWORD_CHARS_RE = re.compile(r'[^a-z0-9\s]')


def preprocess_url(url):
    """Reduce a URL to a lowercase, space-separated keyword string.

    Steps, in order:
      1. Strip surrounding whitespace and lowercase the URL, so that an
         upper-case scheme/host (``HTTP://WWW.Example.com``) is removed just
         like a lower-case one and a trailing ``\\r`` or space cannot hide
         the file extension from step 3.
      2. Remove the ``http(s)://host`` part.
      3. Remove a trailing ``.ext`` file extension.
      4. Replace every character that is not ``a-z``, ``0-9`` or whitespace
         with a single space.

    Args:
        url: The URL string to process.

    Returns:
        The keyword string; may be empty (e.g. for a bare domain URL).
    """
    normalized = url.strip().lower()
    path = _SCHEME_HOST_RE.sub('', normalized)
    path = _TRAILING_EXTENSION_RE.sub('', path)
    return _NON_KEYWORD_CHARS_RE.sub(' ', path)
20
+
21
def map_urls(old_urls, new_urls):
    """Map each old site URL to its most similar new site URL.

    Every URL is reduced to keyword text with ``preprocess_url``; the texts of
    both sites are vectorised together with TF-IDF and each old URL is paired
    with the new URL whose vector has the highest cosine similarity.

    Args:
        old_urls: Sequence of old-site URL strings.
        new_urls: Sequence of new-site URL strings to choose matches from.

    Returns:
        A list of ``(old_url, best_match_url)`` tuples, one per entry in
        ``old_urls`` and in the same order. Empty when ``old_urls`` is empty.

    Raises:
        ValueError: If ``new_urls`` is empty while ``old_urls`` is not.
            NOTE(review): scikit-learn's vectorizer is also expected to raise
            ``ValueError`` when none of the URLs yields any token - confirm
            against the installed version.
    """
    # Guard the degenerate cases up front instead of letting them surface as
    # opaque errors from inside scikit-learn.
    if not old_urls:
        return []
    if not new_urls:
        raise ValueError("new_urls must contain at least one URL to map to.")

    old_urls_processed = [preprocess_url(url) for url in old_urls]
    new_urls_processed = [preprocess_url(url) for url in new_urls]

    # Fit on the combined corpus so both sides share one vocabulary.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(old_urls_processed + new_urls_processed)

    split_at = len(old_urls_processed)
    similarity_matrix = cosine_similarity(
        tfidf_matrix[:split_at], tfidf_matrix[split_at:]
    )

    # argmax returns the first maximum per row, so ties (including a row of
    # all-zero similarities) resolve to the earliest new URL.
    best_match_indices = similarity_matrix.argmax(axis=1)
    return [
        (old_url, new_urls[best_idx])
        for old_url, best_idx in zip(old_urls, best_match_indices)
    ]
41
+
42
# Build and display the mapping table when the button is pressed.
if st.button('Generate Mappings'):
    if not (old_site_urls and new_site_urls):
        # At least one of the two URL lists is falsy.
        st.error("Please enter URLs for both old and new sites.")
    else:
        url_pairs = map_urls(old_site_urls, new_site_urls)
        mapping_table = pd.DataFrame(url_pairs, columns=['Old URL', 'New URL'])
        st.dataframe(mapping_table)

        # Download button for the mappings (CSV without the index column,
        # encoded as UTF-8 bytes)
        csv_payload = mapping_table.to_csv(index=False).encode('utf-8')
        st.download_button(
            "Download Mappings",
            csv_payload,
            "url_mappings.csv",
            "text/csv",
            key='download-csv',
        )