Rename pages/1 Keywords to pages/1
Browse files- pages/1 Keywords +0 -217
- pages/1 +358 -0
pages/1 Keywords
@@ -1,217 +0,0 @@
1 |
import streamlit as st
2 |
import pandas as pd
3 |
import numpy as np
4 |
import re
5 |
import nltk
6 |
7 |
from nltk.stem import WordNetLemmatizer
8 |
9 |
from nltk.corpus import stopwords
10 |
from pprint import pprint
11 |
import pickle
12 |
import streamlit.components.v1 as components
13 |
from io import StringIO
14 |
from nltk.stem.snowball import SnowballStemmer
15 |
import csv
16 |
import sys
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
# Page title, then a CSS snippet pushed through st.markdown; the visible rules
# hide the #MainMenu and footer elements.
# NOTE(review): the closing triple quote (and any <style> wrapper) of the
# string below is not visible in this view - confirm against the real file.
st.header("Keywords Stem")
25 |
hide_streamlit_style = """
26 |
27 |
#MainMenu {visibility: hidden;}
28 |
footer {visibility: hidden;}
29 |
30 |
31 |
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
32 |
33 |
st.subheader('Put your file here...')
34 |
35 |
def reset_data():
    """Widget ``on_change`` callback: discard Streamlit's cached data.

    It is registered on the file uploader and on both select boxes, so it
    runs whenever one of those inputs changes.
    """
    # NOTE(review): the original body is missing from this view; clearing
    # st.cache_data is the presumed intent - confirm against the real file.
    st.cache_data.clear()
37 |
38 |
#===check filetype===
39 |
40 |
def get_ext(extype):
    """Return the lower-cased file name of an uploaded file.

    Args:
        extype: the uploaded-file object (the caller passes the value
            returned by ``st.file_uploader``); it must expose ``name``.

    Returns:
        The file name in lower case, so the caller's
        ``.endswith('.csv')`` / ``.endswith('.txt')`` checks also accept
        upper-case extensions such as ``DATA.CSV``.
    """
    # NOTE(review): the right-hand side of the original assignment is missing
    # from this view; reading the upload's ``name`` is the presumed intent.
    return extype.name.lower()
43 |
44 |
45 |
46 |
def upload(extype):
    """Parse the uploaded CSV file into a DataFrame.

    Reads the module-level ``uploaded_file``; ``extype`` is accepted but not
    used.
    """
    return pd.read_csv(uploaded_file)
49 |
50 |
51 |
def conv_txt(extype):
    """Load the uploaded tab-separated file and expand its column tags.

    Reads the module-level ``uploaded_file`` (``extype`` is not used) with
    tab separators and carriage-return line terminators, then renames the
    TI, SO, DE and ID columns to their long names.
    """
    frame = pd.read_csv(uploaded_file, lineterminator='\r', sep='\t')
    frame.rename(
        columns={
            'TI': 'Title',
            'SO': 'Source title',
            'DE': 'Author Keywords',
            'ID': 'Keywords Plus',
        },
        inplace=True,
    )
    return frame
59 |
60 |
61 |
def rev_conv_txt(extype):
    """Rename the long column names of ``keywords`` back to two-letter tags.

    The module-level ``keywords`` frame is renamed in place and returned;
    ``extype`` is not used.
    """
    tag_for_name = dict([
        ('Title', 'TI'),
        ('Source title', 'SO'),
        ('Author Keywords', 'DE'),
        ('Keywords Plus', 'ID'),
    ])
    keywords.rename(columns=tag_for_name, inplace=True)
    return keywords
68 |
69 |
70 |
def get_data(extype):
    """List the columns of ``keywords`` whose name contains 'Keyword'.

    Reads the module-level ``keywords`` frame; ``extype`` is not used.
    """
    return [name for name in keywords.columns if 'Keyword' in name]
74 |
75 |
# File picker restricted to csv/txt; reset_data runs when the selection changes.
uploaded_file = st.file_uploader("Choose your a file", type=['csv','txt'], on_change=reset_data)
76 |
77 |
# Everything below needs an uploaded file.
if uploaded_file is not None:
78 |
extype = get_ext(uploaded_file)
79 |
# Pick the reader from the file-name suffix.
# NOTE(review): when the name ends with neither suffix, `keywords` is never
# assigned and get_data() below would fail - confirm this cannot happen.
if extype.endswith('.csv'):
80 |
keywords = upload(extype)
81 |
82 |
elif extype.endswith('.txt'):
83 |
keywords = conv_txt(extype)
84 |
85 |
# Candidate columns offered in the "Choose column" select box.
list_of_column_key = get_data(extype)
86 |
87 |
# Two side-by-side selectors: processing method and keyword column.
col1, col2 = st.columns(2)
88 |
with col1:
89 |
method = st.selectbox(
90 |
'Choose method',
91 |
('Lemmatization', 'Stemming'), on_change=reset_data)
92 |
with col2:
93 |
keyword = st.selectbox(
94 |
'Choose column',
95 |
(list_of_column_key), on_change=reset_data)
96 |
97 |
98 |
def clean_keyword(extype):
    """Normalise the chosen keyword column and build the unique-keyword list.

    Reads the module-level ``keyword`` (column name) and rebinds the
    module-level ``keywords`` frame; ``extype`` is not used.

    Returns:
        A ``(keywords, key)`` tuple. ``keywords`` has NaN replaced by '' and
        the chosen column lower-cased, with '-' turned into a space and
        '; ' turned into ' ; '. ``key`` is a frame with the columns
        ``index``, ``0`` (each distinct keyword, '-' replaced by a space)
        and ``new`` (column ``0`` lower-cased).
    """
    global keyword, keywords

    try:
        key = keywords[keyword]
    except KeyError:
        st.error('Error: Please check your Author/Index Keywords column.')
        # Nothing below can work without the column, so end this script run
        # here instead of failing later on the unbound ``key``.
        # NOTE(review): the original statement after st.error is missing from
        # this view; st.stop() is the presumed equivalent.
        st.stop()

    keywords = keywords.replace(np.nan, '', regex=True)
    keywords[keyword] = keywords[keyword].astype(str)
    keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
    keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
    keywords[keyword] = keywords[keyword].map(lambda x: x.lower())

    #===Keywords list===
    # ``key`` was taken before the NaN replacement above, so missing cells
    # are dropped here rather than turned into empty keywords.
    key = key.dropna()
    key = key.str.split('; ', expand=True)
    key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
    key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
    key['new'] = key[0].map(lambda x: x.lower())

    return keywords, key
119 |
120 |
121 |
122 |
def Lemmatization(extype):
    """Lemmatize the chosen keyword column and the ``new`` column of ``key``.

    Works on the module-level ``keywords``, ``keyword`` and ``key``;
    ``extype`` is not used. Afterwards ' ; ' in the keyword column is turned
    back into '; '.

    Returns:
        The ``(keywords, key)`` pair after processing.
    """
    wnl = WordNetLemmatizer()

    def lemmatize_words(text):
        return ' '.join(wnl.lemmatize(token) for token in text.split())

    for frame, column in ((keywords, keyword), (key, 'new')):
        frame[column] = frame[column].apply(lemmatize_words)

    keywords[keyword] = keywords[keyword].map(lambda cell: re.sub(' ; ', '; ', cell))
    return keywords, key
132 |
133 |
134 |
def Stemming(extype):
    """Stem the chosen keyword column and the ``new`` column of ``key``.

    Works on the module-level ``keywords``, ``keyword`` and ``key`` with an
    English Snowball stemmer; ``extype`` is not used. Afterwards ' ; ' in the
    keyword column is turned back into '; '.

    Returns:
        The ``(keywords, key)`` pair after processing.
    """
    snowball = SnowballStemmer("english")

    def stem_words(text):
        return ' '.join(snowball.stem(token) for token in text.split())

    for frame, column in ((keywords, keyword), (key, 'new')):
        frame[column] = frame[column].apply(stem_words)

    keywords[keyword] = keywords[keyword].map(lambda cell: re.sub(' ; ', '; ', cell))
    return keywords, key
144 |
145 |
# Clean the chosen column first, then apply exactly one of the two methods.
keywords, key = clean_keyword(extype)

# Compare by value: `is` tests object identity, which is not guaranteed to
# hold for two equal strings (and raises a SyntaxWarning with a literal).
if method == 'Lemmatization':
    keywords, key = Lemmatization(extype)
else:
    # NOTE(review): the line before this call is missing from this view; an
    # `else:` is presumed, otherwise stemming would always run as well.
    keywords, key = Stemming(extype)

st.write('Congratulations! 🤩 You choose',keyword ,'with',method,'method. Now, you can easily download the result by clicking the button below')
153 |
154 |
155 |
#===show & download csv===
156 |
# Four tabs: processed table, keyword list, and two citation tabs.
tab1, tab2, tab3, tab4 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading"])
157 |
158 |
with tab1:
159 |
st.dataframe(keywords, use_container_width=True, hide_index=True)
160 |
161 |
# Serialise `keywords` as UTF-8 encoded CSV bytes (extype is not used).
def convert_df(extype):
162 |
return keywords.to_csv(index=False).encode('utf-8')
163 |
164 |
165 |
# Same, but tab-separated with carriage-return line endings.
def convert_txt(extype):
166 |
return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
167 |
168 |
# Output format follows the format of the uploaded file.
if extype.endswith('.csv'):
169 |
# NOTE(review): this name rebinds `csv`, which the file also imports as a module.
csv = convert_df(extype)
170 |
171 |
# NOTE(review): the call that this label string belongs to is not visible in
# this view - presumably a download button; confirm against the real file.
"Press to download result 👈",
172 |
173 |
174 |
175 |
176 |
elif extype.endswith('.txt'):
177 |
# Restore the two-letter column tags before exporting.
keywords = rev_conv_txt(extype)
178 |
txt = convert_txt(extype)
179 |
180 |
# NOTE(review): as above, the enclosing call is not visible in this view.
"Press to download result 👈",
181 |
182 |
183 |
184 |
185 |
with tab2:
186 |
187 |
# Drop the 'index' column of `key` and rename column 0 to 'label'.
def table_keyword(extype):
188 |
keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
189 |
return keytab
190 |
#===coloring the same keywords===
191 |
192 |
def highlight_cells(value):
    """Return the CSS for one cell of the ``new`` column.

    Reads the module-level ``keytab`` frame.

    Args:
        value: the cell value being styled.

    Returns:
        ``'background-color: yellow'`` when ``value`` occurs more than once
        in ``keytab['new']``, otherwise an empty string.
    """
    # A value is "duplicated" exactly when it occurs at least twice, so one
    # equality scan replaces the two duplicated(keep=False) passes the
    # original ran for every styled cell.
    if (keytab['new'] == value).sum() > 1:
        return 'background-color: yellow'
    return ''
196 |
# Build the keyword table that highlight_cells reads.
keytab = table_keyword(extype)
197 |
# NOTE(review): the first argument of st.dataframe is truncated in this view
# (presumably a styled `keytab` using highlight_cells) - confirm.
st.dataframe(, subset=['new']), use_container_width=True, hide_index=True)
198 |
199 |
200 |
# Serialise `key` as UTF-8 encoded CSV bytes (extype is not used).
def convert_dfs(extype):
201 |
return key.to_csv(index=False).encode('utf-8')
202 |
203 |
csv = convert_dfs(extype)
204 |
205 |
206 |
# NOTE(review): the call that this label string belongs to is not visible in
# this view - presumably a download button; confirm against the real file.
"Press to download keywords 👈",
207 |
208 |
209 |
210 |
211 |
# Citation for the tool itself.
with tab3:
212 |
st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.**')
213 |
214 |
# Further reading on stemming and lemmatization.
with tab4:
215 |
st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.**')
216 |
st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology , 22(10), 350–357.**')
217 |
st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.**')
@@ -0,0 +1,358 @@
1 |
import streamlit as st
2 |
import scattertext as stx
3 |
import pandas as pd
4 |
import re
5 |
import nltk
6 |
7 |
from nltk.stem import WordNetLemmatizer
8 |
9 |
from nltk.corpus import stopwords
10 |
import time
11 |
import sys
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
# CSS snippet pushed through st.markdown; the visible rules hide the footer
# and the element with data-testid "collapsedControl".
# NOTE(review): the first selector and the closing triple quote of the string
# are not visible in this view - confirm against the real file.
hide_streamlit_style = """
22 |
23 |
24 |
{visibility: hidden;}
25 |
footer {visibility: hidden;}
26 |
[data-testid="collapsedControl"] {display: none}
27 |
28 |
29 |
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
30 |
31 |
# Navigation popover linking to the other pages of the app.
# NOTE(review): the "Home" target is an empty string and some page paths look
# truncated in this view - confirm the real targets.
with st.popover("🔗 Menu"):
32 |
st.page_link("", label="Home", icon="🏠")
33 |
st.page_link("pages/1", label="Scattertext", icon="1️⃣")
34 |
st.page_link("pages/2 Topic", label="Topic Modeling", icon="2️⃣")
35 |
st.page_link("pages/3 Bidirected", label="Bidirected Network", icon="3️⃣")
36 |
st.page_link("pages/4", label="Sunburst", icon="4️⃣")
37 |
st.page_link("pages/5 Burst", label="Burst Detection", icon="5️⃣")
38 |
st.page_link("pages/6 Keywords", label="Keywords Stem", icon="6️⃣")
39 |
40 |
st.header("Scattertext", anchor=False)
41 |
st.subheader('Put your file here...', anchor=False)
42 |
43 |
def reset_all():
    """Widget ``on_change`` callback: discard Streamlit's cached data.

    ``clean_csv`` is wrapped in ``st.cache_data`` and reads widget values
    that are not part of its arguments, so its cached result has to be
    dropped whenever one of those widgets changes.
    """
    # NOTE(review): the original body is missing from this view; clearing
    # st.cache_data is the presumed intent - confirm against the real file.
    st.cache_data.clear()
45 |
46 |
47 |
def get_ext(extype):
    """Return the lower-cased file name of an uploaded file.

    Args:
        extype: the uploaded-file object (the caller passes the value
            returned by ``st.file_uploader``); it must expose ``name``.

    Returns:
        The file name in lower case, so the caller's
        ``.endswith('.csv')`` / ``.endswith('.txt')`` checks also accept
        upper-case extensions such as ``DATA.CSV``.
    """
    # NOTE(review): the right-hand side of the original assignment is missing
    # from this view; reading the upload's ``name`` is the presumed intent.
    return extype.name.lower()
50 |
51 |
#===upload file===
52 |
53 |
def upload(extype):
    """Parse the uploaded CSV file and harmonise its column names.

    Reads the module-level ``uploaded_file``; ``extype`` is not used. When
    the table has a 'Publication Year' column, that column and three
    related ones are renamed to the names the rest of the page expects.
    """
    frame = pd.read_csv(uploaded_file)
    if 'Publication Year' not in frame.columns:
        return frame

    frame.rename(
        columns={
            'Publication Year': 'Year',
            'Citing Works Count': 'Cited by',
            'Publication Type': 'Document Type',
            'Source Title': 'Source title',
        },
        inplace=True,
    )
    return frame
60 |
61 |
62 |
def conv_txt(extype):
    """Load the uploaded tab-separated file and expand its column tags.

    Reads the module-level ``uploaded_file`` (``extype`` is not used) with
    tab separators and carriage-return line terminators, then renames the
    TI, SO, DT, AB and PY columns to their long names.
    """
    frame = pd.read_csv(uploaded_file, lineterminator='\r', sep='\t')
    frame.rename(
        columns={
            'TI': 'Title',
            'SO': 'Source title',
            'DT': 'Document Type',
            'AB': 'Abstract',
            'PY': 'Year',
        },
        inplace=True,
    )
    return frame
71 |
72 |
73 |
def get_data(extype):
    """Choose the text columns offered for analysis.

    Reads the module-level ``papers`` frame; ``extype`` is not used.

    Returns:
        A ``(df_col, selected_cols)`` tuple. ``df_col`` is the sorted list
        of object-dtype column names. ``selected_cols`` holds the columns
        whose name contains "abstract" (any case) followed by the columns
        named "title" (any case), or all of ``df_col`` when there are none.
    """
    df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist(), key=str)
    abstract_pattern = re.compile(r'abstract', re.IGNORECASE)

    list_title = [col for col in df_col if str(col).lower() == "title"]
    # NOTE(review): the condition of this comprehension is truncated in this
    # view; matching the column name against abstract_pattern is presumed.
    list_abstract = [col for col in df_col if abstract_pattern.search(str(col))]

    # Both lists are subsets of df_col, so the original all(...) membership
    # tests were always true and only their first branch could ever run;
    # this is that branch plus the empty-result fallback.
    selected_cols = list_abstract + list_title
    if not selected_cols:
        selected_cols = df_col

    return df_col, selected_cols
92 |
93 |
94 |
def check_comparison(extype):
    """List the comparison modes that the uploaded table supports.

    Reads the module-level ``papers`` frame; ``extype`` is not used.

    Returns:
        ``['Word-to-word', 'Manual label']``, plus ``'Years'`` when a
        'Year' column exists and ``'Sources'`` when a 'Source title' column
        exists.
    """
    comparison = ['Word-to-word', 'Manual label']

    # The 'Years' and 'Sources' modes index papers['Year'] and
    # paper['Source title'] by exact name, so test for exactly those names;
    # a substring test would enable a mode that then fails with KeyError.
    # NOTE(review): the bodies of both `if`s are missing from this view; the
    # appended labels are taken from the `compare == ...` branches.
    if 'Year' in papers.columns:
        comparison.append('Years')
    if 'Source title' in papers.columns:
        comparison.append('Sources')

    return comparison
104 |
105 |
#===clean csv===
106 |
@st.cache_data(ttl=3600, show_spinner=False)
def clean_csv(extype):
    """Return a cleaned copy of ``papers`` for the chosen text column.

    Reads the module-level ``papers``, ``ColCho``, ``rem_punc``,
    ``rem_copyright`` and ``words_to_remove``. Rows without text in
    ``ColCho`` are dropped; the text is lower-cased, optionally stripped of
    punctuation and of everything from a '©' onwards, cleared of English
    stop words, lemmatized and finally cleared of the user's own words.
    The result is cached by Streamlit for 3600 seconds.

    Args:
        extype: the only argument Streamlit can build the cache key from.

    Returns:
        The cleaned DataFrame.
    """
    # Copy so the assignments below never write into a slice of `papers`.
    paper = papers.dropna(subset=[ColCho]).copy()

    # The original discarded this result, so the text was never lower-cased.
    paper[ColCho] = paper[ColCho].map(lambda x: x.lower())
    if rem_punc:
        # Raw string: '\.' is an invalid escape in a normal string literal.
        # NOTE(review): inside the class, `!-?` is a character RANGE that also
        # matches digits, quotes and brackets - confirm that is intended.
        paper[ColCho] = paper[ColCho].map(lambda x: re.sub(r'[,:;\.!-?•=]', ' ', x))
        paper[ColCho] = paper[ColCho].str.replace('\u201c|\u201d', '', regex=True)
    if rem_copyright:
        paper[ColCho] = paper[ColCho].map(lambda x: re.sub('©.*', '', x))

    #===stopword removal===
    # A set gives constant-time membership tests; the original also threw
    # the filtered text away instead of storing it.
    stop = set(stopwords.words('english'))
    paper[ColCho] = paper[ColCho].apply(lambda x: ' '.join(word for word in x.split() if word not in stop))

    lemmatizer = WordNetLemmatizer()
    def lemmatize_words(text):
        words = text.split()
        words = [lemmatizer.lemmatize(word) for word in words]
        return ' '.join(words)
    # NOTE(review): the line applying this helper is missing from this view;
    # applying it to the chosen column is the presumed intent.
    paper[ColCho] = paper[ColCho].apply(lemmatize_words)

    # The text is lower case by now, so compare the user's words the same way.
    remove_set = {word.strip().lower() for word in words_to_remove.split(";")}
    def remove_words(text):
        words = text.split()
        cleaned_words = [word for word in words if word not in remove_set]
        return ' '.join(cleaned_words)
    paper[ColCho] = paper[ColCho].apply(remove_words)

    return paper
139 |
140 |
141 |
def get_minmax(extype):
    """Return the year span of the module-level ``papers`` frame.

    ``extype`` is not used.

    Returns:
        ``(MIN, MAX, GAP, MID)``: the smallest and largest value of the
        'Year' column as ints, their difference, and the rounded midpoint.
    """
    MIN = int(papers['Year'].min())
    MAX = int(papers['Year'].max())
    # GAP was returned but never assigned in the visible code; the caller
    # only tests it against 0 to detect single-year data.
    GAP = MAX - MIN
    MID = round((MIN + MAX) / 2)
    return MIN, MAX, GAP, MID
147 |
148 |
149 |
def running_scattertext(cat_col, catname, noncatname):
    """Build the Scattertext explorer for ``filtered_df`` and render it.

    Reads the module-level ``filtered_df``, ``ColCho`` and ``min_term``.

    Args:
        cat_col: name of the column holding the category of each row.
        catname: category value plotted as the category (also its label).
        noncatname: label used for the other category.

    A ValueError raised while building the corpus or the explorer is turned
    into a warning asking the user to lower the minimum term count.
    """
    # NOTE(review): the `try:` lines and the closing lines of both explorer
    # calls are missing from this view; the nesting below follows the two
    # visible `except` clauses - confirm against the real file.
    try:
        corpus = stx.CorpusFromPandas(filtered_df,
                                      category_col = cat_col,
                                      text_col = ColCho,
                                      nlp = stx.whitespace_nlp_with_sentences,
                                      ).build().get_unigram_corpus().remove_infrequent_words(minimum_term_count = min_term)

        st.toast('Building corpus completed', icon='🎉')

        # Arguments shared by both attempts, written once.
        explorer_args = dict(category = catname,
                             category_name = catname,
                             not_category_name = noncatname,
                             width_in_pixels = 900,
                             minimum_term_frequency = 0)
        try:
            html = stx.produce_scattertext_explorer(corpus,
                                                    metadata = filtered_df['Title'],
                                                    **explorer_args)
        except KeyError:
            # Retry without per-document metadata, e.g. when there is no
            # 'Title' column.
            html = stx.produce_scattertext_explorer(corpus, **explorer_args)

        st.toast('Process completed', icon='🎉')

        st.toast('Visualizing', icon='⏳')
        st.components.v1.html(html, height = 1200, scrolling = True)

    except ValueError:
        st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
185 |
186 |
187 |
188 |
def df_w2w(search_terms1, search_terms2):
    """Collect the rows of ``paper`` that mention either group of terms.

    Reads the module-level ``paper`` frame and ``ColCho`` column name.

    Args:
        search_terms1: terms of the first group.
        search_terms2: terms of the second group.

    Returns:
        ``(dfs1, dfs2, filtered_df)``: the rows whose ``ColCho`` text
        contains any first-group term as a whole word (case-insensitive)
        with ``Topic`` set to 'First Term', the same for the second group
        with 'Second Term', and the two frames concatenated.
    """
    def _rows_for(terms, label):
        # Select each matching row once and tag it with ``label``.
        if terms:
            # Terms are escaped so input such as "c++" is matched literally
            # instead of being parsed as a regular expression. The
            # look-arounds behave like \b for ordinary words but also work
            # for terms that start or end with a symbol.
            matcher = re.compile(
                r'(?<!\w)(?:' + '|'.join(re.escape(term) for term in terms) + r')(?!\w)',
                re.IGNORECASE,
            )
            # One combined pattern: a row matching several terms is kept
            # once instead of once per term. Non-string cells never match.
            hits = paper[ColCho].map(
                lambda text: isinstance(text, str) and matcher.search(text) is not None
            ).astype(bool)
            subset = paper[hits].reset_index(drop=True)
        else:
            subset = paper.iloc[0:0].reset_index(drop=True)
        subset['Topic'] = label
        return subset

    dfs1 = _rows_for(search_terms1, 'First Term')
    dfs2 = _rows_for(search_terms2, 'Second Term')
    filtered_df = pd.concat([dfs1, dfs2], ignore_index=True)

    return dfs1, dfs2, filtered_df
202 |
203 |
204 |
def df_sources(stitle1, stitle2):
    """Collect the rows of ``paper`` that belong to two source titles.

    Reads the module-level ``paper`` frame.

    Args:
        stitle1: first source title.
        stitle2: second source title.

    Returns:
        The rows whose 'Source title' contains ``stitle1`` followed by the
        rows whose 'Source title' contains ``stitle2`` (case-insensitive),
        each tagged in a ``Topic`` column with the title it matched.
    """
    def _rows_for(stitle):
        # regex=False: titles such as "Journal (Online)" contain regex
        # metacharacters and must be matched literally.
        hits = paper['Source title'].str.contains(stitle, case=False, na=False, regex=False)
        # Copy before adding the column so the write never targets a slice
        # of `paper`.
        subset = paper[hits].copy()
        subset['Topic'] = stitle
        return subset

    filtered_df = pd.concat([_rows_for(stitle1), _rows_for(stitle2)], ignore_index=True)

    return filtered_df
212 |
213 |
214 |
def df_years(first_range, second_range):
    """Collect the rows of ``paper`` that fall into two year ranges.

    Reads the module-level ``paper`` frame.

    Args:
        first_range: ``(start, end)`` years of the first range, inclusive.
        second_range: ``(start, end)`` years of the second range, inclusive.

    Returns:
        The rows of the first range tagged 'First range' in a
        ``Topic Range`` column, followed by the rows of the second range
        tagged 'Second range'.
    """
    def _slice(bounds, tag):
        low, high = bounds[0], bounds[1]
        chosen = paper[(paper['Year'] >= low) & (paper['Year'] <= high)].copy()
        chosen['Topic Range'] = tag
        return chosen

    parts = [
        _slice(first_range, 'First range'),
        _slice(second_range, 'Second range'),
    ]
    return pd.concat(parts, ignore_index=True)
224 |
225 |
#===Read data===
226 |
# File picker restricted to csv/txt; reset_all runs when the selection changes.
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
227 |
228 |
# Everything below needs an uploaded file.
if uploaded_file is not None:
229 |
extype = get_ext(uploaded_file)
230 |
231 |
# Pick the reader from the file-name suffix.
# NOTE(review): when the name ends with neither suffix, `papers` is never
# assigned and get_data() below would fail - confirm this cannot happen.
if extype.endswith('.csv'):
232 |
papers = upload(extype)
233 |
elif extype.endswith('.txt'):
234 |
papers = conv_txt(extype)
235 |
236 |
df_col, selected_cols = get_data(extype)
237 |
comparison = check_comparison(extype)
238 |
239 |
240 |
# Two selectors separated by a narrow spacer column (c2 is not used here).
c1, c2, c3 = st.columns([4,0.1,4])
241 |
ColCho = c1.selectbox(
242 |
'Choose column to analyze',
243 |
(selected_cols), on_change=reset_all)
244 |
245 |
246 |
247 |
compare = c3.selectbox(
248 |
'Type of comparison',
249 |
(comparison), on_change=reset_all)
250 |
251 |
# Advanced settings; these values are read as globals by clean_csv and
# running_scattertext.
with st.expander("🧮 Show advance settings"):
252 |
y1, y2 = st.columns([8,2])
253 |
t1, t2 = st.columns([3,3])
254 |
words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
255 |
min_term = y2.number_input("Minimum term count", min_value=0, max_value=10, value=3, step=1, on_change=reset_all)
256 |
rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
257 |
rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)
258 |
259 |
# NOTE(review): the call that opens the next line is not visible in this view
# (the leading '+' looks like a diff marker); the icon argument suggests an
# st.info-style message - confirm against the real file.
+'Scattertext is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
260 |
261 |
# Cleaned copy of the table used by every comparison mode below.
paper = clean_csv(extype)
262 |
263 |
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
264 |
265 |
# Visualization tab: one branch per comparison mode; each branch builds
# `filtered_df` and then calls running_scattertext.
with tab1:
266 |
267 |
# Word-to-word: both inputs are split on commas and empty entries dropped.
if compare == 'Word-to-word':
268 |
col1, col2, col3 = st.columns([4,0.1,4])
269 |
text1 = col1.text_input('First Term', on_change=reset_all, placeholder='put comma if you have more than one')
270 |
search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
271 |
272 |
text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='put comma if you have more than one')
273 |
search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]
274 |
275 |
dfs1, dfs2, filtered_df = df_w2w(search_terms1, search_terms2)
276 |
277 |
# Warn when one or both groups of terms matched nothing.
if dfs1.empty and dfs2.empty:
278 |
st.warning('We cannot find anything in your document.', icon="⚠️")
279 |
elif dfs1.empty:
280 |
st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
281 |
elif dfs2.empty:
282 |
st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
283 |
# NOTE(review): source line 283 is blank in this view - presumably an `else:`;
# as shown, the plot would be attempted even after a warning. Confirm.
284 |
with st.spinner('Processing. Please wait until the visualization comes up'):
285 |
running_scattertext('Topic', 'First Term', 'Second Term')
286 |
287 |
# Manual label: compare two values of a column chosen by the user.
elif compare == 'Manual label':
288 |
col1, col2, col3 = st.columns(3)
289 |
290 |
df_col_sel = sorted([col for col in paper.columns.tolist()])
291 |
292 |
column_selected = col1.selectbox(
293 |
'Choose column',
294 |
(df_col_sel), on_change=reset_all)
295 |
296 |
# NOTE(review): `column_selected` is already used to index `paper` here,
# before the `is not None` check below.
list_words = paper[column_selected].values.tolist()
297 |
list_unique = sorted(list(set(list_words)))
298 |
299 |
if column_selected is not None:
300 |
label1 = col2.selectbox(
301 |
'Choose first label',
302 |
(list_unique), on_change=reset_all)
303 |
304 |
# Preselect the second entry unless the list has exactly one entry.
# NOTE(review): an empty list also yields index 1 - confirm that is handled.
default_index = 0 if len(list_unique) == 1 else 1
305 |
label2 = col3.selectbox(
306 |
'Choose second label',
307 |
(list_unique), on_change=reset_all, index=default_index)
308 |
309 |
filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
310 |
311 |
with st.spinner('Processing. Please wait until the visualization comes up'):
312 |
running_scattertext(column_selected, label1, label2)
313 |
314 |
# Sources: compare two values of the 'Source title' column.
elif compare == 'Sources':
315 |
col1, col2, col3 = st.columns([4,0.1,4])
316 |
317 |
unique_stitle = set()
318 |
unique_stitle.update(paper['Source title'].dropna())
319 |
list_stitle = sorted(list(unique_stitle))
320 |
321 |
stitle1 = col1.selectbox(
322 |
'Choose first label',
323 |
(list_stitle), on_change=reset_all)
324 |
325 |
default_index = 0 if len(list_stitle) == 1 else 1
326 |
stitle2 = col3.selectbox(
327 |
'Choose second label',
328 |
(list_stitle), on_change=reset_all, index=default_index)
329 |
330 |
filtered_df = df_sources(stitle1, stitle2)
331 |
332 |
# The category column passed here is 'Source title', not the 'Topic' column
# that df_sources adds.
with st.spinner('Processing. Please wait until the visualization comes up'):
333 |
running_scattertext('Source title', stitle1, stitle2)
334 |
335 |
# Years: compare two year ranges picked with sliders.
elif compare == 'Years':
336 |
col1, col2, col3 = st.columns([4,0.1,4])
337 |
338 |
MIN, MAX, GAP, MID = get_minmax(extype)
339 |
# GAP == 0 means every row has the same year, so there is nothing to compare.
if (GAP != 0):
340 |
# Both sliders default to ranges that include MID.
first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
341 |
342 |
second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)
343 |
344 |
filtered_df = df_years(first_range, second_range)
345 |
346 |
with st.spinner('Processing. Please wait until the visualization comes up'):
347 |
running_scattertext('Topic Range', 'First range', 'Second range')
348 |
349 |
# NOTE(review): source lines 348-349 are blank in this view - presumably an
# `else:` belongs before the message below. Confirm.
350 |
st.write('You only have data in ', (MAX))
351 |
352 |
# Citation for the Scattertext tool.
with tab2:
353 |
st.markdown('**Kessler, J.S. (2017). Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ.**')
354 |
355 |
# Further reading.
with tab3:
356 |
st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.**')
357 |
st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.**')
358 |
st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.**')