faizhalas committed on
Commit
099854e
1 Parent(s): 2b45b17

Rename pages/1 Keywords Stem.py to pages/1 Scattertext.py

Files changed (2)
  1. pages/1 Keywords Stem.py +0 -217
  2. pages/1 Scattertext.py +358 -0
pages/1 Keywords Stem.py DELETED
@@ -1,217 +0,0 @@
- import streamlit as st
- import pandas as pd
- import numpy as np
- import re
- import nltk
- nltk.download('wordnet')
- from nltk.stem import WordNetLemmatizer
- nltk.download('stopwords')
- from nltk.corpus import stopwords
- from pprint import pprint
- import pickle
- import streamlit.components.v1 as components
- from io import StringIO
- from nltk.stem.snowball import SnowballStemmer
- import csv
- import sys
-
- #===config===
- st.set_page_config(
-     page_title="Coconut",
-     page_icon="🥥",
-     layout="wide"
- )
- st.header("Keywords Stem")
- hide_streamlit_style = """
- <style>
- #MainMenu {visibility: hidden;}
- footer {visibility: hidden;}
- </style>
- """
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
-
- st.subheader('Put your file here...')
-
- def reset_data():
-     st.cache_data.clear()
-
- #===check filetype===
- @st.cache_data(ttl=3600)
- def get_ext(extype):
-     extype = uploaded_file.name
-     return extype
-
- #===upload===
- @st.cache_data(ttl=3600)
- def upload(extype):
-     keywords = pd.read_csv(uploaded_file)
-     return keywords
-
- @st.cache_data(ttl=3600)
- def conv_txt(extype):
-     col_dict = {'TI': 'Title',
-                 'SO': 'Source title',
-                 'DE': 'Author Keywords',
-                 'ID': 'Keywords Plus'}
-     keywords = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
-     keywords.rename(columns=col_dict, inplace=True)
-     return keywords
-
- @st.cache_data(ttl=3600)
- def rev_conv_txt(extype):
-     col_dict_rev = {'Title': 'TI',
-                     'Source title': 'SO',
-                     'Author Keywords': 'DE',
-                     'Keywords Plus': 'ID'}
-     keywords.rename(columns=col_dict_rev, inplace=True)
-     return keywords
-
- @st.cache_data(ttl=3600)
- def get_data(extype):
-     list_of_column_key = list(keywords.columns)
-     list_of_column_key = [k for k in list_of_column_key if 'Keyword' in k]
-     return list_of_column_key
-
- uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'], on_change=reset_data)
-
- if uploaded_file is not None:
-     extype = get_ext(uploaded_file)
-     if extype.endswith('.csv'):
-         keywords = upload(extype)
-
-     elif extype.endswith('.txt'):
-         keywords = conv_txt(extype)
-
-     list_of_column_key = get_data(extype)
-
-     col1, col2 = st.columns(2)
-     with col1:
-         method = st.selectbox(
-             'Choose method',
-             ('Lemmatization', 'Stemming'), on_change=reset_data)
-     with col2:
-         keyword = st.selectbox(
-             'Choose column',
-             (list_of_column_key), on_change=reset_data)
-
-     @st.cache_data(ttl=3600)
-     def clean_keyword(extype):
-         global keyword, keywords
-         try:
-             key = keywords[keyword]
-         except KeyError:
-             st.error('Error: Please check your Author/Index Keywords column.')
-             sys.exit(1)
-         keywords = keywords.replace(np.nan, '', regex=True)
-         keywords[keyword] = keywords[keyword].astype(str)
-         keywords[keyword] = keywords[keyword].map(lambda x: re.sub('-', ' ', x))
-         # pad '; ' to ' ; ' so the separator survives word-by-word stemming and can be collapsed back afterwards
-         keywords[keyword] = keywords[keyword].map(lambda x: re.sub('; ', ' ; ', x))
-         keywords[keyword] = keywords[keyword].map(lambda x: x.lower())
-
-         #===Keywords list===
-         key = key.dropna()
-         key = pd.concat([key.str.split('; ', expand=True)], axis=1)
-         key = pd.Series(np.ravel(key)).dropna().drop_duplicates().sort_values().reset_index()
-         key[0] = key[0].map(lambda x: re.sub('-', ' ', x))
-         key['new'] = key[0].map(lambda x: x.lower())
-
-         return keywords, key
-
-     #===stem/lem===
-     @st.cache_data(ttl=3600)
-     def Lemmatization(extype):
-         lemmatizer = WordNetLemmatizer()
-         def lemmatize_words(text):
-             words = text.split()
-             words = [lemmatizer.lemmatize(word) for word in words]
-             return ' '.join(words)
-         keywords[keyword] = keywords[keyword].apply(lemmatize_words)
-         key['new'] = key['new'].apply(lemmatize_words)
-         keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
-         return keywords, key
-
-     @st.cache_data(ttl=3600)
-     def Stemming(extype):
-         stemmer = SnowballStemmer("english")
-         def stem_words(text):
-             words = text.split()
-             words = [stemmer.stem(word) for word in words]
-             return ' '.join(words)
-         keywords[keyword] = keywords[keyword].apply(stem_words)
-         key['new'] = key['new'].apply(stem_words)
-         keywords[keyword] = keywords[keyword].map(lambda x: re.sub(' ; ', '; ', x))
-         return keywords, key
-
-     keywords, key = clean_keyword(extype)
-
-     if method == 'Lemmatization':
-         keywords, key = Lemmatization(extype)
-     else:
-         keywords, key = Stemming(extype)
-
-     st.write('Congratulations! 🤩 You chose', keyword, 'with the', method, 'method. Now you can download the result by clicking the button below.')
-     st.divider()
-
-     #===show & download csv===
-     tab1, tab2, tab3, tab4 = st.tabs(["📥 Result", "📥 List of Keywords", "📃 Reference", "📃 Recommended Reading"])
-
-     with tab1:
-         st.dataframe(keywords, use_container_width=True, hide_index=True)
-         @st.cache_data(ttl=3600)
-         def convert_df(extype):
-             return keywords.to_csv(index=False).encode('utf-8')
-
-         @st.cache_data(ttl=3600)
-         def convert_txt(extype):
-             return keywords.to_csv(index=False, sep='\t', lineterminator='\r').encode('utf-8')
-
-         if extype.endswith('.csv'):
-             csv = convert_df(extype)
-             st.download_button(
-                 "Press to download result 👈",
-                 csv,
-                 "scopus.csv",
-                 "text/csv")
-
-         elif extype.endswith('.txt'):
-             keywords = rev_conv_txt(extype)
-             txt = convert_txt(extype)
-             st.download_button(
-                 "Press to download result 👈",
-                 txt,
-                 "savedrecs.txt",
-                 "text/csv")
-
-     with tab2:
-         @st.cache_data(ttl=3600)
-         def table_keyword(extype):
-             keytab = key.drop(['index'], axis=1).rename(columns={0: 'label'})
-             return keytab
-         #===coloring the same keywords===
-         @st.cache_data(ttl=3600)
-         def highlight_cells(value):
-             if keytab['new'].duplicated(keep=False).any() and keytab['new'].duplicated(keep=False)[keytab['new'] == value].any():
-                 return 'background-color: yellow'
-             return ''
-         keytab = table_keyword(extype)
-         st.dataframe(keytab.style.applymap(highlight_cells, subset=['new']), use_container_width=True, hide_index=True)
-
-         @st.cache_data(ttl=3600)
-         def convert_dfs(extype):
-             return key.to_csv(index=False).encode('utf-8')
-
-         csv = convert_dfs(extype)
-
-         st.download_button(
-             "Press to download keywords 👈",
-             csv,
-             "keywords.csv",
-             "text/csv")
-
-     with tab3:
-         st.markdown('**Santosa, F. A. (2023). Prior steps into knowledge mapping: Text mining application and comparison. Issues in Science and Technology Librarianship, 102.** https://doi.org/10.29173/istl2736')
-
-     with tab4:
-         st.markdown('**Beri, A. (2021, January 27). Stemming vs Lemmatization. Medium.** https://towardsdatascience.com/stemming-vs-lemmatization-2daddabcb221')
-         st.markdown('**Khyani, D., Siddhartha B S, Niveditha N M, & Divya B M. (2020). An Interpretation of Lemmatization and Stemming in Natural Language Processing. Journal of University of Shanghai for Science and Technology, 22(10), 350–357.** https://jusst.org/an-interpretation-of-lemmatization-and-stemming-in-natural-language-processing/')
-         st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')
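
For reference, the core transformation the deleted page performed can be reproduced without the Streamlit scaffolding. This is a minimal sketch, not part of the commit; the keywords.csv file name is hypothetical, and it assumes a Scopus-style 'Author Keywords' column of '; '-separated terms.

import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

nltk.download('wordnet')

# hypothetical input: a Scopus-style export with '; '-separated keywords
df = pd.read_csv('keywords.csv')
col = 'Author Keywords'

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def normalize(cell, use_lemma=True):
    # split on '; ' so each keyword is normalized on its own,
    # mirroring the page's hyphen removal and lowercasing
    terms = str(cell).lower().replace('-', ' ').split('; ')
    word_fn = lemmatizer.lemmatize if use_lemma else stemmer.stem
    return '; '.join(' '.join(word_fn(w) for w in t.split()) for t in terms)

df[col] = df[col].fillna('').map(normalize)
df.to_csv('keywords_stemmed.csv', index=False)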
pages/1 Scattertext.py ADDED
@@ -0,0 +1,358 @@
+ import streamlit as st
+ import scattertext as stx
+ import pandas as pd
+ import re
+ import nltk
+ nltk.download('wordnet')
+ from nltk.stem import WordNetLemmatizer
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
+ import time
+ import sys
+
+ #===config===
+ st.set_page_config(
+     page_title="Coconut",
+     page_icon="🥥",
+     layout="wide",
+     initial_sidebar_state="collapsed"
+ )
+
+ hide_streamlit_style = """
+ <style>
+ #MainMenu {visibility: hidden;}
+ footer {visibility: hidden;}
+ [data-testid="collapsedControl"] {display: none}
+ </style>
+ """
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
+
+ with st.popover("🔗 Menu"):
+     st.page_link("Home.py", label="Home", icon="🏠")
+     st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
+     st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
+     st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
+     st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
+     st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
+     st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
+
+ st.header("Scattertext", anchor=False)
+ st.subheader('Put your file here...', anchor=False)
+
+ def reset_all():
+     st.cache_data.clear()
+
+ @st.cache_data(ttl=3600)
+ def get_ext(extype):
+     extype = uploaded_file.name
+     return extype
+
+ #===upload file===
+ @st.cache_data(ttl=3600)
+ def upload(extype):
+     papers = pd.read_csv(uploaded_file)
+     # lens.org exports use different column names; map them to the names used below
+     if 'Publication Year' in papers.columns:
+         papers.rename(columns={'Publication Year': 'Year', 'Citing Works Count': 'Cited by',
+                                'Publication Type': 'Document Type', 'Source Title': 'Source title'}, inplace=True)
+     return papers
+
+ @st.cache_data(ttl=3600)
+ def conv_txt(extype):
+     # map Web of Science tab-delimited field tags to readable column names
+     col_dict = {'TI': 'Title',
+                 'SO': 'Source title',
+                 'DT': 'Document Type',
+                 'AB': 'Abstract',
+                 'PY': 'Year'}
+     papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
+     papers.rename(columns=col_dict, inplace=True)
+     return papers
+
+ @st.cache_data(ttl=3600)
+ def get_data(extype):
+     df_col = sorted(papers.select_dtypes(include=['object']).columns.tolist())
+     list_title = [col for col in df_col if col.lower() == "title"]
+     abstract_pattern = re.compile(r'abstract', re.IGNORECASE)
+     list_abstract = [col for col in df_col if abstract_pattern.search(col)]
+
+     if all(col in df_col for col in list_title) and all(col in df_col for col in list_abstract):
+         selected_cols = list_abstract + list_title
+     elif all(col in df_col for col in list_title):
+         selected_cols = list_title
+     elif all(col in df_col for col in list_abstract):
+         selected_cols = list_abstract
+     else:
+         selected_cols = df_col
+
+     if not selected_cols:
+         selected_cols = df_col
+
+     return df_col, selected_cols
+
+ @st.cache_data(ttl=3600)
+ def check_comparison(extype):
+     comparison = ['Word-to-word', 'Manual label']
+
+     if any('year' in col.lower() for col in papers.columns):
+         comparison.append('Years')
+     if any('source title' in col.lower() for col in papers.columns):
+         comparison.append('Sources')
+
+     comparison.sort(reverse=False)
+     return comparison
+
+ #===clean csv===
+ @st.cache_data(ttl=3600, show_spinner=False)
+ def clean_csv(extype):
+     paper = papers.dropna(subset=[ColCho])
+
+     #===mapping===
+     paper[ColCho] = paper[ColCho].map(lambda x: x.lower())
+     if rem_punc:
+         paper[ColCho] = paper[ColCho].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
+         paper[ColCho] = paper[ColCho].str.replace('\u201c|\u201d', '', regex=True)
+     if rem_copyright:
+         paper[ColCho] = paper[ColCho].map(lambda x: re.sub('©.*', '', x))
+
+     #===stopword removal===
+     stop = stopwords.words('english')
+     paper[ColCho] = paper[ColCho].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
+
+     #===lemmatize===
+     lemmatizer = WordNetLemmatizer()
+     def lemmatize_words(text):
+         words = text.split()
+         words = [lemmatizer.lemmatize(word) for word in words]
+         return ' '.join(words)
+     paper[ColCho] = paper[ColCho].apply(lemmatize_words)
+
+     #===user-specified word removal===
+     words_rmv = [word.strip() for word in words_to_remove.split(";")]
+     remove_set = set(words_rmv)
+     def remove_words(text):
+         words = text.split()
+         cleaned_words = [word for word in words if word not in remove_set]
+         return ' '.join(cleaned_words)
+     paper[ColCho] = paper[ColCho].apply(remove_words)
+
+     return paper
+
+ @st.cache_data(ttl=3600)
+ def get_minmax(extype):
+     MIN = int(papers['Year'].min())
+     MAX = int(papers['Year'].max())
+     GAP = MAX - MIN
+     MID = round((MIN + MAX) / 2)
+     return MIN, MAX, GAP, MID
+
+ @st.cache_data(ttl=3600)
+ def running_scattertext(cat_col, catname, noncatname):
+     try:
+         corpus = stx.CorpusFromPandas(filtered_df,
+                                       category_col=cat_col,
+                                       text_col=ColCho,
+                                       nlp=stx.whitespace_nlp_with_sentences,
+                                       ).build().get_unigram_corpus().remove_infrequent_words(minimum_term_count=min_term)
+
+         st.toast('Building corpus completed', icon='🎉')
+
+         try:
+             html = stx.produce_scattertext_explorer(corpus,
+                                                     category=catname,
+                                                     category_name=catname,
+                                                     not_category_name=noncatname,
+                                                     width_in_pixels=900,
+                                                     minimum_term_frequency=0,
+                                                     metadata=filtered_df['Title'],
+                                                     save_svg_button=True)
+
+         except KeyError:
+             # no 'Title' column available for point metadata; plot without it
+             html = stx.produce_scattertext_explorer(corpus,
+                                                     category=catname,
+                                                     category_name=catname,
+                                                     not_category_name=noncatname,
+                                                     width_in_pixels=900,
+                                                     minimum_term_frequency=0,
+                                                     save_svg_button=True)
+
+         st.toast('Process completed', icon='🎉')
+         time.sleep(1)
+         st.toast('Visualizing', icon='⏳')
+         st.components.v1.html(html, height=1200, scrolling=True)
+
+     except ValueError:
+         st.warning('Please decrease the Minimum term count in the advanced settings.', icon="⚠️")
+         sys.exit()
+
+ @st.cache_data(ttl=3600)
+ def df_w2w(search_terms1, search_terms2):
+     selected_col = [ColCho]
+     dfs1 = pd.DataFrame()
+     for term in search_terms1:
+         dfs1 = pd.concat([dfs1, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
+     dfs1['Topic'] = 'First Term'
+
+     dfs2 = pd.DataFrame()
+     for term in search_terms2:
+         dfs2 = pd.concat([dfs2, paper[paper[selected_col[0]].str.contains(r'\b' + term + r'\b', case=False, na=False)]], ignore_index=True)
+     dfs2['Topic'] = 'Second Term'
+     filtered_df = pd.concat([dfs1, dfs2], ignore_index=True)
+
+     return dfs1, dfs2, filtered_df
+
+ @st.cache_data(ttl=3600)
+ def df_sources(stitle1, stitle2):
+     dfs1 = paper[paper['Source title'].str.contains(stitle1, case=False, na=False)].copy()
+     dfs1['Topic'] = stitle1
+     dfs2 = paper[paper['Source title'].str.contains(stitle2, case=False, na=False)].copy()
+     dfs2['Topic'] = stitle2
+     filtered_df = pd.concat([dfs1, dfs2], ignore_index=True)
+
+     return filtered_df
+
+ @st.cache_data(ttl=3600)
+ def df_years(first_range, second_range):
+     first_range_filter_df = paper[(paper['Year'] >= first_range[0]) & (paper['Year'] <= first_range[1])].copy()
+     first_range_filter_df['Topic Range'] = 'First range'
+
+     second_range_filter_df = paper[(paper['Year'] >= second_range[0]) & (paper['Year'] <= second_range[1])].copy()
+     second_range_filter_df['Topic Range'] = 'Second range'
+
+     filtered_df = pd.concat([first_range_filter_df, second_range_filter_df], ignore_index=True)
+
+     return filtered_df
+
+ #===Read data===
+ uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
+
+ if uploaded_file is not None:
+     extype = get_ext(uploaded_file)
+
+     if extype.endswith('.csv'):
+         papers = upload(extype)
+     elif extype.endswith('.txt'):
+         papers = conv_txt(extype)
+
+     df_col, selected_cols = get_data(extype)
+     comparison = check_comparison(extype)
+
+     #Menu
+     c1, c2, c3 = st.columns([4, 0.1, 4])
+     ColCho = c1.selectbox(
+         'Choose column to analyze',
+         (selected_cols), on_change=reset_all)
+
+     c2.write('')
+
+     compare = c3.selectbox(
+         'Type of comparison',
+         (comparison), on_change=reset_all)
+
+     with st.expander("🧮 Show advanced settings"):
+         y1, y2 = st.columns([8, 2])
+         t1, t2 = st.columns([3, 3])
+         words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
+         min_term = y2.number_input("Minimum term count", min_value=0, max_value=10, value=3, step=1, on_change=reset_all)
+         rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
+         rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)
+
+     st.info('Scattertext is computationally expensive for a large volume of text with our existing resources. Please wait until the visualization appears.', icon="ℹ️")
+
+     paper = clean_csv(extype)
+
+     tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
+
+     with tab1:
+         #===visualization===
+         if compare == 'Word-to-word':
+             col1, col2, col3 = st.columns([4, 0.1, 4])
+             text1 = col1.text_input('First Term', on_change=reset_all, placeholder='separate multiple terms with commas')
+             search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
+             col2.write('')
+             text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='separate multiple terms with commas')
+             search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]
+
+             dfs1, dfs2, filtered_df = df_w2w(search_terms1, search_terms2)
+
+             if dfs1.empty and dfs2.empty:
+                 st.warning('We cannot find anything in your document.', icon="⚠️")
+             elif dfs1.empty:
+                 st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
+             elif dfs2.empty:
+                 st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
+             else:
+                 with st.spinner('Processing. Please wait until the visualization comes up'):
+                     running_scattertext('Topic', 'First Term', 'Second Term')
+
+         elif compare == 'Manual label':
+             col1, col2, col3 = st.columns(3)
+
+             df_col_sel = sorted(paper.columns.tolist())
+
+             column_selected = col1.selectbox(
+                 'Choose column',
+                 (df_col_sel), on_change=reset_all)
+
+             list_words = paper[column_selected].values.tolist()
+             list_unique = sorted(list(set(list_words)))
+
+             if column_selected is not None:
+                 label1 = col2.selectbox(
+                     'Choose first label',
+                     (list_unique), on_change=reset_all)
+
+                 default_index = 0 if len(list_unique) == 1 else 1
+                 label2 = col3.selectbox(
+                     'Choose second label',
+                     (list_unique), on_change=reset_all, index=default_index)
+
+                 filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
+
+                 with st.spinner('Processing. Please wait until the visualization comes up'):
+                     running_scattertext(column_selected, label1, label2)
+
+         elif compare == 'Sources':
+             col1, col2, col3 = st.columns([4, 0.1, 4])
+
+             unique_stitle = set()
+             unique_stitle.update(paper['Source title'].dropna())
+             list_stitle = sorted(list(unique_stitle))
+
+             stitle1 = col1.selectbox(
+                 'Choose first label',
+                 (list_stitle), on_change=reset_all)
+             col2.write('')
+             default_index = 0 if len(list_stitle) == 1 else 1
+             stitle2 = col3.selectbox(
+                 'Choose second label',
+                 (list_stitle), on_change=reset_all, index=default_index)
+
+             filtered_df = df_sources(stitle1, stitle2)
+
+             with st.spinner('Processing. Please wait until the visualization comes up'):
+                 running_scattertext('Source title', stitle1, stitle2)
+
+         elif compare == 'Years':
+             col1, col2, col3 = st.columns([4, 0.1, 4])
+
+             MIN, MAX, GAP, MID = get_minmax(extype)
+             if GAP != 0:
+                 first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
+                 col2.write('')
+                 second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)
+
+                 filtered_df = df_years(first_range, second_range)
+
+                 with st.spinner('Processing. Please wait until the visualization comes up'):
+                     running_scattertext('Topic Range', 'First range', 'Second range')
+
+             else:
+                 st.write('You only have data for', MAX)
+
+     with tab2:
+         st.markdown('**Kessler, J.S. (2017). Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ.** https://doi.org/10.48550/arXiv.1703.00565')
+
+     with tab3:
+         st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
+         st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
+         st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.** https://doi.org/10.1108/MD-06-2023-0966')
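
The heart of the new page is the Scattertext corpus build followed by the explorer call. A minimal sketch outside Streamlit, using the same CorpusFromPandas and produce_scattertext_explorer calls the page makes; the small labeled DataFrame here is hypothetical stand-in data for the page's filtered_df.

import pandas as pd
import scattertext as stx

# hypothetical two-category sample; in the page this is filtered_df
df = pd.DataFrame({
    'category': ['First Term', 'First Term', 'Second Term', 'Second Term'],
    'text': ['visualizing how corpora differ', 'scatter plots of term frequency',
             'stemming and lemmatization of keywords', 'keyword normalization for mapping'],
})

# tokenize on whitespace, then keep unigrams only, as the page does
corpus = (stx.CorpusFromPandas(df,
                               category_col='category',
                               text_col='text',
                               nlp=stx.whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())

html = stx.produce_scattertext_explorer(corpus,
                                        category='First Term',
                                        category_name='First Term',
                                        not_category_name='Second Term',
                                        width_in_pixels=900,
                                        save_svg_button=True)

with open('scattertext.html', 'w', encoding='utf-8') as f:
    f.write(html)  # open in a browser to explore the plot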