Spaces:
Running
Running
Update pages/1 Scattertext.py
Browse files- pages/1 Scattertext.py +125 -120
pages/1 Scattertext.py
CHANGED
@@ -42,7 +42,7 @@ st.header("Scattertext", anchor=False)
|
|
42 |
st.subheader('Put your file here...', anchor=False)
|
43 |
|
44 |
def reset_all():
|
45 |
-
|
46 |
|
47 |
@st.cache_data(ttl=3600)
|
48 |
def get_ext(extype):
|
@@ -229,133 +229,138 @@ def df_years(first_range, second_range):
|
|
229 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
230 |
|
231 |
if uploaded_file is not None:
|
232 |
-
|
233 |
-
|
234 |
-
if extype.endswith('.csv'):
|
235 |
-
papers = upload(extype)
|
236 |
-
elif extype.endswith('.txt'):
|
237 |
-
papers = conv_txt(extype)
|
238 |
-
|
239 |
-
df_col, selected_cols = get_data(extype)
|
240 |
-
comparison = check_comparison(extype)
|
241 |
-
|
242 |
-
#Menu
|
243 |
-
c1, c2, c3 = st.columns([4,0.1,4])
|
244 |
-
ColCho = c1.selectbox(
|
245 |
-
'Choose column to analyze',
|
246 |
-
(selected_cols), on_change=reset_all)
|
247 |
-
|
248 |
-
c2.write('')
|
249 |
-
|
250 |
-
compare = c3.selectbox(
|
251 |
-
'Type of comparison',
|
252 |
-
(comparison), on_change=reset_all)
|
253 |
-
|
254 |
-
with st.expander("🧮 Show advance settings"):
|
255 |
-
y1, y2 = st.columns([8,2])
|
256 |
-
t1, t2 = st.columns([3,3])
|
257 |
-
words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
|
258 |
-
min_term = y2.number_input("Minimum term count", min_value=0, max_value=10, value=3, step=1, on_change=reset_all)
|
259 |
-
rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
|
260 |
-
rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)
|
261 |
-
|
262 |
-
st.info('Scattertext is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
|
263 |
-
|
264 |
-
paper = clean_csv(extype)
|
265 |
-
|
266 |
-
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
267 |
-
|
268 |
-
with tab1:
|
269 |
-
#===visualization===
|
270 |
-
if compare == 'Word-to-word':
|
271 |
-
col1, col2, col3 = st.columns([4,0.1,4])
|
272 |
-
text1 = col1.text_input('First Term', on_change=reset_all, placeholder='put comma if you have more than one')
|
273 |
-
search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
|
274 |
-
col2.write('')
|
275 |
-
text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='put comma if you have more than one')
|
276 |
-
search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]
|
277 |
-
|
278 |
-
dfs1, dfs2, filtered_df = df_w2w(search_terms1, search_terms2)
|
279 |
-
|
280 |
-
if dfs1.empty and dfs2.empty:
|
281 |
-
st.warning('We cannot find anything in your document.', icon="⚠️")
|
282 |
-
elif dfs1.empty:
|
283 |
-
st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
|
284 |
-
elif dfs2.empty:
|
285 |
-
st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
|
286 |
-
else:
|
287 |
-
with st.spinner('Processing. Please wait until the visualization comes up'):
|
288 |
-
running_scattertext('Topic', 'First Term', 'Second Term')
|
289 |
|
290 |
-
|
291 |
-
|
|
|
|
|
292 |
|
293 |
-
|
294 |
-
|
295 |
-
column_selected = col1.selectbox(
|
296 |
-
'Choose column',
|
297 |
-
(df_col_sel), on_change=reset_all)
|
298 |
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
'Choose first label',
|
305 |
-
(list_unique), on_change=reset_all)
|
306 |
|
307 |
-
|
308 |
-
label2 = col3.selectbox(
|
309 |
-
'Choose second label',
|
310 |
-
(list_unique), on_change=reset_all, index=default_index)
|
311 |
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
-
|
318 |
-
|
|
|
319 |
|
320 |
-
|
321 |
-
unique_stitle.update(paper['Source title'].dropna())
|
322 |
-
list_stitle = sorted(list(unique_stitle))
|
323 |
-
|
324 |
-
stitle1 = col1.selectbox(
|
325 |
-
'Choose first label',
|
326 |
-
(list_stitle), on_change=reset_all)
|
327 |
-
col2.write('')
|
328 |
-
default_index = 0 if len(list_stitle) == 1 else 1
|
329 |
-
stitle2 = col3.selectbox(
|
330 |
-
'Choose second label',
|
331 |
-
(list_stitle), on_change=reset_all, index=default_index)
|
332 |
|
333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
|
335 |
-
|
336 |
-
|
337 |
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
MIN, MAX, GAP, MID = get_minmax(extype)
|
342 |
-
if (GAP != 0):
|
343 |
-
first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
|
344 |
-
col2.write('')
|
345 |
-
second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)
|
346 |
-
|
347 |
-
filtered_df = df_years(first_range, second_range)
|
348 |
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
st.
|
357 |
-
|
358 |
-
with tab3:
|
359 |
-
st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
|
360 |
-
st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
|
361 |
-
st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.** https://doi.org/10.1108/MD-06-2023-0966')
|
|
|
42 |
st.subheader('Put your file here...', anchor=False)
|
43 |
|
44 |
def reset_all():
    """Clear all ``st.cache_data`` caches.

    Passed as the ``on_change`` callback to the page's input widgets, so
    cached results are dropped whenever one of those inputs changes.
    """
    st.cache_data.clear()
|
46 |
|
47 |
@st.cache_data(ttl=3600)
|
48 |
def get_ext(extype):
|
|
|
229 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)

# Main page flow: once a file is uploaded, load it, render the option widgets,
# build the two groups to compare and draw the Scattertext plot.  Any ordinary
# error raised along the way is reported as a generic "check your file" message.
# NOTE(review): the module-level names assigned below (papers, ColCho,
# words_to_remove, min_term, rem_copyright, rem_punc, filtered_df, ...) are not
# all read in this section; presumably the helper functions defined elsewhere
# in the file read them as globals -- confirm before removing any of them.
if uploaded_file is not None:
    try:
        extype = get_ext(uploaded_file)

        # Pick the loader from the suffix of the value returned by get_ext().
        if extype.endswith('.csv'):
            papers = upload(extype)
        elif extype.endswith('.txt'):
            papers = conv_txt(extype)

        df_col, selected_cols = get_data(extype)
        comparison = check_comparison(extype)

        #Menu
        c1, c2, c3 = st.columns([4,0.1,4])
        ColCho = c1.selectbox(
            'Choose column to analyze',
            (selected_cols), on_change=reset_all)

        # Narrow middle column is only a spacer.
        c2.write('')

        compare = c3.selectbox(
            'Type of comparison',
            (comparison), on_change=reset_all)

        with st.expander("🧮 Show advance settings"):
            y1, y2 = st.columns([8,2])
            t1, t2 = st.columns([3,3])
            words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
            min_term = y2.number_input("Minimum term count", min_value=0, max_value=10, value=3, step=1, on_change=reset_all)
            rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
            rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)

        # NOTE(review): assumed to sit outside the expander -- confirm.
        st.info('Scattertext is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")

        paper = clean_csv(extype)

        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])

        with tab1:
            #===visualization===
            if compare == 'Word-to-word':
                col1, col2, col3 = st.columns([4,0.1,4])
                text1 = col1.text_input('First Term', on_change=reset_all, placeholder='put comma if you have more than one')
                # Comma-separated input -> list of non-empty, stripped terms.
                search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
                col2.write('')
                text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='put comma if you have more than one')
                search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]

                dfs1, dfs2, filtered_df = df_w2w(search_terms1, search_terms2)

                # Only plot when both term groups matched something.
                if dfs1.empty and dfs2.empty:
                    st.warning('We cannot find anything in your document.', icon="⚠️")
                elif dfs1.empty:
                    st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
                elif dfs2.empty:
                    st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
                else:
                    with st.spinner('Processing. Please wait until the visualization comes up'):
                        running_scattertext('Topic', 'First Term', 'Second Term')

            elif compare == 'Manual label':
                col1, col2, col3 = st.columns(3)

                df_col_sel = sorted(paper.columns.tolist())

                column_selected = col1.selectbox(
                    'Choose column',
                    (df_col_sel), on_change=reset_all)

                # Check the selection before indexing the frame with it:
                # selectbox yields None when nothing can be selected.
                if column_selected is not None:
                    # Drop missing values first (as the 'Sources' branch does):
                    # NaN mixed with strings makes sorted() raise, and NaN
                    # values never compare equal so a set would not dedupe them.
                    list_words = paper[column_selected].dropna().tolist()
                    list_unique = sorted(set(list_words))

                    label1 = col2.selectbox(
                        'Choose first label',
                        (list_unique), on_change=reset_all)

                    # Preselect the second value when there is one; otherwise
                    # fall back to index 0 (also for an empty list).
                    default_index = 1 if len(list_unique) > 1 else 0
                    label2 = col3.selectbox(
                        'Choose second label',
                        (list_unique), on_change=reset_all, index=default_index)

                    filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)

                    with st.spinner('Processing. Please wait until the visualization comes up'):
                        running_scattertext(column_selected, label1, label2)

            elif compare == 'Sources':
                col1, col2, col3 = st.columns([4,0.1,4])

                unique_stitle = set(paper['Source title'].dropna())
                list_stitle = sorted(unique_stitle)

                stitle1 = col1.selectbox(
                    'Choose first label',
                    (list_stitle), on_change=reset_all)
                col2.write('')
                # Same default rule as the 'Manual label' branch.
                default_index = 1 if len(list_stitle) > 1 else 0
                stitle2 = col3.selectbox(
                    'Choose second label',
                    (list_stitle), on_change=reset_all, index=default_index)

                filtered_df = df_sources(stitle1, stitle2)

                with st.spinner('Processing. Please wait until the visualization comes up'):
                    running_scattertext('Source title', stitle1, stitle2)

            elif compare == 'Years':
                col1, col2, col3 = st.columns([4,0.1,4])

                MIN, MAX, GAP, MID = get_minmax(extype)
                # Two ranges are only offered when GAP is non-zero; otherwise
                # the single value MAX is reported instead.
                if (GAP != 0):
                    first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
                    col2.write('')
                    second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)

                    filtered_df = df_years(first_range, second_range)

                    with st.spinner('Processing. Please wait until the visualization comes up'):
                        running_scattertext('Topic Range', 'First range', 'Second range')

                else:
                    st.write('You only have data in ', (MAX))

        with tab2:
            st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')

        with tab3:
            st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
            st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
            st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.** https://doi.org/10.1108/MD-06-2023-0966')

    except Exception:
        # Deliberately not a bare ``except:``: that would also trap
        # BaseException-based signals (KeyboardInterrupt, SystemExit and any
        # control-flow exception built on BaseException) and misreport them
        # as a bad input file.
        st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
        st.stop()
|
|
|
|
|
|
|
|
|
|