faizhalas committed on
Commit
f365120
1 Parent(s): f4f82d3

Update pages/1 Scattertext.py

Browse files
Files changed (1) hide show
  1. pages/1 Scattertext.py +125 -120
pages/1 Scattertext.py CHANGED
@@ -42,7 +42,7 @@ st.header("Scattertext", anchor=False)
42
  st.subheader('Put your file here...', anchor=False)
43
 
44
  def reset_all():
45
- st.cache_data.clear()
46
 
47
  @st.cache_data(ttl=3600)
48
  def get_ext(extype):
@@ -229,133 +229,138 @@ def df_years(first_range, second_range):
229
  uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
230
 
231
  if uploaded_file is not None:
232
- extype = get_ext(uploaded_file)
233
-
234
- if extype.endswith('.csv'):
235
- papers = upload(extype)
236
- elif extype.endswith('.txt'):
237
- papers = conv_txt(extype)
238
-
239
- df_col, selected_cols = get_data(extype)
240
- comparison = check_comparison(extype)
241
-
242
- #Menu
243
- c1, c2, c3 = st.columns([4,0.1,4])
244
- ColCho = c1.selectbox(
245
- 'Choose column to analyze',
246
- (selected_cols), on_change=reset_all)
247
-
248
- c2.write('')
249
-
250
- compare = c3.selectbox(
251
- 'Type of comparison',
252
- (comparison), on_change=reset_all)
253
-
254
- with st.expander("🧮 Show advance settings"):
255
- y1, y2 = st.columns([8,2])
256
- t1, t2 = st.columns([3,3])
257
- words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
258
- min_term = y2.number_input("Minimum term count", min_value=0, max_value=10, value=3, step=1, on_change=reset_all)
259
- rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
260
- rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)
261
-
262
- st.info('Scattertext is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
263
-
264
- paper = clean_csv(extype)
265
-
266
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
267
-
268
- with tab1:
269
- #===visualization===
270
- if compare == 'Word-to-word':
271
- col1, col2, col3 = st.columns([4,0.1,4])
272
- text1 = col1.text_input('First Term', on_change=reset_all, placeholder='put comma if you have more than one')
273
- search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
274
- col2.write('')
275
- text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='put comma if you have more than one')
276
- search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]
277
-
278
- dfs1, dfs2, filtered_df = df_w2w(search_terms1, search_terms2)
279
-
280
- if dfs1.empty and dfs2.empty:
281
- st.warning('We cannot find anything in your document.', icon="⚠️")
282
- elif dfs1.empty:
283
- st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
284
- elif dfs2.empty:
285
- st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
286
- else:
287
- with st.spinner('Processing. Please wait until the visualization comes up'):
288
- running_scattertext('Topic', 'First Term', 'Second Term')
289
 
290
- elif compare == 'Manual label':
291
- col1, col2, col3 = st.columns(3)
 
 
292
 
293
- df_col_sel = sorted([col for col in paper.columns.tolist()])
294
-
295
- column_selected = col1.selectbox(
296
- 'Choose column',
297
- (df_col_sel), on_change=reset_all)
298
 
299
- list_words = paper[column_selected].values.tolist()
300
- list_unique = sorted(list(set(list_words)))
301
-
302
- if column_selected is not None:
303
- label1 = col2.selectbox(
304
- 'Choose first label',
305
- (list_unique), on_change=reset_all)
306
 
307
- default_index = 0 if len(list_unique) == 1 else 1
308
- label2 = col3.selectbox(
309
- 'Choose second label',
310
- (list_unique), on_change=reset_all, index=default_index)
311
 
312
- filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
313
-
314
- with st.spinner('Processing. Please wait until the visualization comes up'):
315
- running_scattertext(column_selected, label1, label2)
 
 
 
 
 
 
 
316
 
317
- elif compare == 'Sources':
318
- col1, col2, col3 = st.columns([4,0.1,4])
 
319
 
320
- unique_stitle = set()
321
- unique_stitle.update(paper['Source title'].dropna())
322
- list_stitle = sorted(list(unique_stitle))
323
-
324
- stitle1 = col1.selectbox(
325
- 'Choose first label',
326
- (list_stitle), on_change=reset_all)
327
- col2.write('')
328
- default_index = 0 if len(list_stitle) == 1 else 1
329
- stitle2 = col3.selectbox(
330
- 'Choose second label',
331
- (list_stitle), on_change=reset_all, index=default_index)
332
 
333
- filtered_df = df_sources(stitle1, stitle2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
- with st.spinner('Processing. Please wait until the visualization comes up'):
336
- running_scattertext('Source title', stitle1, stitle2)
337
 
338
- elif compare == 'Years':
339
- col1, col2, col3 = st.columns([4,0.1,4])
340
-
341
- MIN, MAX, GAP, MID = get_minmax(extype)
342
- if (GAP != 0):
343
- first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
344
- col2.write('')
345
- second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)
346
-
347
- filtered_df = df_years(first_range, second_range)
348
 
349
- with st.spinner('Processing. Please wait until the visualization comes up'):
350
- running_scattertext('Topic Range', 'First range', 'Second range')
351
-
352
- else:
353
- st.write('You only have data in ', (MAX))
354
-
355
- with tab2:
356
- st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')
357
-
358
- with tab3:
359
- st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
360
- st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
361
- st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.** https://doi.org/10.1108/MD-06-2023-0966')
 
42
  st.subheader('Put your file here...', anchor=False)
43
 
44
  def reset_all():
45
+ st.cache_data.clear()
46
 
47
  @st.cache_data(ttl=3600)
48
  def get_ext(extype):
 
229
  uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
230
 
231
  if uploaded_file is not None:
232
+ try:
233
+ extype = get_ext(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
+ if extype.endswith('.csv'):
236
+ papers = upload(extype)
237
+ elif extype.endswith('.txt'):
238
+ papers = conv_txt(extype)
239
 
240
+ df_col, selected_cols = get_data(extype)
241
+ comparison = check_comparison(extype)
 
 
 
242
 
243
+ #Menu
244
+ c1, c2, c3 = st.columns([4,0.1,4])
245
+ ColCho = c1.selectbox(
246
+ 'Choose column to analyze',
247
+ (selected_cols), on_change=reset_all)
 
 
248
 
249
+ c2.write('')
 
 
 
250
 
251
+ compare = c3.selectbox(
252
+ 'Type of comparison',
253
+ (comparison), on_change=reset_all)
254
+
255
+ with st.expander("🧮 Show advance settings"):
256
+ y1, y2 = st.columns([8,2])
257
+ t1, t2 = st.columns([3,3])
258
+ words_to_remove = y1.text_input('Input your text', on_change=reset_all, placeholder='Remove specific words. Separate words by semicolons (;)')
259
+ min_term = y2.number_input("Minimum term count", min_value=0, max_value=10, value=3, step=1, on_change=reset_all)
260
+ rem_copyright = t1.toggle('Remove copyright statement', value=True, on_change=reset_all)
261
+ rem_punc = t2.toggle('Remove punctuation', value=False, on_change=reset_all)
262
 
263
+ st.info('Scattertext is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
264
+
265
+ paper = clean_csv(extype)
266
 
267
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
 
 
 
 
 
 
 
 
 
 
 
268
 
269
+ with tab1:
270
+ #===visualization===
271
+ if compare == 'Word-to-word':
272
+ col1, col2, col3 = st.columns([4,0.1,4])
273
+ text1 = col1.text_input('First Term', on_change=reset_all, placeholder='put comma if you have more than one')
274
+ search_terms1 = [term.strip() for term in text1.split(",") if term.strip()]
275
+ col2.write('')
276
+ text2 = col3.text_input('Second Term', on_change=reset_all, placeholder='put comma if you have more than one')
277
+ search_terms2 = [term.strip() for term in text2.split(",") if term.strip()]
278
+
279
+ dfs1, dfs2, filtered_df = df_w2w(search_terms1, search_terms2)
280
+
281
+ if dfs1.empty and dfs2.empty:
282
+ st.warning('We cannot find anything in your document.', icon="⚠️")
283
+ elif dfs1.empty:
284
+ st.warning(f'We cannot find {text1} in your document.', icon="⚠️")
285
+ elif dfs2.empty:
286
+ st.warning(f'We cannot find {text2} in your document.', icon="⚠️")
287
+ else:
288
+ with st.spinner('Processing. Please wait until the visualization comes up'):
289
+ running_scattertext('Topic', 'First Term', 'Second Term')
290
+
291
+ elif compare == 'Manual label':
292
+ col1, col2, col3 = st.columns(3)
293
+
294
+ df_col_sel = sorted([col for col in paper.columns.tolist()])
295
+
296
+ column_selected = col1.selectbox(
297
+ 'Choose column',
298
+ (df_col_sel), on_change=reset_all)
299
+
300
+ list_words = paper[column_selected].values.tolist()
301
+ list_unique = sorted(list(set(list_words)))
302
+
303
+ if column_selected is not None:
304
+ label1 = col2.selectbox(
305
+ 'Choose first label',
306
+ (list_unique), on_change=reset_all)
307
+
308
+ default_index = 0 if len(list_unique) == 1 else 1
309
+ label2 = col3.selectbox(
310
+ 'Choose second label',
311
+ (list_unique), on_change=reset_all, index=default_index)
312
+
313
+ filtered_df = paper[paper[column_selected].isin([label1, label2])].reset_index(drop=True)
314
+
315
+ with st.spinner('Processing. Please wait until the visualization comes up'):
316
+ running_scattertext(column_selected, label1, label2)
317
+
318
+ elif compare == 'Sources':
319
+ col1, col2, col3 = st.columns([4,0.1,4])
320
+
321
+ unique_stitle = set()
322
+ unique_stitle.update(paper['Source title'].dropna())
323
+ list_stitle = sorted(list(unique_stitle))
324
+
325
+ stitle1 = col1.selectbox(
326
+ 'Choose first label',
327
+ (list_stitle), on_change=reset_all)
328
+ col2.write('')
329
+ default_index = 0 if len(list_stitle) == 1 else 1
330
+ stitle2 = col3.selectbox(
331
+ 'Choose second label',
332
+ (list_stitle), on_change=reset_all, index=default_index)
333
+
334
+ filtered_df = df_sources(stitle1, stitle2)
335
+
336
+ with st.spinner('Processing. Please wait until the visualization comes up'):
337
+ running_scattertext('Source title', stitle1, stitle2)
338
+
339
+ elif compare == 'Years':
340
+ col1, col2, col3 = st.columns([4,0.1,4])
341
+
342
+ MIN, MAX, GAP, MID = get_minmax(extype)
343
+ if (GAP != 0):
344
+ first_range = col1.slider('First Range', min_value=MIN, max_value=MAX, value=(MIN, MID), on_change=reset_all)
345
+ col2.write('')
346
+ second_range = col3.slider('Second Range', min_value=MIN, max_value=MAX, value=(MID, MAX), on_change=reset_all)
347
+
348
+ filtered_df = df_years(first_range, second_range)
349
+
350
+ with st.spinner('Processing. Please wait until the visualization comes up'):
351
+ running_scattertext('Topic Range', 'First range', 'Second range')
352
 
353
+ else:
354
+ st.write('You only have data in ', (MAX))
355
 
356
+ with tab2:
357
+ st.markdown('**Jason Kessler. 2017. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. In Proceedings of ACL 2017, System Demonstrations, pages 85–90, Vancouver, Canada. Association for Computational Linguistics.** https://doi.org/10.48550/arXiv.1703.00565')
 
 
 
 
 
 
 
 
358
 
359
+ with tab3:
360
+ st.markdown('**Marrone, M., & Linnenluecke, M.K. (2020). Interdisciplinary Research Maps: A new technique for visualizing research topics. PLoS ONE, 15.** https://doi.org/10.1371/journal.pone.0242283')
361
+ st.markdown('**Moreno, A., & Iglesias, C.A. (2021). Understanding Customers’ Transport Services with Topic Clustering and Sentiment Analysis. Applied Sciences.** https://doi.org/10.3390/app112110169')
362
+ st.markdown('**Sánchez-Franco, M.J., & Rey-Tienda, S. (2023). The role of user-generated content in tourism decision-making: an exemplary study of Andalusia, Spain. Management Decision.** https://doi.org/10.1108/MD-06-2023-0966')
363
+
364
+ except:
365
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
366
+ st.stop()