faizhalas commited on
Commit
89a3b96
1 Parent(s): f365120

Update pages/2 Topic Modeling.py

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +312 -308
pages/2 Topic Modeling.py CHANGED
@@ -87,14 +87,14 @@ def get_ext(uploaded_file):
87
  #===clear cache===
88
 
89
  def reset_biterm():
90
- try:
91
- biterm_map.clear()
92
- biterm_bar.clear()
93
- except NameError:
94
- biterm_topic.clear()
95
 
96
  def reset_all():
97
- st.cache_data.clear()
98
 
99
  #===avoiding deadlock===
100
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -121,321 +121,325 @@ def conv_txt(extype):
121
  uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
122
 
123
  if uploaded_file is not None:
124
- extype = get_ext(uploaded_file)
125
-
126
- if extype.endswith('.csv'):
127
- papers = upload(extype)
128
- elif extype.endswith('.txt'):
129
- papers = conv_txt(extype)
130
-
131
- coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
132
-
133
- c1, c2 = st.columns([3,4])
134
- method = c1.selectbox(
135
- 'Choose method',
136
- ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
137
- num_cho = c1.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
138
- ColCho = c2.selectbox(
139
- 'Choose column',
140
- (coldf), on_change=reset_all)
141
- words_to_remove = c2.text_input("Remove specific words. Separate words by semicolons (;)")
142
- rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
143
- rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
144
-
145
- #===clean csv===
146
- @st.cache_data(ttl=3600, show_spinner=False)
147
- def clean_csv(extype):
148
- paper = papers.dropna(subset=[ColCho])
149
-
150
- #===mapping===
151
- paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
152
- if rem_punc:
153
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
154
- paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
155
- if rem_copyright:
156
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- #===stopword removal===
159
- stop = stopwords.words('english')
160
- paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
161
-
162
- #===lemmatize===
163
- lemmatizer = WordNetLemmatizer()
164
- def lemmatize_words(text):
165
- words = text.split()
166
- words = [lemmatizer.lemmatize(word) for word in words]
167
- return ' '.join(words)
168
- paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
- words_rmv = [word.strip() for word in words_to_remove.split(";")]
171
- remove_dict = {word: None for word in words_rmv}
172
- def remove_words(text):
173
- words = text.split()
174
- cleaned_words = [word for word in words if word not in remove_dict]
175
- return ' '.join(cleaned_words)
176
- paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
 
 
 
 
 
 
 
 
177
 
178
- topic_abs = paper.Abstract_lem.values.tolist()
179
- return topic_abs, paper
180
-
181
- d1, d2 = st.columns([7,3])
182
- d2.info("Don't do anything during the computing", icon="⚠️")
183
- topic_abs, paper=clean_csv(extype)
184
-
185
- #===advance settings===
186
- with d1.expander("🧮 Show advance settings"):
187
- t1, t2 = st.columns([5,5])
188
- if method == 'pyLDA':
189
- py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
190
- py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
191
- elif method == 'Biterm':
192
- btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
193
- btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
194
- elif method == 'BERTopic':
195
- bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
196
- bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
197
- bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
198
- bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
199
- bert_embedding_model = st.radio(
200
- "embedding_model",
201
- ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_md"], index=0, horizontal=True)
202
- else:
203
- st.write('Please choose your preferred method')
204
- if st.button("Submit", on_click=reset_all):
205
- num_topic = num_cho
206
-
207
- if method == 'BERTopic':
208
- st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
209
-
210
- #===topic===
211
- if method == 'Choose...':
212
- st.write('')
213
-
214
- elif method == 'pyLDA':
215
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
216
-
217
- with tab1:
218
- #===visualization===
219
- @st.cache_data(ttl=3600, show_spinner=False)
220
- def pylda(extype):
221
- topic_abs_LDA = [t.split(' ') for t in topic_abs]
222
- id2word = Dictionary(topic_abs_LDA)
223
- corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
224
- #===LDA===
225
- lda_model = LdaModel(corpus=corpus,
226
- id2word=id2word,
227
- num_topics=num_topic,
228
- random_state=py_random_state,
229
- chunksize=py_chunksize,
230
- alpha='auto',
231
- per_word_topics=True)
232
-
233
- pprint(lda_model.print_topics())
234
- doc_lda = lda_model[corpus]
235
-
236
- #===visualization===
237
- coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
238
- coherence_lda = coherence_model_lda.get_coherence()
239
- vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
240
- py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
241
- return py_lda_vis_html, coherence_lda, vis
242
-
243
- with st.spinner('Performing computations. Please wait ...'):
244
- try:
245
  py_lda_vis_html, coherence_lda, vis = pylda(extype)
246
  st.write('Coherence score: ', coherence_lda)
247
  components.html(py_lda_vis_html, width=1500, height=800)
248
  st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
249
-
250
  @st.cache_data(ttl=3600, show_spinner=False)
251
  def img_lda(vis):
252
- pyLDAvis.save_html(vis, 'output.html')
253
- hti = Html2Image()
254
- hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
255
- css = "body {background: white;}"
256
- hti.screenshot(
257
- other_file='output.html', css_str=css, size=(1500, 800),
258
- save_as='ldavis_img.png'
259
- )
260
-
261
  img_lda(vis)
262
  with open("ldavis_img.png", "rb") as file:
263
- btn = st.download_button(
264
- label="Download image",
265
- data=file,
266
- file_name="ldavis_img.png",
267
- mime="image/png"
268
- )
269
-
270
- except NameError:
271
  st.warning('🖱️ Please click Submit')
272
-
273
- with tab2:
274
- st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
275
-
276
- with tab3:
277
- st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
278
- st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
279
- st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
280
- st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
281
-
282
- #===Biterm===
283
- elif method == 'Biterm':
284
-
285
- #===optimize Biterm===
286
- @st.cache_data(ttl=3600, show_spinner=False)
287
- def biterm_topic(extype):
288
- X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
289
- tf = np.array(X.sum(axis=0)).ravel()
290
- docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
291
- docs_lens = list(map(len, docs_vec))
292
- biterms = btm.get_biterms(docs_vec)
293
- model = btm.BTM(
294
- X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
295
- model.fit(biterms, iterations=btm_iterations)
296
- p_zd = model.transform(docs_vec)
297
- coherence = model.coherence_
298
- phi = tmp.get_phi(model)
299
- topics_coords = tmp.prepare_coords(model)
300
- totaltop = topics_coords.label.values.tolist()
301
- perplexity = model.perplexity_
302
- return topics_coords, phi, totaltop, perplexity
303
-
304
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
305
- with tab1:
306
- try:
307
- with st.spinner('Performing computations. Please wait ...'):
308
- topics_coords, phi, totaltop, perplexity = biterm_topic(extype)
309
- col1, col2 = st.columns([4,6])
310
-
311
- @st.cache_data(ttl=3600)
312
- def biterm_map(extype):
313
- btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
314
- return btmvis_coords
315
-
316
- @st.cache_data(ttl=3600)
317
- def biterm_bar(extype):
318
- terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
319
- btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
320
- return btmvis_probs
321
-
322
- with col1:
323
- st.write('Perplexity score: ', perplexity)
324
- st.write('')
325
- numvis = st.selectbox(
326
- 'Choose topic',
327
- (totaltop), on_change=reset_biterm)
328
- btmvis_coords = biterm_map(extype)
329
- st.altair_chart(btmvis_coords)
330
- with col2:
331
- btmvis_probs = biterm_bar(extype)
332
- st.altair_chart(btmvis_probs, use_container_width=True)
333
-
334
- except ValueError:
335
- st.error('🙇‍♂️ Please raise the number of topics and click submit')
336
- except NameError:
337
- st.warning('🖱️ Please click Submit')
338
-
339
- with tab2:
340
- st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
341
- with tab3:
342
- st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
343
- st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
344
- st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
345
- st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
346
-
347
- #===BERTopic===
348
- elif method == 'BERTopic':
349
- @st.cache_data(ttl=3600, show_spinner=False)
350
- def bertopic_vis(extype):
351
- umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
352
- min_dist=0.0, metric='cosine', random_state=bert_random_state)
353
- cluster_model = KMeans(n_clusters=num_topic)
354
- if bert_embedding_model == 'all-MiniLM-L6-v2':
355
- emb_mod = 'all-MiniLM-L6-v2'
356
- lang = 'en'
357
- elif bert_embedding_model == 'en_core_web_md':
358
- emb_mod = en_core_web_md.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
359
- lang = 'en'
360
- elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
361
- emb_mod = 'paraphrase-multilingual-MiniLM-L12-v2'
362
- lang = 'multilingual'
363
- topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
364
- topics, probs = topic_model.fit_transform(topic_abs)
365
- return topic_model, topics, probs
366
-
367
- @st.cache_data(ttl=3600, show_spinner=False)
368
- def Vis_Topics(extype):
369
- fig1 = topic_model.visualize_topics()
370
- return fig1
 
 
 
 
 
 
 
 
 
371
 
372
- @st.cache_data(ttl=3600, show_spinner=False)
373
- def Vis_Documents(extype):
374
- fig2 = topic_model.visualize_documents(topic_abs)
375
- return fig2
376
-
377
- @st.cache_data(ttl=3600, show_spinner=False)
378
- def Vis_Hierarchy(extype):
379
- fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic)
380
- return fig3
381
 
382
- @st.cache_data(ttl=3600, show_spinner=False)
383
- def Vis_Heatmap(extype):
384
- global topic_model
385
- fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000)
386
- return fig4
387
-
388
- @st.cache_data(ttl=3600, show_spinner=False)
389
- def Vis_Barchart(extype):
390
- fig5 = topic_model.visualize_barchart(top_n_topics=num_topic)
391
- return fig5
392
-
393
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
394
- with tab1:
395
- try:
396
- with st.spinner('Performing computations. Please wait ...'):
397
-
398
- topic_model, topics, probs = bertopic_vis(extype)
399
- time.sleep(.5)
400
- st.toast('Visualize Topics', icon='🏃')
401
- fig1 = Vis_Topics(extype)
402
-
403
- time.sleep(.5)
404
- st.toast('Visualize Document', icon='🏃')
405
- fig2 = Vis_Documents(extype)
406
-
407
- time.sleep(.5)
408
- st.toast('Visualize Document Hierarchy', icon='🏃')
409
- fig3 = Vis_Hierarchy(extype)
410
-
411
- time.sleep(.5)
412
- st.toast('Visualize Topic Similarity', icon='🏃')
413
- fig4 = Vis_Heatmap(extype)
414
-
415
- time.sleep(.5)
416
- st.toast('Visualize Terms', icon='🏃')
417
- fig5 = Vis_Barchart(extype)
418
 
419
- with st.expander("Visualize Topics"):
420
- st.write(fig1)
421
- with st.expander("Visualize Terms"):
422
- st.write(fig5)
423
- with st.expander("Visualize Documents"):
424
- st.write(fig2)
425
- with st.expander("Visualize Document Hierarchy"):
426
- st.write(fig3)
427
- with st.expander("Visualize Topic Similarity"):
428
- st.write(fig4)
429
-
430
- except ValueError:
431
- st.error('🙇‍♂️ Please raise the number of topics and click submit')
432
-
433
- except NameError:
434
- st.warning('🖱️ Please click Submit')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
 
436
- with tab2:
437
- st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
438
-
439
- with tab3:
440
- st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
441
- st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
 
87
  #===clear cache===
88
 
89
  def reset_biterm():
90
+ try:
91
+ biterm_map.clear()
92
+ biterm_bar.clear()
93
+ except NameError:
94
+ biterm_topic.clear()
95
 
96
  def reset_all():
97
+ st.cache_data.clear()
98
 
99
  #===avoiding deadlock===
100
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
121
  uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
122
 
123
  if uploaded_file is not None:
124
+ try:
125
+ extype = get_ext(uploaded_file)
126
+
127
+ if extype.endswith('.csv'):
128
+ papers = upload(extype)
129
+ elif extype.endswith('.txt'):
130
+ papers = conv_txt(extype)
131
+
132
+ coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
133
+
134
+ c1, c2 = st.columns([3,4])
135
+ method = c1.selectbox(
136
+ 'Choose method',
137
+ ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
138
+ num_cho = c1.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
139
+ ColCho = c2.selectbox(
140
+ 'Choose column',
141
+ (coldf), on_change=reset_all)
142
+ words_to_remove = c2.text_input("Remove specific words. Separate words by semicolons (;)")
143
+ rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
144
+ rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
145
+
146
+ #===clean csv===
147
+ @st.cache_data(ttl=3600, show_spinner=False)
148
+ def clean_csv(extype):
149
+ paper = papers.dropna(subset=[ColCho])
150
+
151
+ #===mapping===
152
+ paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
153
+ if rem_punc:
154
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('[,:;\.!-?•=]', ' ', x))
155
+ paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
156
+ if rem_copyright:
157
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
158
+
159
+ #===stopword removal===
160
+ stop = stopwords.words('english')
161
+ paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
162
+
163
+ #===lemmatize===
164
+ lemmatizer = WordNetLemmatizer()
165
+ def lemmatize_words(text):
166
+ words = text.split()
167
+ words = [lemmatizer.lemmatize(word) for word in words]
168
+ return ' '.join(words)
169
+ paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
170
 
171
+ words_rmv = [word.strip() for word in words_to_remove.split(";")]
172
+ remove_dict = {word: None for word in words_rmv}
173
+ def remove_words(text):
174
+ words = text.split()
175
+ cleaned_words = [word for word in words if word not in remove_dict]
176
+ return ' '.join(cleaned_words)
177
+ paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
178
+
179
+ topic_abs = paper.Abstract_lem.values.tolist()
180
+ return topic_abs, paper
181
+
182
+ d1, d2 = st.columns([7,3])
183
+ d2.info("Don't do anything during the computing", icon="⚠️")
184
+ topic_abs, paper=clean_csv(extype)
185
+
186
+ #===advance settings===
187
+ with d1.expander("🧮 Show advance settings"):
188
+ t1, t2 = st.columns([5,5])
189
+ if method == 'pyLDA':
190
+ py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
191
+ py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
192
+ elif method == 'Biterm':
193
+ btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
194
+ btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
195
+ elif method == 'BERTopic':
196
+ bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
197
+ bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
198
+ bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
199
+ bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
200
+ bert_embedding_model = st.radio(
201
+ "embedding_model",
202
+ ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_md"], index=0, horizontal=True)
203
+ else:
204
+ st.write('Please choose your preferred method')
205
+ if st.button("Submit", on_click=reset_all):
206
+ num_topic = num_cho
207
+
208
+ if method == 'BERTopic':
209
+ st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
210
+
211
+ #===topic===
212
+ if method == 'Choose...':
213
+ st.write('')
214
+
215
+ elif method == 'pyLDA':
216
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
217
 
218
+ with tab1:
219
+ #===visualization===
220
+ @st.cache_data(ttl=3600, show_spinner=False)
221
+ def pylda(extype):
222
+ topic_abs_LDA = [t.split(' ') for t in topic_abs]
223
+ id2word = Dictionary(topic_abs_LDA)
224
+ corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
225
+ #===LDA===
226
+ lda_model = LdaModel(corpus=corpus,
227
+ id2word=id2word,
228
+ num_topics=num_topic,
229
+ random_state=py_random_state,
230
+ chunksize=py_chunksize,
231
+ alpha='auto',
232
+ per_word_topics=True)
233
 
234
+ pprint(lda_model.print_topics())
235
+ doc_lda = lda_model[corpus]
236
+
237
+ #===visualization===
238
+ coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
239
+ coherence_lda = coherence_model_lda.get_coherence()
240
+ vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
241
+ py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
242
+ return py_lda_vis_html, coherence_lda, vis
243
+
244
+ with st.spinner('Performing computations. Please wait ...'):
245
+ try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  py_lda_vis_html, coherence_lda, vis = pylda(extype)
247
  st.write('Coherence score: ', coherence_lda)
248
  components.html(py_lda_vis_html, width=1500, height=800)
249
  st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
250
+
251
  @st.cache_data(ttl=3600, show_spinner=False)
252
  def img_lda(vis):
253
+ pyLDAvis.save_html(vis, 'output.html')
254
+ hti = Html2Image()
255
+ hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
256
+ css = "body {background: white;}"
257
+ hti.screenshot(
258
+ other_file='output.html', css_str=css, size=(1500, 800),
259
+ save_as='ldavis_img.png'
260
+ )
261
+
262
  img_lda(vis)
263
  with open("ldavis_img.png", "rb") as file:
264
+ btn = st.download_button(
265
+ label="Download image",
266
+ data=file,
267
+ file_name="ldavis_img.png",
268
+ mime="image/png"
269
+ )
270
+
271
+ except NameError:
272
  st.warning('🖱️ Please click Submit')
273
+
274
+ with tab2:
275
+ st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
276
+
277
+ with tab3:
278
+ st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
279
+ st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
280
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
281
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
282
+
283
+ #===Biterm===
284
+ elif method == 'Biterm':
285
+
286
+ #===optimize Biterm===
287
+ @st.cache_data(ttl=3600, show_spinner=False)
288
+ def biterm_topic(extype):
289
+ X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
290
+ tf = np.array(X.sum(axis=0)).ravel()
291
+ docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
292
+ docs_lens = list(map(len, docs_vec))
293
+ biterms = btm.get_biterms(docs_vec)
294
+ model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
295
+ model.fit(biterms, iterations=btm_iterations)
296
+ p_zd = model.transform(docs_vec)
297
+ coherence = model.coherence_
298
+ phi = tmp.get_phi(model)
299
+ topics_coords = tmp.prepare_coords(model)
300
+ totaltop = topics_coords.label.values.tolist()
301
+ perplexity = model.perplexity_
302
+ return topics_coords, phi, totaltop, perplexity
303
+
304
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
305
+ with tab1:
306
+ try:
307
+ with st.spinner('Performing computations. Please wait ...'):
308
+ topics_coords, phi, totaltop, perplexity = biterm_topic(extype)
309
+ col1, col2 = st.columns([4,6])
310
+
311
+ @st.cache_data(ttl=3600)
312
+ def biterm_map(extype):
313
+ btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
314
+ return btmvis_coords
315
+
316
+ @st.cache_data(ttl=3600)
317
+ def biterm_bar(extype):
318
+ terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
319
+ btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
320
+ return btmvis_probs
321
+
322
+ with col1:
323
+ st.write('Perplexity score: ', perplexity)
324
+ st.write('')
325
+ numvis = st.selectbox(
326
+ 'Choose topic',
327
+ (totaltop), on_change=reset_biterm)
328
+ btmvis_coords = biterm_map(extype)
329
+ st.altair_chart(btmvis_coords)
330
+ with col2:
331
+ btmvis_probs = biterm_bar(extype)
332
+ st.altair_chart(btmvis_probs, use_container_width=True)
333
+
334
+ except ValueError:
335
+ st.error('🙇‍♂️ Please raise the number of topics and click submit')
336
+ except NameError:
337
+ st.warning('🖱️ Please click Submit')
338
+
339
+ with tab2:
340
+ st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
341
+ with tab3:
342
+ st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
343
+ st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
344
+ st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
345
+ st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
346
+
347
+ #===BERTopic===
348
+ elif method == 'BERTopic':
349
+ @st.cache_data(ttl=3600, show_spinner=False)
350
+ def bertopic_vis(extype):
351
+ umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
352
+ min_dist=0.0, metric='cosine', random_state=bert_random_state)
353
+ cluster_model = KMeans(n_clusters=num_topic)
354
+ if bert_embedding_model == 'all-MiniLM-L6-v2':
355
+ emb_mod = 'all-MiniLM-L6-v2'
356
+ lang = 'en'
357
+ elif bert_embedding_model == 'en_core_web_md':
358
+ emb_mod = en_core_web_md.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
359
+ lang = 'en'
360
+ elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
361
+ emb_mod = 'paraphrase-multilingual-MiniLM-L12-v2'
362
+ lang = 'multilingual'
363
+ topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
364
+ topics, probs = topic_model.fit_transform(topic_abs)
365
+ return topic_model, topics, probs
366
+
367
+ @st.cache_data(ttl=3600, show_spinner=False)
368
+ def Vis_Topics(extype):
369
+ fig1 = topic_model.visualize_topics()
370
+ return fig1
371
+
372
+ @st.cache_data(ttl=3600, show_spinner=False)
373
+ def Vis_Documents(extype):
374
+ fig2 = topic_model.visualize_documents(topic_abs)
375
+ return fig2
376
+
377
+ @st.cache_data(ttl=3600, show_spinner=False)
378
+ def Vis_Hierarchy(extype):
379
+ fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic)
380
+ return fig3
381
 
382
+ @st.cache_data(ttl=3600, show_spinner=False)
383
+ def Vis_Heatmap(extype):
384
+ global topic_model
385
+ fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000)
386
+ return fig4
 
 
 
 
387
 
388
+ @st.cache_data(ttl=3600, show_spinner=False)
389
+ def Vis_Barchart(extype):
390
+ fig5 = topic_model.visualize_barchart(top_n_topics=num_topic)
391
+ return fig5
392
+
393
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
394
+ with tab1:
395
+ try:
396
+ with st.spinner('Performing computations. Please wait ...'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
+ topic_model, topics, probs = bertopic_vis(extype)
399
+ time.sleep(.5)
400
+ st.toast('Visualize Topics', icon='🏃')
401
+ fig1 = Vis_Topics(extype)
402
+
403
+ time.sleep(.5)
404
+ st.toast('Visualize Document', icon='🏃')
405
+ fig2 = Vis_Documents(extype)
406
+
407
+ time.sleep(.5)
408
+ st.toast('Visualize Document Hierarchy', icon='🏃')
409
+ fig3 = Vis_Hierarchy(extype)
410
+
411
+ time.sleep(.5)
412
+ st.toast('Visualize Topic Similarity', icon='🏃')
413
+ fig4 = Vis_Heatmap(extype)
414
+
415
+ time.sleep(.5)
416
+ st.toast('Visualize Terms', icon='🏃')
417
+ fig5 = Vis_Barchart(extype)
418
+
419
+ with st.expander("Visualize Topics"):
420
+ st.write(fig1)
421
+ with st.expander("Visualize Terms"):
422
+ st.write(fig5)
423
+ with st.expander("Visualize Documents"):
424
+ st.write(fig2)
425
+ with st.expander("Visualize Document Hierarchy"):
426
+ st.write(fig3)
427
+ with st.expander("Visualize Topic Similarity"):
428
+ st.write(fig4)
429
+
430
+ except ValueError:
431
+ st.error('🙇‍♂️ Please raise the number of topics and click submit')
432
+
433
+ except NameError:
434
+ st.warning('🖱️ Please click Submit')
435
+
436
+ with tab2:
437
+ st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
438
+
439
+ with tab3:
440
+ st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
441
+ st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
442
 
443
+ except:
444
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
445
+ st.stop()