faizhalas committed on
Commit
c2c9bab
•
1 Parent(s): e951f20

Create 2 Topic Modeling.py

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +329 -0
pages/2 Topic Modeling.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import module
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import re
6
+ import nltk
7
+ nltk.download('wordnet')
8
+ from nltk.stem import WordNetLemmatizer
9
+ nltk.download('stopwords')
10
+ from nltk.corpus import stopwords
11
+ import gensim
12
+ import gensim.corpora as corpora
13
+ from gensim.corpora import Dictionary
14
+ from gensim.models.coherencemodel import CoherenceModel
15
+ from gensim.models.ldamodel import LdaModel
16
+ from pprint import pprint
17
+ import pickle
18
+ import pyLDAvis
19
+ import pyLDAvis.gensim_models as gensimvis
20
+ import matplotlib.pyplot as plt
21
+ import streamlit.components.v1 as components
22
+ from io import StringIO
23
+ from ipywidgets.embed import embed_minimal_html
24
+ from nltk.stem.snowball import SnowballStemmer
25
+ from bertopic import BERTopic
26
+ import plotly.express as px
27
+ from sklearn.cluster import KMeans
28
+ import bitermplus as btm
29
+ import tmplot as tmp
30
+ import tomotopy
31
+ import sys
32
+ import spacy
33
+ import en_core_web_sm
34
+ import pipeline
35
+
36
+
37
+ #===config===
38
# Configure the browser tab (title, icon, wide layout) and render the page
# headings shown above the file uploader.
st.set_page_config(
    page_title="Coconut",
    page_icon="🥥",  # repaired: the literal was mis-encoded UTF-8
    layout="wide",
)
st.header("Topic Modeling")
st.subheader('Put your file here...')
45
+
46
+ #========unique id========
47
@st.cache_resource(ttl=3600)
def create_list():
    """Return the list whose first element is used as a run counter.

    The result is stored through ``st.cache_resource`` (ttl=3600), and the
    statements below mutate that list in place.
    """
    return [1, 2, 3]


# Bump the counter held in the cached list and expose it as a string; get_ext
# prefixes it to the uploaded file name.
l = create_list()
first_list_value = l[0]
l[0] += 1
uID = str(l[0])
56
+
57
@st.cache_data(ttl=3600)
def get_ext(uploaded_file):
    """Return the module-level ``uID`` followed by the upload's file name.

    The returned string is passed to the other cached functions in this file
    as their (only) argument, and its suffix is used to detect the file type.
    """
    return uID + uploaded_file.name
61
+
62
+ #===clear cache===
63
+
64
def reset_biterm():
    """Clear the cached Biterm charts, or the cached Biterm model as a fallback.

    ``biterm_map`` and ``biterm_bar`` are only defined inside the Biterm
    branch of the script; if either name is missing, the NameError is caught
    and ``biterm_topic`` is cleared instead.
    """
    try:
        # Cleared one after the other, so the first may already be cleared
        # when the second lookup fails.
        biterm_map.clear()
        biterm_bar.clear()
    except NameError:
        biterm_topic.clear()
70
+
71
def reset_all():
    """Clear every ``st.cache_data`` entry; used as an ``on_change`` callback."""
    st.cache_data.clear()
73
+
74
+ #===clean csv===
75
@st.cache_data(ttl=3600, show_spinner=False)
def clean_csv(extype):
    """Clean the uploaded abstracts and return them ready for topic modeling.

    The data is read from the module-level ``papers`` DataFrame; ``extype`` is
    not used in the body and only acts as the cache key.

    Returns:
        tuple: ``(topic_abs, paper)`` where ``topic_abs`` is the list of
        lemmatized abstracts and ``paper`` is the filtered DataFrame with the
        intermediate ``Abstract_pre``, ``Abstract_stop`` and ``Abstract_lem``
        columns added.

    If ``papers`` has no ``Abstract`` column, an error is shown and the
    script run is stopped.
    """
    try:
        paper = papers.dropna(subset=['Abstract'])
    except KeyError:
        st.error('Error: Please check your Abstract column.')
        # Stop this script run through Streamlit instead of sys.exit().
        st.stop()

    # Drop placeholder rows. The final .copy() makes the column assignments
    # below write to an independent frame rather than a filtered view.
    paper = paper[~paper.Abstract.str.contains("No abstract available")]
    paper = paper[~paper.Abstract.str.contains("STRAIT")].copy()

    #===mapping===
    # NOTE(review): inside the character class ``!-?`` is a range ('!' through
    # '?'), so digits and most ASCII punctuation are removed as well --
    # confirm that this is intended before narrowing it.
    punctuation = re.compile(r'[,:;\.!-?•=]')
    # Everything from the copyright sign to the end of the abstract is dropped.
    copyright_tail = re.compile(r'©.*')
    paper['Abstract_pre'] = (
        paper['Abstract']
        .map(lambda text: punctuation.sub('', text))
        .map(lambda text: text.lower())
        .map(lambda text: copyright_tail.sub('', text))
    )

    #===stopword removal===
    # A set gives constant-time membership tests for every word.
    stop = set(stopwords.words('english'))
    paper['Abstract_stop'] = paper['Abstract_pre'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop)
    )

    #===lemmatize===
    lemmatizer = WordNetLemmatizer()

    def lemmatize_words(text):
        """Lemmatize each whitespace-separated word of ``text``."""
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)

    topic_abs = paper.Abstract_lem.values.tolist()
    return topic_abs, paper
104
+
105
+ #===upload file===
106
+ @st.cache_data(ttl=3600)
107
+ def upload(file):
108
+ papers = pd.read_csv(uploaded_file)
109
+ return papers
110
+
111
+ @st.cache_data(ttl=3600)
112
+ def conv_txt(extype):
113
+ col_dict = {'TI': 'Title',
114
+ 'SO': 'Source title',
115
+ 'DT': 'Document Type',
116
+ 'AB': 'Abstract',
117
+ 'PY': 'Year'}
118
+ papers = pd.read_csv(uploaded_file, sep='\t', lineterminator='\r')
119
+ papers.rename(columns=col_dict, inplace=True)
120
+ return papers
121
+
122
+
123
+ #===Read data===
124
# Main page flow: read the upload, clean it, let the user pick a topic-modeling
# method, then render that method's visualization and reading-list tabs.
# The cached helpers below take ``extype`` purely as a cache key and read their
# real inputs (topic_abs, paper, slider values) from the enclosing script; the
# widgets clear the caches through their on_change callbacks.
uploaded_file = st.file_uploader("Choose a file", type=['csv', 'txt'], on_change=reset_all)

if uploaded_file is not None:
    extype = get_ext(uploaded_file)

    # Compare case-insensitively so an upper-case extension is routed the same.
    if extype.lower().endswith('.csv'):
        papers = upload(extype)
    elif extype.lower().endswith('.txt'):
        papers = conv_txt(extype)

    topic_abs, paper = clean_csv(extype)
    method = st.selectbox(
        'Choose method',
        ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)

    #===topic===
    if method == 'Choose...':
        st.write('')

    elif method == 'pyLDA':
        num_topic = st.slider('Choose number of topics', min_value=2, max_value=15, step=1, on_change=reset_all)

        @st.cache_data(ttl=3600, show_spinner=False)
        def pylda(extype):
            """Fit LDA on ``topic_abs``; return (pyLDAvis HTML, c_v coherence)."""
            topic_abs_LDA = [t.split(' ') for t in topic_abs]
            id2word = Dictionary(topic_abs_LDA)
            corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
            #===LDA===
            lda_model = LdaModel(corpus=corpus,
                                 id2word=id2word,
                                 num_topics=num_topic,
                                 random_state=0,
                                 chunksize=100,
                                 alpha='auto',
                                 per_word_topics=True)

            # Printed to the server's stdout, not to the page.
            pprint(lda_model.print_topics())

            #===visualization===
            coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
            coherence_lda = coherence_model_lda.get_coherence()
            vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
            py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
            return py_lda_vis_html, coherence_lda

        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization & Calculate coherence", "📃 Reference", "📓 Recommended Reading"])

        with tab1:
            #===visualization===
            with st.spinner('Calculating and Creating pyLDAvis Visualization ...'):
                py_lda_vis_html, coherence_lda = pylda(extype)
                st.write('Coherence: ', (coherence_lda))
                components.html(py_lda_vis_html, width=1700, height=800)
                st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')

        with tab2:
            st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')

        with tab3:
            st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
            st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
            st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
            st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')

    #===Biterm===
    elif method == 'Biterm':
        num_bitopic = st.slider('Choose number of topics', min_value=2, max_value=20, step=1, on_change=reset_all)

        #===optimize Biterm===
        @st.cache_data(ttl=3600)
        def biterm_topic(extype):
            """Fit a Biterm model on ``topic_abs``; return (topics_coords, phi, totaltop)."""
            X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
            docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
            biterms = btm.get_biterms(docs_vec)
            model = btm.BTM(
                X, vocabulary, seed=12321, T=num_bitopic, M=20, alpha=50/8, beta=0.01)
            model.fit(biterms, iterations=20)
            # The return value is unused, but the call is kept.
            # NOTE(review): presumably tmplot reads the document-topic matrix
            # that transform() stores on the model -- confirm before removing.
            model.transform(docs_vec)
            phi = tmp.get_phi(model)
            topics_coords = tmp.prepare_coords(model)
            totaltop = topics_coords.label.values.tolist()
            return topics_coords, phi, totaltop

        try:
            topics_coords, phi, totaltop = biterm_topic(extype)
            tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
            with tab1:
                col1, col2 = st.columns(2)

                # Both chart helpers read ``numvis``, which is assigned by the
                # selectbox below before either helper is called.
                @st.cache_data(ttl=3600)
                def biterm_map(extype):
                    """Return the scatter chart of topics with ``numvis`` highlighted."""
                    btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
                    return btmvis_coords

                @st.cache_data(ttl=3600)
                def biterm_bar(extype):
                    """Return the term chart for topic ``numvis``."""
                    terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
                    btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
                    return btmvis_probs

                with col1:
                    numvis = st.selectbox(
                        'Choose topic',
                        (totaltop), on_change=reset_biterm)
                    btmvis_coords = biterm_map(extype)
                    st.altair_chart(btmvis_coords, use_container_width=True)
                with col2:
                    btmvis_probs = biterm_bar(extype)
                    st.altair_chart(btmvis_probs, use_container_width=True)

            with tab2:
                st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
            with tab3:
                st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
                st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
                st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
                st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')

        except ValueError:
            st.error('Please raise the number of topics')

    #===BERTopic===
    elif method == 'BERTopic':
        num_btopic = st.slider('Choose number of topics', min_value=4, max_value=20, step=1, on_change=reset_all)

        @st.cache_data(ttl=3600)
        def bertopic_vis(extype):
            """Fit BERTopic on ``topic_abs``; return (topic_model, topic_time, topics, probs)."""
            topic_time = paper.Year.values.tolist()
            cluster_model = KMeans(n_clusters=num_btopic)
            nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
            topic_model = BERTopic(embedding_model=nlp, hdbscan_model=cluster_model, language="multilingual")
            # Fit exactly once: the original chained .fit() and then called
            # fit_transform(), training the model twice on the same documents.
            topics, probs = topic_model.fit_transform(topic_abs)
            return topic_model, topic_time, topics, probs

        # Each helper below renders one BERTopic figure from ``topic_model``,
        # which is assigned in tab1 before any of them is called.
        @st.cache_data(ttl=3600)
        def Vis_Topics(extype):
            """Return the topic overview figure."""
            fig1 = topic_model.visualize_topics()
            return fig1

        @st.cache_data(ttl=3600)
        def Vis_Documents(extype):
            """Return the document figure for ``topic_abs``."""
            fig2 = topic_model.visualize_documents(topic_abs)
            return fig2

        @st.cache_data(ttl=3600)
        def Vis_Hierarchy(extype):
            """Return the topic hierarchy figure."""
            fig3 = topic_model.visualize_hierarchy(top_n_topics=num_btopic)
            return fig3

        @st.cache_data(ttl=3600)
        def Vis_Heatmap(extype):
            """Return the topic similarity heatmap."""
            fig4 = topic_model.visualize_heatmap(n_clusters=num_btopic-1, width=1000, height=1000)
            return fig4

        @st.cache_data(ttl=3600)
        def Vis_Barchart(extype):
            """Return the per-topic term bar charts."""
            fig5 = topic_model.visualize_barchart(top_n_topics=num_btopic, n_words=10)
            return fig5

        @st.cache_data(ttl=3600)
        def Vis_ToT(extype):
            """Return the topics-over-time figure built from the ``Year`` column."""
            topics_over_time = topic_model.topics_over_time(topic_abs, topic_time)
            fig6 = topic_model.visualize_topics_over_time(topics_over_time)
            return fig6

        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
        with tab1:
            topic_model, topic_time, topics, probs = bertopic_vis(extype)
            #===visualization===
            viz = st.selectbox(
                'Choose visualization',
                ('Visualize Topics', 'Visualize Documents', 'Visualize Document Hierarchy', 'Visualize Topic Similarity', 'Visualize Terms', 'Visualize Topics over Time'))

            if viz == 'Visualize Topics':
                fig1 = Vis_Topics(extype)
                st.write(fig1)

            elif viz == 'Visualize Documents':
                fig2 = Vis_Documents(extype)
                st.write(fig2)

            elif viz == 'Visualize Document Hierarchy':
                fig3 = Vis_Hierarchy(extype)
                st.write(fig3)

            elif viz == 'Visualize Topic Similarity':
                fig4 = Vis_Heatmap(extype)
                st.write(fig4)

            elif viz == 'Visualize Terms':
                fig5 = Vis_Barchart(extype)
                st.write(fig5)

            elif viz == 'Visualize Topics over Time':
                fig6 = Vis_ToT(extype)
                st.write(fig6)

        with tab2:
            st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')

        with tab3:
            st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
            st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')