Spaces:
Running
Running
Update pages/2 Topic Modeling.py
Browse files- pages/2 Topic Modeling.py +312 -308
pages/2 Topic Modeling.py
CHANGED
@@ -87,14 +87,14 @@ def get_ext(uploaded_file):
|
|
87 |
#===clear cache===
|
88 |
|
89 |
def reset_biterm():
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
def reset_all():
|
97 |
-
|
98 |
|
99 |
#===avoiding deadlock===
|
100 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
@@ -121,321 +121,325 @@ def conv_txt(extype):
|
|
121 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
122 |
|
123 |
if uploaded_file is not None:
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
return
|
168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
177 |
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
|
191 |
-
elif method == 'Biterm':
|
192 |
-
btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
|
193 |
-
btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
|
194 |
-
elif method == 'BERTopic':
|
195 |
-
bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
|
196 |
-
bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
|
197 |
-
bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
|
198 |
-
bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
|
199 |
-
bert_embedding_model = st.radio(
|
200 |
-
"embedding_model",
|
201 |
-
["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_md"], index=0, horizontal=True)
|
202 |
-
else:
|
203 |
-
st.write('Please choose your preferred method')
|
204 |
-
if st.button("Submit", on_click=reset_all):
|
205 |
-
num_topic = num_cho
|
206 |
-
|
207 |
-
if method == 'BERTopic':
|
208 |
-
st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
|
209 |
-
|
210 |
-
#===topic===
|
211 |
-
if method == 'Choose...':
|
212 |
-
st.write('')
|
213 |
-
|
214 |
-
elif method == 'pyLDA':
|
215 |
-
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
216 |
-
|
217 |
-
with tab1:
|
218 |
-
#===visualization===
|
219 |
-
@st.cache_data(ttl=3600, show_spinner=False)
|
220 |
-
def pylda(extype):
|
221 |
-
topic_abs_LDA = [t.split(' ') for t in topic_abs]
|
222 |
-
id2word = Dictionary(topic_abs_LDA)
|
223 |
-
corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
|
224 |
-
#===LDA===
|
225 |
-
lda_model = LdaModel(corpus=corpus,
|
226 |
-
id2word=id2word,
|
227 |
-
num_topics=num_topic,
|
228 |
-
random_state=py_random_state,
|
229 |
-
chunksize=py_chunksize,
|
230 |
-
alpha='auto',
|
231 |
-
per_word_topics=True)
|
232 |
-
|
233 |
-
pprint(lda_model.print_topics())
|
234 |
-
doc_lda = lda_model[corpus]
|
235 |
-
|
236 |
-
#===visualization===
|
237 |
-
coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
|
238 |
-
coherence_lda = coherence_model_lda.get_coherence()
|
239 |
-
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
|
240 |
-
py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
|
241 |
-
return py_lda_vis_html, coherence_lda, vis
|
242 |
-
|
243 |
-
with st.spinner('Performing computations. Please wait ...'):
|
244 |
-
try:
|
245 |
py_lda_vis_html, coherence_lda, vis = pylda(extype)
|
246 |
st.write('Coherence score: ', coherence_lda)
|
247 |
components.html(py_lda_vis_html, width=1500, height=800)
|
248 |
st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
|
249 |
-
|
250 |
@st.cache_data(ttl=3600, show_spinner=False)
|
251 |
def img_lda(vis):
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
img_lda(vis)
|
262 |
with open("ldavis_img.png", "rb") as file:
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
st.warning('🖱️ Please click Submit')
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
@st.cache_data(ttl=3600, show_spinner=False)
|
378 |
-
def Vis_Hierarchy(extype):
|
379 |
-
fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic)
|
380 |
-
return fig3
|
381 |
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
return fig5
|
392 |
-
|
393 |
-
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
394 |
-
with tab1:
|
395 |
-
try:
|
396 |
-
with st.spinner('Performing computations. Please wait ...'):
|
397 |
-
|
398 |
-
topic_model, topics, probs = bertopic_vis(extype)
|
399 |
-
time.sleep(.5)
|
400 |
-
st.toast('Visualize Topics', icon='🏃')
|
401 |
-
fig1 = Vis_Topics(extype)
|
402 |
-
|
403 |
-
time.sleep(.5)
|
404 |
-
st.toast('Visualize Document', icon='🏃')
|
405 |
-
fig2 = Vis_Documents(extype)
|
406 |
-
|
407 |
-
time.sleep(.5)
|
408 |
-
st.toast('Visualize Document Hierarchy', icon='🏃')
|
409 |
-
fig3 = Vis_Hierarchy(extype)
|
410 |
-
|
411 |
-
time.sleep(.5)
|
412 |
-
st.toast('Visualize Topic Similarity', icon='🏃')
|
413 |
-
fig4 = Vis_Heatmap(extype)
|
414 |
-
|
415 |
-
time.sleep(.5)
|
416 |
-
st.toast('Visualize Terms', icon='🏃')
|
417 |
-
fig5 = Vis_Barchart(extype)
|
418 |
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
with tab3:
|
440 |
-
st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
|
441 |
-
st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
|
|
|
87 |
#===clear cache===
|
88 |
|
89 |
def reset_biterm():
    """Invalidate the cached Biterm charts.

    Used as the ``on_change`` callback of the 'Choose topic' selectbox. The
    chart builders are cached with ``extype`` as their only argument while the
    selected topic is read from the enclosing scope, so their caches have to
    be cleared by hand when another topic is picked.
    """
    try:
        biterm_map.clear()
        biterm_bar.clear()
    except NameError:
        # A chart builder is not defined: clear the Biterm model cache instead.
        biterm_topic.clear()
|
95 |
|
96 |
def reset_all():
    """Clear every ``st.cache_data`` entry.

    Wired as the ``on_change`` / ``on_click`` callback of the file uploader,
    the option widgets and the Submit button.
    """
    st.cache_data.clear()
|
98 |
|
99 |
#===avoiding deadlock===
|
100 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
121 |
uploaded_file = st.file_uploader('', type=['csv', 'txt'], on_change=reset_all)
|
122 |
|
123 |
if uploaded_file is not None:
|
124 |
+
try:
|
125 |
+
extype = get_ext(uploaded_file)
|
126 |
+
|
127 |
+
if extype.endswith('.csv'):
|
128 |
+
papers = upload(extype)
|
129 |
+
elif extype.endswith('.txt'):
|
130 |
+
papers = conv_txt(extype)
|
131 |
+
|
132 |
+
coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
133 |
+
|
134 |
+
c1, c2 = st.columns([3,4])
|
135 |
+
method = c1.selectbox(
|
136 |
+
'Choose method',
|
137 |
+
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
|
138 |
+
num_cho = c1.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
139 |
+
ColCho = c2.selectbox(
|
140 |
+
'Choose column',
|
141 |
+
(coldf), on_change=reset_all)
|
142 |
+
words_to_remove = c2.text_input("Remove specific words. Separate words by semicolons (;)")
|
143 |
+
rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
|
144 |
+
rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
|
145 |
+
|
146 |
+
#===clean csv===
|
147 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def clean_csv(extype):
    """Pre-process the selected text column for topic modeling.

    Rows with a missing value in ``ColCho`` are dropped; the text is then
    lower-cased, optionally stripped of punctuation and of copyright
    statements, filtered against the English stop-word list, lemmatized and
    finally purged of the user-supplied words.

    Args:
        extype: Not read inside the function; it only serves as the
            ``st.cache_data`` key. The actual inputs (``papers``, ``ColCho``,
            ``rem_punc``, ``rem_copyright``, ``words_to_remove``) are taken
            from the enclosing scope and are therefore not part of the key.

    Returns:
        tuple: ``(topic_abs, paper)`` - the list of cleaned documents and the
        data frame holding the intermediate ``Abstract_*`` columns.
    """
    # Work on an explicit copy so the column assignments below never target a
    # view of ``papers``.
    paper = papers.dropna(subset=[ColCho]).copy()

    #===mapping===
    paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
    if rem_punc:
        # Raw string: the former '\.' in a plain literal is an invalid escape
        # sequence. The pattern itself is unchanged.
        # NOTE(review): inside the class, `!-?` is a character range
        # (U+0021..U+003F), so digits, quotes, brackets, etc. are replaced as
        # well - confirm whether a literal hyphen was intended.
        punctuation = re.compile(r'[,:;\.!-?•=]')
        paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: punctuation.sub(' ', x))
        # Curly double quotes are removed outright rather than replaced by a space.
        paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
    if rem_copyright:
        # Drop everything from the copyright sign to the end of the line.
        paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))

    #===stopword removal===
    # A set gives O(1) membership tests instead of scanning the list per word.
    stop = set(stopwords.words('english'))
    paper['Abstract_stop'] = paper['Abstract_pre'].apply(
        lambda x: ' '.join([word for word in x.split() if word not in stop]))

    #===lemmatize===
    lemmatizer = WordNetLemmatizer()

    def lemmatize_words(text):
        """Lemmatize every whitespace-separated token of ``text``."""
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)

    # Words typed by the user, separated by semicolons.
    words_rmv = {word.strip() for word in words_to_remove.split(";")}

    def remove_words(text):
        """Drop every token of ``text`` that the user asked to remove."""
        return ' '.join(word for word in text.split() if word not in words_rmv)

    paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)

    topic_abs = paper.Abstract_lem.values.tolist()
    return topic_abs, paper
|
181 |
+
|
182 |
+
d1, d2 = st.columns([7,3])
|
183 |
+
d2.info("Don't do anything during the computing", icon="⚠️")
|
184 |
+
topic_abs, paper=clean_csv(extype)
|
185 |
+
|
186 |
+
#===advance settings===
|
187 |
+
with d1.expander("🧮 Show advance settings"):
|
188 |
+
t1, t2 = st.columns([5,5])
|
189 |
+
if method == 'pyLDA':
|
190 |
+
py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
|
191 |
+
py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
|
192 |
+
elif method == 'Biterm':
|
193 |
+
btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
|
194 |
+
btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
|
195 |
+
elif method == 'BERTopic':
|
196 |
+
bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
|
197 |
+
bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
|
198 |
+
bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
|
199 |
+
bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
|
200 |
+
bert_embedding_model = st.radio(
|
201 |
+
"embedding_model",
|
202 |
+
["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_md"], index=0, horizontal=True)
|
203 |
+
else:
|
204 |
+
st.write('Please choose your preferred method')
|
205 |
+
if st.button("Submit", on_click=reset_all):
|
206 |
+
num_topic = num_cho
|
207 |
+
|
208 |
+
if method == 'BERTopic':
|
209 |
+
st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
|
210 |
+
|
211 |
+
#===topic===
|
212 |
+
if method == 'Choose...':
|
213 |
+
st.write('')
|
214 |
+
|
215 |
+
elif method == 'pyLDA':
|
216 |
+
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
217 |
|
218 |
+
with tab1:
|
219 |
+
#===visualization===
|
220 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def pylda(extype):
    """Fit an LDA model on the cleaned documents and prepare its pyLDAvis view.

    Args:
        extype: Not read inside the function; it only serves as the
            ``st.cache_data`` key. The documents (``topic_abs``) and the model
            settings (``num_topic``, ``py_random_state``, ``py_chunksize``)
            come from the enclosing scope.

    Returns:
        tuple: ``(py_lda_vis_html, coherence_lda, vis)`` - the visualization
        as an HTML string, the ``c_v`` coherence score and the prepared
        pyLDAvis data.
    """
    # Documents are split on single spaces to obtain the token lists.
    topic_abs_LDA = [t.split(' ') for t in topic_abs]
    id2word = Dictionary(topic_abs_LDA)
    corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]

    #===LDA===
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topic,
                         random_state=py_random_state,
                         chunksize=py_chunksize,
                         alpha='auto',
                         per_word_topics=True)

    # Topics are echoed to standard output.
    pprint(lda_model.print_topics())

    #===visualization===
    coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA,
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
    return py_lda_vis_html, coherence_lda, vis
|
243 |
+
|
244 |
+
with st.spinner('Performing computations. Please wait ...'):
|
245 |
+
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
py_lda_vis_html, coherence_lda, vis = pylda(extype)
|
247 |
st.write('Coherence score: ', coherence_lda)
|
248 |
components.html(py_lda_vis_html, width=1500, height=800)
|
249 |
st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
|
250 |
+
|
251 |
@st.cache_data(ttl=3600, show_spinner=False)
def img_lda(vis):
    """Save the pyLDAvis view as ``ldavis_img.png``.

    The prepared data is first written to ``output.html``; that page is then
    captured with Html2Image at 1500x800 on a white background. Nothing is
    returned - the PNG is read back from disk by the caller.

    Args:
        vis: Prepared pyLDAvis data, as returned by ``pylda``.
    """
    pyLDAvis.save_html(vis, 'output.html')

    shooter = Html2Image()
    shooter.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
    shooter.screenshot(
        other_file='output.html',
        css_str="body {background: white;}",
        size=(1500, 800),
        save_as='ldavis_img.png',
    )
|
261 |
+
|
262 |
img_lda(vis)
|
263 |
with open("ldavis_img.png", "rb") as file:
|
264 |
+
btn = st.download_button(
|
265 |
+
label="Download image",
|
266 |
+
data=file,
|
267 |
+
file_name="ldavis_img.png",
|
268 |
+
mime="image/png"
|
269 |
+
)
|
270 |
+
|
271 |
+
except NameError:
|
272 |
st.warning('🖱️ Please click Submit')
|
273 |
+
|
274 |
+
with tab2:
|
275 |
+
st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
|
276 |
+
|
277 |
+
with tab3:
|
278 |
+
st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
|
279 |
+
st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
|
280 |
+
st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
|
281 |
+
st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
|
282 |
+
|
283 |
+
#===Biterm===
|
284 |
+
elif method == 'Biterm':
|
285 |
+
|
286 |
+
#===optimize Biterm===
|
287 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def biterm_topic(extype):
    """Fit a Biterm topic model on the cleaned documents.

    Args:
        extype: Not read inside the function; it only serves as the
            ``st.cache_data`` key. The documents (``topic_abs``) and the model
            settings (``btm_seed``, ``num_topic``, ``btm_iterations``) come
            from the enclosing scope.

    Returns:
        tuple: ``(topics_coords, phi, totaltop, perplexity)`` - the topic
        coordinates, the result of ``tmp.get_phi``, the list of topic labels
        and the model perplexity.
    """
    X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
    docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
    biterms = btm.get_biterms(docs_vec)

    # NOTE(review): alpha is fixed at 50/8 whatever ``num_topic`` is - confirm
    # whether it was meant to scale with the number of topics.
    model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
    model.fit(biterms, iterations=btm_iterations)

    # The return value is unused, but the call is kept: the metrics read
    # below presumably rely on the model having transformed the documents -
    # TODO confirm.
    p_zd = model.transform(docs_vec)
    # NOTE(review): unused; kept in case reading the attribute matters -
    # confirm and drop.
    coherence = model.coherence_

    phi = tmp.get_phi(model)
    topics_coords = tmp.prepare_coords(model)
    totaltop = topics_coords.label.values.tolist()
    perplexity = model.perplexity_
    return topics_coords, phi, totaltop, perplexity
|
303 |
+
|
304 |
+
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
305 |
+
with tab1:
|
306 |
+
try:
|
307 |
+
with st.spinner('Performing computations. Please wait ...'):
|
308 |
+
topics_coords, phi, totaltop, perplexity = biterm_topic(extype)
|
309 |
+
col1, col2 = st.columns([4,6])
|
310 |
+
|
311 |
+
@st.cache_data(ttl=3600)
def biterm_map(extype):
    """Build the scatter chart of topics with the selected topic ``numvis``.

    ``extype`` is only the cache key; ``topics_coords`` and ``numvis`` are
    read from the enclosing scope.
    """
    return tmp.plot_scatter_topics(
        topics_coords, size_col='size', label_col='label', topic=numvis)
|
315 |
+
|
316 |
+
@st.cache_data(ttl=3600)
def biterm_bar(extype):
    """Build the term chart for the selected topic ``numvis``.

    ``extype`` is only the cache key; ``phi`` and ``numvis`` are read from the
    enclosing scope.
    """
    ratios = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
    return tmp.plot_terms(ratios, font_size=12)
|
321 |
+
|
322 |
+
with col1:
|
323 |
+
st.write('Perplexity score: ', perplexity)
|
324 |
+
st.write('')
|
325 |
+
numvis = st.selectbox(
|
326 |
+
'Choose topic',
|
327 |
+
(totaltop), on_change=reset_biterm)
|
328 |
+
btmvis_coords = biterm_map(extype)
|
329 |
+
st.altair_chart(btmvis_coords)
|
330 |
+
with col2:
|
331 |
+
btmvis_probs = biterm_bar(extype)
|
332 |
+
st.altair_chart(btmvis_probs, use_container_width=True)
|
333 |
+
|
334 |
+
except ValueError:
|
335 |
+
st.error('🙇♂️ Please raise the number of topics and click submit')
|
336 |
+
except NameError:
|
337 |
+
st.warning('🖱️ Please click Submit')
|
338 |
+
|
339 |
+
with tab2:
|
340 |
+
st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
|
341 |
+
with tab3:
|
342 |
+
st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
|
343 |
+
st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
|
344 |
+
st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
|
345 |
+
st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
|
346 |
+
|
347 |
+
#===BERTopic===
|
348 |
+
elif method == 'BERTopic':
|
349 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def bertopic_vis(extype):
    """Fit a BERTopic model on the cleaned documents.

    Args:
        extype: Not read inside the function; it only serves as the
            ``st.cache_data`` key. The documents (``topic_abs``) and every
            ``bert_*`` setting come from the enclosing scope.

    Returns:
        tuple: ``(topic_model, topics, probs)`` - the fitted model and the
        two values returned by ``fit_transform``.
    """
    reducer = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
                   min_dist=0.0, metric='cosine', random_state=bert_random_state)
    # NOTE(review): KMeans receives no random_state, unlike UMAP above.
    clusterer = KMeans(n_clusters=num_topic)

    # Resolve the embedding backend and the matching BERTopic language.
    if bert_embedding_model == 'en_core_web_md':
        emb_mod = en_core_web_md.load(
            exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
        lang = 'en'
    elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
        emb_mod = 'paraphrase-multilingual-MiniLM-L12-v2'
        lang = 'multilingual'
    elif bert_embedding_model == 'all-MiniLM-L6-v2':
        emb_mod = 'all-MiniLM-L6-v2'
        lang = 'en'

    # The KMeans instance is passed in the ``hdbscan_model`` slot.
    topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=clusterer,
                           language=lang, umap_model=reducer,
                           top_n_words=bert_top_n_words)
    topics, probs = topic_model.fit_transform(topic_abs)
    return topic_model, topics, probs
|
366 |
+
|
367 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Topics(extype):
    """Return the ``visualize_topics`` figure of the fitted ``topic_model``.

    ``extype`` is only the cache key; ``topic_model`` is read from the
    enclosing scope.
    """
    return topic_model.visualize_topics()
|
371 |
+
|
372 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Documents(extype):
    """Return the ``visualize_documents`` figure for the cleaned documents.

    ``extype`` is only the cache key; ``topic_model`` and ``topic_abs`` are
    read from the enclosing scope.
    """
    return topic_model.visualize_documents(topic_abs)
|
376 |
+
|
377 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Hierarchy(extype):
    """Return the ``visualize_hierarchy`` figure limited to ``num_topic`` topics.

    ``extype`` is only the cache key; ``topic_model`` and ``num_topic`` are
    read from the enclosing scope.
    """
    return topic_model.visualize_hierarchy(top_n_topics=num_topic)
|
381 |
|
382 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Heatmap(extype):
    """Return the 1000x1000 ``visualize_heatmap`` figure of the fitted model.

    ``extype`` is only the cache key; ``topic_model`` and ``num_topic`` are
    read from the enclosing scope. The heatmap is requested with
    ``num_topic - 1`` clusters.
    """
    # ``topic_model`` is only read here, so no ``global`` declaration is
    # needed; it is looked up exactly as in the other ``Vis_*`` helpers.
    return topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000)
|
|
|
|
|
|
|
|
|
387 |
|
388 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Barchart(extype):
    """Return the ``visualize_barchart`` figure limited to ``num_topic`` topics.

    ``extype`` is only the cache key; ``topic_model`` and ``num_topic`` are
    read from the enclosing scope.
    """
    return topic_model.visualize_barchart(top_n_topics=num_topic)
|
392 |
+
|
393 |
+
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
|
394 |
+
with tab1:
|
395 |
+
try:
|
396 |
+
with st.spinner('Performing computations. Please wait ...'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
|
398 |
+
topic_model, topics, probs = bertopic_vis(extype)
|
399 |
+
time.sleep(.5)
|
400 |
+
st.toast('Visualize Topics', icon='🏃')
|
401 |
+
fig1 = Vis_Topics(extype)
|
402 |
+
|
403 |
+
time.sleep(.5)
|
404 |
+
st.toast('Visualize Document', icon='🏃')
|
405 |
+
fig2 = Vis_Documents(extype)
|
406 |
+
|
407 |
+
time.sleep(.5)
|
408 |
+
st.toast('Visualize Document Hierarchy', icon='🏃')
|
409 |
+
fig3 = Vis_Hierarchy(extype)
|
410 |
+
|
411 |
+
time.sleep(.5)
|
412 |
+
st.toast('Visualize Topic Similarity', icon='🏃')
|
413 |
+
fig4 = Vis_Heatmap(extype)
|
414 |
+
|
415 |
+
time.sleep(.5)
|
416 |
+
st.toast('Visualize Terms', icon='🏃')
|
417 |
+
fig5 = Vis_Barchart(extype)
|
418 |
+
|
419 |
+
with st.expander("Visualize Topics"):
|
420 |
+
st.write(fig1)
|
421 |
+
with st.expander("Visualize Terms"):
|
422 |
+
st.write(fig5)
|
423 |
+
with st.expander("Visualize Documents"):
|
424 |
+
st.write(fig2)
|
425 |
+
with st.expander("Visualize Document Hierarchy"):
|
426 |
+
st.write(fig3)
|
427 |
+
with st.expander("Visualize Topic Similarity"):
|
428 |
+
st.write(fig4)
|
429 |
+
|
430 |
+
except ValueError:
|
431 |
+
st.error('🙇♂️ Please raise the number of topics and click submit')
|
432 |
+
|
433 |
+
except NameError:
|
434 |
+
st.warning('🖱️ Please click Submit')
|
435 |
+
|
436 |
+
with tab2:
|
437 |
+
st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
|
438 |
+
|
439 |
+
with tab3:
|
440 |
+
st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
|
441 |
+
st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
|
442 |
|
443 |
+
except:
|
444 |
+
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
445 |
+
st.stop()
|
|
|
|
|
|