Spaces:
Running
Running
adding advance setting
Browse files- pages/2 Topic Modeling.py +80 -52
pages/2 Topic Modeling.py
CHANGED
@@ -101,6 +101,14 @@ def clean_csv(extype):
|
|
101 |
words = [lemmatizer.lemmatize(word) for word in words]
|
102 |
return ' '.join(words)
|
103 |
paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
topic_abs = paper.Abstract_lem.values.tolist()
|
106 |
return topic_abs, paper
|
@@ -134,14 +142,38 @@ if uploaded_file is not None:
|
|
134 |
elif extype.endswith('.txt'):
|
135 |
papers = conv_txt(extype)
|
136 |
|
137 |
-
|
138 |
-
c1, c2 = st.columns([5,5])
|
139 |
method = c1.selectbox(
|
140 |
'Choose method',
|
141 |
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
|
142 |
-
c1.info("Don't do anything during the computing", icon="⚠️")
|
143 |
num_cho = c2.number_input('Choose number of topics', min_value=2, max_value=30, value=2)
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
num_topic = num_cho
|
146 |
|
147 |
#===topic===
|
@@ -149,7 +181,7 @@ if uploaded_file is not None:
|
|
149 |
st.write('')
|
150 |
|
151 |
elif method == 'pyLDA':
|
152 |
-
tab1, tab2, tab3 = st.tabs(["๐ Generate visualization
|
153 |
|
154 |
with tab1:
|
155 |
#===visualization===
|
@@ -162,8 +194,8 @@ if uploaded_file is not None:
|
|
162 |
lda_model = LdaModel(corpus=corpus,
|
163 |
id2word=id2word,
|
164 |
num_topics=num_topic,
|
165 |
-
random_state=
|
166 |
-
chunksize=
|
167 |
alpha='auto',
|
168 |
per_word_topics=True)
|
169 |
|
@@ -180,7 +212,7 @@ if uploaded_file is not None:
|
|
180 |
with st.spinner('Performing computations. Please wait ...'):
|
181 |
try:
|
182 |
py_lda_vis_html, coherence_lda, vis = pylda(extype)
|
183 |
-
st.write('Coherence: ',
|
184 |
st.components.v1.html(py_lda_vis_html, width=1500, height=800)
|
185 |
st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
|
186 |
|
@@ -228,20 +260,21 @@ if uploaded_file is not None:
|
|
228 |
docs_lens = list(map(len, docs_vec))
|
229 |
biterms = btm.get_biterms(docs_vec)
|
230 |
model = btm.BTM(
|
231 |
-
X, vocabulary, seed=
|
232 |
-
model.fit(biterms, iterations=
|
233 |
p_zd = model.transform(docs_vec)
|
234 |
coherence = model.coherence_
|
235 |
phi = tmp.get_phi(model)
|
236 |
topics_coords = tmp.prepare_coords(model)
|
237 |
totaltop = topics_coords.label.values.tolist()
|
238 |
-
|
|
|
239 |
|
240 |
tab1, tab2, tab3 = st.tabs(["๐ Generate visualization", "๐ Reference", "๐ Recommended Reading"])
|
241 |
with tab1:
|
242 |
try:
|
243 |
with st.spinner('Performing computations. Please wait ...'):
|
244 |
-
topics_coords, phi, totaltop = biterm_topic(extype)
|
245 |
col1, col2 = st.columns([4,6])
|
246 |
|
247 |
@st.cache_data(ttl=3600)
|
@@ -256,6 +289,8 @@ if uploaded_file is not None:
|
|
256 |
return btmvis_probs
|
257 |
|
258 |
with col1:
|
|
|
|
|
259 |
numvis = st.selectbox(
|
260 |
'Choose topic',
|
261 |
(totaltop), on_change=reset_biterm)
|
@@ -282,12 +317,22 @@ if uploaded_file is not None:
|
|
282 |
elif method == 'BERTopic':
|
283 |
@st.cache_data(ttl=3600, show_spinner=False)
|
284 |
def bertopic_vis(extype):
|
|
|
|
|
285 |
topic_time = paper.Year.values.tolist()
|
286 |
-
umap_model = UMAP(n_neighbors=
|
287 |
-
min_dist=0.0, metric='cosine', random_state=
|
288 |
cluster_model = KMeans(n_clusters=num_topic)
|
289 |
-
|
290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
topics, probs = topic_model.fit_transform(topic_abs)
|
292 |
return topic_model, topic_time, topics, probs
|
293 |
|
@@ -314,7 +359,7 @@ if uploaded_file is not None:
|
|
314 |
|
315 |
@st.cache_data(ttl=3600, show_spinner=False)
|
316 |
def Vis_Barchart(extype):
|
317 |
-
fig5 = topic_model.visualize_barchart(top_n_topics=num_topic
|
318 |
return fig5
|
319 |
|
320 |
@st.cache_data(ttl=3600, show_spinner=False)
|
@@ -328,41 +373,24 @@ if uploaded_file is not None:
|
|
328 |
try:
|
329 |
with st.spinner('Performing computations. Please wait ...'):
|
330 |
topic_model, topic_time, topics, probs = bertopic_vis(extype)
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
st.write(fig3)
|
350 |
-
|
351 |
-
elif viz == 'Visualize Topic Similarity':
|
352 |
-
with st.spinner('Performing computations. Please wait ...'):
|
353 |
-
fig4 = Vis_Heatmap(extype)
|
354 |
-
st.write(fig4)
|
355 |
-
|
356 |
-
elif viz == 'Visualize Terms':
|
357 |
-
with st.spinner('Performing computations. Please wait ...'):
|
358 |
-
fig5 = Vis_Barchart(extype)
|
359 |
-
st.write(fig5)
|
360 |
-
|
361 |
-
elif viz == 'Visualize Topics over Time':
|
362 |
-
with st.spinner('Performing computations. Please wait ...'):
|
363 |
-
fig6 = Vis_ToT(extype)
|
364 |
-
st.write(fig6)
|
365 |
-
|
366 |
|
367 |
except ValueError:
|
368 |
st.error('๐โโ๏ธ Please raise the number of topics and click submit')
|
|
|
101 |
words = [lemmatizer.lemmatize(word) for word in words]
|
102 |
return ' '.join(words)
|
103 |
paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
|
104 |
+
|
105 |
+
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
106 |
+
remove_dict = {word: None for word in words_rmv}
|
107 |
+
def remove_words(text):
|
108 |
+
words = text.split()
|
109 |
+
cleaned_words = [word for word in words if word not in remove_dict]
|
110 |
+
return ' '.join(cleaned_words)
|
111 |
+
paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
|
112 |
|
113 |
topic_abs = paper.Abstract_lem.values.tolist()
|
114 |
return topic_abs, paper
|
|
|
142 |
elif extype.endswith('.txt'):
|
143 |
papers = conv_txt(extype)
|
144 |
|
145 |
+
c1, c2, c3 = st.columns([3,2,5])
|
|
|
146 |
method = c1.selectbox(
|
147 |
'Choose method',
|
148 |
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
|
|
|
149 |
num_cho = c2.number_input('Choose number of topics', min_value=2, max_value=30, value=2)
|
150 |
+
words_to_remove = c3.text_input("Remove specific words. Separate words by semicolons (;)")
|
151 |
+
|
152 |
+
d1, d2 = st.columns([8,2])
|
153 |
+
d2.info("Don't do anything during the computing", icon="⚠️")
|
154 |
+
topic_abs, paper=clean_csv(extype)
|
155 |
+
|
156 |
+
#===advance settings===
|
157 |
+
with d1.expander("๐งฎ Show advance settings"):
|
158 |
+
t1, t2 = st.columns([5,5])
|
159 |
+
if method == 'pyLDA':
|
160 |
+
py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
|
161 |
+
py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
|
162 |
+
elif method == 'Biterm':
|
163 |
+
btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
|
164 |
+
btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
|
165 |
+
elif method == 'BERTopic':
|
166 |
+
bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
|
167 |
+
bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
|
168 |
+
bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
|
169 |
+
bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
|
170 |
+
bert_embedding_model = st.radio(
|
171 |
+
"embedding_model",
|
172 |
+
["all-MiniLM-L6-v2", "en_core_web_sm", "paraphrase-multilingual-MiniLM-L12-v2"],
|
173 |
+
captions = ["English", "English", "Supports 50+ languages"], index=0, horizontal=True)
|
174 |
+
else:
|
175 |
+
st.write('Please choose your preferred method')
|
176 |
+
if st.button("Submit", on_click=reset_all):
|
177 |
num_topic = num_cho
|
178 |
|
179 |
#===topic===
|
|
|
181 |
st.write('')
|
182 |
|
183 |
elif method == 'pyLDA':
|
184 |
+
tab1, tab2, tab3 = st.tabs(["๐ Generate visualization", "๐ Reference", "๐ Recommended Reading"])
|
185 |
|
186 |
with tab1:
|
187 |
#===visualization===
|
|
|
194 |
lda_model = LdaModel(corpus=corpus,
|
195 |
id2word=id2word,
|
196 |
num_topics=num_topic,
|
197 |
+
random_state=py_random_state,
|
198 |
+
chunksize=py_chunksize,
|
199 |
alpha='auto',
|
200 |
per_word_topics=True)
|
201 |
|
|
|
212 |
with st.spinner('Performing computations. Please wait ...'):
|
213 |
try:
|
214 |
py_lda_vis_html, coherence_lda, vis = pylda(extype)
|
215 |
+
st.write('Coherence score: ', coherence_lda)
|
216 |
st.components.v1.html(py_lda_vis_html, width=1500, height=800)
|
217 |
st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
|
218 |
|
|
|
260 |
docs_lens = list(map(len, docs_vec))
|
261 |
biterms = btm.get_biterms(docs_vec)
|
262 |
model = btm.BTM(
|
263 |
+
X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
|
264 |
+
model.fit(biterms, iterations=btm_iterations)
|
265 |
p_zd = model.transform(docs_vec)
|
266 |
coherence = model.coherence_
|
267 |
phi = tmp.get_phi(model)
|
268 |
topics_coords = tmp.prepare_coords(model)
|
269 |
totaltop = topics_coords.label.values.tolist()
|
270 |
+
perplexity = model.perplexity_
|
271 |
+
return topics_coords, phi, totaltop, perplexity
|
272 |
|
273 |
tab1, tab2, tab3 = st.tabs(["๐ Generate visualization", "๐ Reference", "๐ Recommended Reading"])
|
274 |
with tab1:
|
275 |
try:
|
276 |
with st.spinner('Performing computations. Please wait ...'):
|
277 |
+
topics_coords, phi, totaltop, perplexity = biterm_topic(extype)
|
278 |
col1, col2 = st.columns([4,6])
|
279 |
|
280 |
@st.cache_data(ttl=3600)
|
|
|
289 |
return btmvis_probs
|
290 |
|
291 |
with col1:
|
292 |
+
st.write('Perplexity score: ', perplexity)
|
293 |
+
st.write('')
|
294 |
numvis = st.selectbox(
|
295 |
'Choose topic',
|
296 |
(totaltop), on_change=reset_biterm)
|
|
|
317 |
elif method == 'BERTopic':
|
318 |
@st.cache_data(ttl=3600, show_spinner=False)
|
319 |
def bertopic_vis(extype):
|
320 |
+
if 'Publication Year' in paper.columns:
|
321 |
+
paper.rename(columns={'Publication Year': 'Year'}, inplace=True)
|
322 |
topic_time = paper.Year.values.tolist()
|
323 |
+
umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
|
324 |
+
min_dist=0.0, metric='cosine', random_state=bert_random_state)
|
325 |
cluster_model = KMeans(n_clusters=num_topic)
|
326 |
+
if bert_embedding_model == 'all-MiniLM-L6-v2':
|
327 |
+
emb_mod = 'all-MiniLM-L6-v2'
|
328 |
+
lang = 'en'
|
329 |
+
elif bert_embedding_model == 'en_core_web_sm':
|
330 |
+
emb_mod = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
|
331 |
+
lang = 'en'
|
332 |
+
elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
|
333 |
+
emb_mod = 'paraphrase-multilingual-MiniLM-L12-v2'
|
334 |
+
lang = 'multilingual'
|
335 |
+
topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
|
336 |
topics, probs = topic_model.fit_transform(topic_abs)
|
337 |
return topic_model, topic_time, topics, probs
|
338 |
|
|
|
359 |
|
360 |
@st.cache_data(ttl=3600, show_spinner=False)
|
361 |
def Vis_Barchart(extype):
|
362 |
+
fig5 = topic_model.visualize_barchart(top_n_topics=num_topic) #, n_words=10)
|
363 |
return fig5
|
364 |
|
365 |
@st.cache_data(ttl=3600, show_spinner=False)
|
|
|
373 |
try:
|
374 |
with st.spinner('Performing computations. Please wait ...'):
|
375 |
topic_model, topic_time, topics, probs = bertopic_vis(extype)
|
376 |
+
fig1 = Vis_Topics(extype)
|
377 |
+
fig2 = Vis_Documents(extype)
|
378 |
+
fig3 = Vis_Hierarchy(extype)
|
379 |
+
fig4 = Vis_Heatmap(extype)
|
380 |
+
fig5 = Vis_Barchart(extype)
|
381 |
+
fig6 = Vis_ToT(extype)
|
382 |
+
with st.expander("Visualize Topics"):
|
383 |
+
st.write(fig1)
|
384 |
+
with st.expander("Visualize Terms"):
|
385 |
+
st.write(fig5)
|
386 |
+
with st.expander("Visualize Documents"):
|
387 |
+
st.write(fig2)
|
388 |
+
with st.expander("Visualize Document Hierarchy"):
|
389 |
+
st.write(fig3)
|
390 |
+
with st.expander("Visualize Topic Similarity"):
|
391 |
+
st.write(fig4)
|
392 |
+
with st.expander("Visualize Topics over Time"):
|
393 |
+
st.write(fig6)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
394 |
|
395 |
except ValueError:
|
396 |
st.error('๐โโ๏ธ Please raise the number of topics and click submit')
|