faizhalas committed on
Commit
79c48b0
•
1 Parent(s): ea89481

adding advance setting

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +80 -52
pages/2 Topic Modeling.py CHANGED
@@ -101,6 +101,14 @@ def clean_csv(extype):
101
  words = [lemmatizer.lemmatize(word) for word in words]
102
  return ' '.join(words)
103
  paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
 
 
 
 
 
 
 
 
104
 
105
  topic_abs = paper.Abstract_lem.values.tolist()
106
  return topic_abs, paper
@@ -134,14 +142,38 @@ if uploaded_file is not None:
134
  elif extype.endswith('.txt'):
135
  papers = conv_txt(extype)
136
 
137
- topic_abs, paper=clean_csv(extype)
138
- c1, c2 = st.columns([5,5])
139
  method = c1.selectbox(
140
  'Choose method',
141
  ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
142
- c1.info("Don't do anything during the computing", icon="⚠️")
143
  num_cho = c2.number_input('Choose number of topics', min_value=2, max_value=30, value=2)
144
- if c2.button("Submit", on_click=reset_all):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  num_topic = num_cho
146
 
147
  #===topic===
@@ -149,7 +181,7 @@ if uploaded_file is not None:
149
  st.write('')
150
 
151
  elif method == 'pyLDA':
152
- tab1, tab2, tab3 = st.tabs(["📈 Generate visualization & Calculate coherence", "📃 Reference", "📓 Recommended Reading"])
153
 
154
  with tab1:
155
  #===visualization===
@@ -162,8 +194,8 @@ if uploaded_file is not None:
162
  lda_model = LdaModel(corpus=corpus,
163
  id2word=id2word,
164
  num_topics=num_topic,
165
- random_state=0,
166
- chunksize=100,
167
  alpha='auto',
168
  per_word_topics=True)
169
 
@@ -180,7 +212,7 @@ if uploaded_file is not None:
180
  with st.spinner('Performing computations. Please wait ...'):
181
  try:
182
  py_lda_vis_html, coherence_lda, vis = pylda(extype)
183
- st.write('Coherence: ', (coherence_lda))
184
  st.components.v1.html(py_lda_vis_html, width=1500, height=800)
185
  st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
186
 
@@ -228,20 +260,21 @@ if uploaded_file is not None:
228
  docs_lens = list(map(len, docs_vec))
229
  biterms = btm.get_biterms(docs_vec)
230
  model = btm.BTM(
231
- X, vocabulary, seed=12321, T=num_topic, M=20, alpha=50/8, beta=0.01)
232
- model.fit(biterms, iterations=20)
233
  p_zd = model.transform(docs_vec)
234
  coherence = model.coherence_
235
  phi = tmp.get_phi(model)
236
  topics_coords = tmp.prepare_coords(model)
237
  totaltop = topics_coords.label.values.tolist()
238
- return topics_coords, phi, totaltop
 
239
 
240
  tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
241
  with tab1:
242
  try:
243
  with st.spinner('Performing computations. Please wait ...'):
244
- topics_coords, phi, totaltop = biterm_topic(extype)
245
  col1, col2 = st.columns([4,6])
246
 
247
  @st.cache_data(ttl=3600)
@@ -256,6 +289,8 @@ if uploaded_file is not None:
256
  return btmvis_probs
257
 
258
  with col1:
 
 
259
  numvis = st.selectbox(
260
  'Choose topic',
261
  (totaltop), on_change=reset_biterm)
@@ -282,12 +317,22 @@ if uploaded_file is not None:
282
  elif method == 'BERTopic':
283
  @st.cache_data(ttl=3600, show_spinner=False)
284
  def bertopic_vis(extype):
 
 
285
  topic_time = paper.Year.values.tolist()
286
- umap_model = UMAP(n_neighbors=15, n_components=5,
287
- min_dist=0.0, metric='cosine', random_state=42)
288
  cluster_model = KMeans(n_clusters=num_topic)
289
- nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
290
- topic_model = BERTopic(embedding_model=nlp, hdbscan_model=cluster_model, language="multilingual", umap_model=umap_model)
 
 
 
 
 
 
 
 
291
  topics, probs = topic_model.fit_transform(topic_abs)
292
  return topic_model, topic_time, topics, probs
293
 
@@ -314,7 +359,7 @@ if uploaded_file is not None:
314
 
315
  @st.cache_data(ttl=3600, show_spinner=False)
316
  def Vis_Barchart(extype):
317
- fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, n_words=10)
318
  return fig5
319
 
320
  @st.cache_data(ttl=3600, show_spinner=False)
@@ -328,41 +373,24 @@ if uploaded_file is not None:
328
  try:
329
  with st.spinner('Performing computations. Please wait ...'):
330
  topic_model, topic_time, topics, probs = bertopic_vis(extype)
331
- #===visualization===
332
- viz = st.selectbox(
333
- 'Choose visualization',
334
- ('Visualize Topics', 'Visualize Documents', 'Visualize Document Hierarchy', 'Visualize Topic Similarity', 'Visualize Terms', 'Visualize Topics over Time'))
335
-
336
- if viz == 'Visualize Topics':
337
- with st.spinner('Performing computations. Please wait ...'):
338
- fig1 = Vis_Topics(extype)
339
- st.write(fig1)
340
-
341
- elif viz == 'Visualize Documents':
342
- with st.spinner('Performing computations. Please wait ...'):
343
- fig2 = Vis_Documents(extype)
344
- st.write(fig2)
345
-
346
- elif viz == 'Visualize Document Hierarchy':
347
- with st.spinner('Performing computations. Please wait ...'):
348
- fig3 = Vis_Hierarchy(extype)
349
- st.write(fig3)
350
-
351
- elif viz == 'Visualize Topic Similarity':
352
- with st.spinner('Performing computations. Please wait ...'):
353
- fig4 = Vis_Heatmap(extype)
354
- st.write(fig4)
355
-
356
- elif viz == 'Visualize Terms':
357
- with st.spinner('Performing computations. Please wait ...'):
358
- fig5 = Vis_Barchart(extype)
359
- st.write(fig5)
360
-
361
- elif viz == 'Visualize Topics over Time':
362
- with st.spinner('Performing computations. Please wait ...'):
363
- fig6 = Vis_ToT(extype)
364
- st.write(fig6)
365
-
366
 
367
  except ValueError:
368
  st.error('🙇‍♂️ Please raise the number of topics and click submit')
 
101
  words = [lemmatizer.lemmatize(word) for word in words]
102
  return ' '.join(words)
103
  paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
104
+
105
+ words_rmv = [word.strip() for word in words_to_remove.split(";")]
106
+ remove_dict = {word: None for word in words_rmv}
107
+ def remove_words(text):
108
+ words = text.split()
109
+ cleaned_words = [word for word in words if word not in remove_dict]
110
+ return ' '.join(cleaned_words)
111
+ paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
112
 
113
  topic_abs = paper.Abstract_lem.values.tolist()
114
  return topic_abs, paper
 
142
  elif extype.endswith('.txt'):
143
  papers = conv_txt(extype)
144
 
145
+ c1, c2, c3 = st.columns([3,2,5])
 
146
  method = c1.selectbox(
147
  'Choose method',
148
  ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
 
149
  num_cho = c2.number_input('Choose number of topics', min_value=2, max_value=30, value=2)
150
+ words_to_remove = c3.text_input("Remove specific words. Separate words by semicolons (;)")
151
+
152
+ d1, d2 = st.columns([8,2])
153
+ d2.info("Don't do anything during the computing", icon="⚠️")
154
+ topic_abs, paper=clean_csv(extype)
155
+
156
+ #===advance settings===
157
+ with d1.expander("🧮 Show advance settings"):
158
+ t1, t2 = st.columns([5,5])
159
+ if method == 'pyLDA':
160
+ py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
161
+ py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
162
+ elif method == 'Biterm':
163
+ btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
164
+ btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
165
+ elif method == 'BERTopic':
166
+ bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
167
+ bert_random_state = t1.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
168
+ bert_n_components = t2.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
169
+ bert_n_neighbors = t2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
170
+ bert_embedding_model = st.radio(
171
+ "embedding_model",
172
+ ["all-MiniLM-L6-v2", "en_core_web_sm", "paraphrase-multilingual-MiniLM-L12-v2"],
173
+ captions = ["English", "English", "Supports 50+ languages"], index=0, horizontal=True)
174
+ else:
175
+ st.write('Please choose your preferred method')
176
+ if st.button("Submit", on_click=reset_all):
177
  num_topic = num_cho
178
 
179
  #===topic===
 
181
  st.write('')
182
 
183
  elif method == 'pyLDA':
184
+ tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
185
 
186
  with tab1:
187
  #===visualization===
 
194
  lda_model = LdaModel(corpus=corpus,
195
  id2word=id2word,
196
  num_topics=num_topic,
197
+ random_state=py_random_state,
198
+ chunksize=py_chunksize,
199
  alpha='auto',
200
  per_word_topics=True)
201
 
 
212
  with st.spinner('Performing computations. Please wait ...'):
213
  try:
214
  py_lda_vis_html, coherence_lda, vis = pylda(extype)
215
+ st.write('Coherence score: ', coherence_lda)
216
  st.components.v1.html(py_lda_vis_html, width=1500, height=800)
217
  st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
218
 
 
260
  docs_lens = list(map(len, docs_vec))
261
  biterms = btm.get_biterms(docs_vec)
262
  model = btm.BTM(
263
+ X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
264
+ model.fit(biterms, iterations=btm_iterations)
265
  p_zd = model.transform(docs_vec)
266
  coherence = model.coherence_
267
  phi = tmp.get_phi(model)
268
  topics_coords = tmp.prepare_coords(model)
269
  totaltop = topics_coords.label.values.tolist()
270
+ perplexity = model.perplexity_
271
+ return topics_coords, phi, totaltop, perplexity
272
 
273
  tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
274
  with tab1:
275
  try:
276
  with st.spinner('Performing computations. Please wait ...'):
277
+ topics_coords, phi, totaltop, perplexity = biterm_topic(extype)
278
  col1, col2 = st.columns([4,6])
279
 
280
  @st.cache_data(ttl=3600)
 
289
  return btmvis_probs
290
 
291
  with col1:
292
+ st.write('Perplexity score: ', perplexity)
293
+ st.write('')
294
  numvis = st.selectbox(
295
  'Choose topic',
296
  (totaltop), on_change=reset_biterm)
 
317
  elif method == 'BERTopic':
318
  @st.cache_data(ttl=3600, show_spinner=False)
319
  def bertopic_vis(extype):
320
+ if 'Publication Year' in paper.columns:
321
+ paper.rename(columns={'Publication Year': 'Year'}, inplace=True)
322
  topic_time = paper.Year.values.tolist()
323
+ umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
324
+ min_dist=0.0, metric='cosine', random_state=bert_random_state)
325
  cluster_model = KMeans(n_clusters=num_topic)
326
+ if bert_embedding_model == 'all-MiniLM-L6-v2':
327
+ emb_mod = 'all-MiniLM-L6-v2'
328
+ lang = 'en'
329
+ elif bert_embedding_model == 'en_core_web_sm':
330
+ emb_mod = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
331
+ lang = 'en'
332
+ elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
333
+ emb_mod = 'paraphrase-multilingual-MiniLM-L12-v2'
334
+ lang = 'multilingual'
335
+ topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
336
  topics, probs = topic_model.fit_transform(topic_abs)
337
  return topic_model, topic_time, topics, probs
338
 
 
359
 
360
  @st.cache_data(ttl=3600, show_spinner=False)
361
  def Vis_Barchart(extype):
362
+ fig5 = topic_model.visualize_barchart(top_n_topics=num_topic) #, n_words=10)
363
  return fig5
364
 
365
  @st.cache_data(ttl=3600, show_spinner=False)
 
373
  try:
374
  with st.spinner('Performing computations. Please wait ...'):
375
  topic_model, topic_time, topics, probs = bertopic_vis(extype)
376
+ fig1 = Vis_Topics(extype)
377
+ fig2 = Vis_Documents(extype)
378
+ fig3 = Vis_Hierarchy(extype)
379
+ fig4 = Vis_Heatmap(extype)
380
+ fig5 = Vis_Barchart(extype)
381
+ fig6 = Vis_ToT(extype)
382
+ with st.expander("Visualize Topics"):
383
+ st.write(fig1)
384
+ with st.expander("Visualize Terms"):
385
+ st.write(fig5)
386
+ with st.expander("Visualize Documents"):
387
+ st.write(fig2)
388
+ with st.expander("Visualize Document Hierarchy"):
389
+ st.write(fig3)
390
+ with st.expander("Visualize Topic Similarity"):
391
+ st.write(fig4)
392
+ with st.expander("Visualize Topics over Time"):
393
+ st.write(fig6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  except ValueError:
396
  st.error('🙇‍♂️ Please raise the number of topics and click submit')