faizhalas committed on
Commit
20fe7b5
β€’
1 Parent(s): e13cd47

Adding an option to choose a column.

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +52 -64
pages/2 Topic Modeling.py CHANGED
@@ -85,46 +85,6 @@ def reset_all():
85
  #===avoiding deadlock===
86
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
87
 
88
- #===clean csv===
89
- @st.cache_data(ttl=3600, show_spinner=False)
90
- def clean_csv(extype):
91
- try:
92
- paper = papers.dropna(subset=['Abstract'])
93
- except KeyError:
94
- st.error('Error: Please check your Abstract column.')
95
- sys.exit(1)
96
- paper = paper[~paper.Abstract.str.contains("No abstract available")]
97
- paper = paper[~paper.Abstract.str.contains("STRAIT")]
98
-
99
- #===mapping===
100
- paper['Abstract_pre'] = paper['Abstract'].map(lambda x: re.sub('[,:;\.!-?β€’=]', ' ', x))
101
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: x.lower())
102
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('Β©.*', '', x))
103
- paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
104
-
105
- #===stopword removal===
106
- stop = stopwords.words('english')
107
- paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
108
-
109
- #===lemmatize===
110
- lemmatizer = WordNetLemmatizer()
111
- def lemmatize_words(text):
112
- words = text.split()
113
- words = [lemmatizer.lemmatize(word) for word in words]
114
- return ' '.join(words)
115
- paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
116
-
117
- words_rmv = [word.strip() for word in words_to_remove.split(";")]
118
- remove_dict = {word: None for word in words_rmv}
119
- def remove_words(text):
120
- words = text.split()
121
- cleaned_words = [word for word in words if word not in remove_dict]
122
- return ' '.join(cleaned_words)
123
- paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
124
-
125
- topic_abs = paper.Abstract_lem.values.tolist()
126
- return topic_abs, paper
127
-
128
  #===upload file===
129
  @st.cache_data(ttl=3600)
130
  def upload(file):
@@ -153,15 +113,58 @@ if uploaded_file is not None:
153
  papers = upload(extype)
154
  elif extype.endswith('.txt'):
155
  papers = conv_txt(extype)
156
-
157
- c1, c2, c3 = st.columns([3,2,5])
 
 
158
  method = c1.selectbox(
159
  'Choose method',
160
  ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
161
- num_cho = c2.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
162
- words_to_remove = c3.text_input("Remove specific words. Separate words by semicolons (;)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
- d1, d2 = st.columns([8,2])
 
 
 
 
 
 
 
 
 
 
 
165
  d2.info("Don't do anything during the computing", icon="⚠️")
166
  topic_abs, paper=clean_csv(extype)
167
 
@@ -328,9 +331,6 @@ if uploaded_file is not None:
328
  elif method == 'BERTopic':
329
  @st.cache_data(ttl=3600, show_spinner=False)
330
  def bertopic_vis(extype):
331
- if 'Publication Year' in paper.columns:
332
- paper.rename(columns={'Publication Year': 'Year'}, inplace=True)
333
- topic_time = paper.Year.values.tolist()
334
  umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
335
  min_dist=0.0, metric='cosine', random_state=bert_random_state)
336
  cluster_model = KMeans(n_clusters=num_topic)
@@ -345,7 +345,7 @@ if uploaded_file is not None:
345
  lang = 'multilingual'
346
  topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
347
  topics, probs = topic_model.fit_transform(topic_abs)
348
- return topic_model, topic_time, topics, probs
349
 
350
  @st.cache_data(ttl=3600, show_spinner=False)
351
  def Vis_Topics(extype):
@@ -370,21 +370,15 @@ if uploaded_file is not None:
370
 
371
  @st.cache_data(ttl=3600, show_spinner=False)
372
  def Vis_Barchart(extype):
373
- fig5 = topic_model.visualize_barchart(top_n_topics=num_topic) #, n_words=10)
374
  return fig5
375
-
376
- @st.cache_data(ttl=3600, show_spinner=False)
377
- def Vis_ToT(extype):
378
- topics_over_time = topic_model.topics_over_time(topic_abs, topic_time)
379
- fig6 = topic_model.visualize_topics_over_time(topics_over_time)
380
- return fig6
381
 
382
  tab1, tab2, tab3 = st.tabs(["πŸ“ˆ Generate visualization", "πŸ“ƒ Reference", "πŸ““ Recommended Reading"])
383
  with tab1:
384
  try:
385
  with st.spinner('Performing computations. Please wait ...'):
386
 
387
- topic_model, topic_time, topics, probs = bertopic_vis(extype)
388
  time.sleep(.5)
389
  st.toast('Visualize Topics', icon='πŸƒ')
390
  fig1 = Vis_Topics(extype)
@@ -404,10 +398,6 @@ if uploaded_file is not None:
404
  time.sleep(.5)
405
  st.toast('Visualize Terms', icon='πŸƒ')
406
  fig5 = Vis_Barchart(extype)
407
-
408
- time.sleep(.5)
409
- st.toast('Visualize Topics over Time', icon='πŸƒ')
410
- fig6 = Vis_ToT(extype)
411
 
412
  with st.expander("Visualize Topics"):
413
  st.write(fig1)
@@ -419,9 +409,7 @@ if uploaded_file is not None:
419
  st.write(fig3)
420
  with st.expander("Visualize Topic Similarity"):
421
  st.write(fig4)
422
- with st.expander("Visualize Topics over Time"):
423
- st.write(fig6)
424
-
425
  except ValueError:
426
  st.error('πŸ™‡β€β™‚οΈ Please raise the number of topics and click submit')
427
 
 
85
  #===avoiding deadlock===
86
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  #===upload file===
89
  @st.cache_data(ttl=3600)
90
  def upload(file):
 
113
  papers = upload(extype)
114
  elif extype.endswith('.txt'):
115
  papers = conv_txt(extype)
116
+
117
+ coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
118
+
119
+ c1, c2 = st.columns([3,4])
120
  method = c1.selectbox(
121
  'Choose method',
122
  ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
123
+ num_cho = c1.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
124
+ ColCho = c2.selectbox(
125
+ 'Choose column',
126
+ (coldf), on_change=reset_all)
127
+ words_to_remove = c2.text_input("Remove specific words. Separate words by semicolons (;)")
128
+ rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
129
+ rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
130
+
131
+ #===clean csv===
132
+ @st.cache_data(ttl=3600, show_spinner=False)
133
+ def clean_csv(extype):
134
+ paper = papers.dropna(subset=[ColCho])
135
+
136
+ #===mapping===
137
+ paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
138
+ if rem_punc:
139
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('[,:;\.!-?β€’=]', ' ', x))
140
+ paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
141
+ if rem_copyright:
142
+ paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('Β©.*', '', x))
143
+
144
+ #===stopword removal===
145
+ stop = stopwords.words('english')
146
+ paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
147
+
148
+ #===lemmatize===
149
+ lemmatizer = WordNetLemmatizer()
150
+ def lemmatize_words(text):
151
+ words = text.split()
152
+ words = [lemmatizer.lemmatize(word) for word in words]
153
+ return ' '.join(words)
154
+ paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
155
 
156
+ words_rmv = [word.strip() for word in words_to_remove.split(";")]
157
+ remove_dict = {word: None for word in words_rmv}
158
+ def remove_words(text):
159
+ words = text.split()
160
+ cleaned_words = [word for word in words if word not in remove_dict]
161
+ return ' '.join(cleaned_words)
162
+ paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
163
+
164
+ topic_abs = paper.Abstract_lem.values.tolist()
165
+ return topic_abs, paper
166
+
167
+ d1, d2 = st.columns([7,3])
168
  d2.info("Don't do anything during the computing", icon="⚠️")
169
  topic_abs, paper=clean_csv(extype)
170
 
 
331
  elif method == 'BERTopic':
332
  @st.cache_data(ttl=3600, show_spinner=False)
333
  def bertopic_vis(extype):
 
 
 
334
  umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
335
  min_dist=0.0, metric='cosine', random_state=bert_random_state)
336
  cluster_model = KMeans(n_clusters=num_topic)
 
345
  lang = 'multilingual'
346
  topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
347
  topics, probs = topic_model.fit_transform(topic_abs)
348
+ return topic_model, topics, probs
349
 
350
  @st.cache_data(ttl=3600, show_spinner=False)
351
  def Vis_Topics(extype):
 
370
 
371
  @st.cache_data(ttl=3600, show_spinner=False)
372
  def Vis_Barchart(extype):
373
+ fig5 = topic_model.visualize_barchart(top_n_topics=num_topic)
374
  return fig5
 
 
 
 
 
 
375
 
376
  tab1, tab2, tab3 = st.tabs(["πŸ“ˆ Generate visualization", "πŸ“ƒ Reference", "πŸ““ Recommended Reading"])
377
  with tab1:
378
  try:
379
  with st.spinner('Performing computations. Please wait ...'):
380
 
381
+ topic_model, topics, probs = bertopic_vis(extype)
382
  time.sleep(.5)
383
  st.toast('Visualize Topics', icon='πŸƒ')
384
  fig1 = Vis_Topics(extype)
 
398
  time.sleep(.5)
399
  st.toast('Visualize Terms', icon='πŸƒ')
400
  fig5 = Vis_Barchart(extype)
 
 
 
 
401
 
402
  with st.expander("Visualize Topics"):
403
  st.write(fig1)
 
409
  st.write(fig3)
410
  with st.expander("Visualize Topic Similarity"):
411
  st.write(fig4)
412
+
 
 
413
  except ValueError:
414
  st.error('πŸ™‡β€β™‚οΈ Please raise the number of topics and click submit')
415