faizhalas committed on
Commit
843bacb
1 Parent(s): eea0cdb

Enable reproducible results using UMAP

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +4 -1
pages/2 Topic Modeling.py CHANGED
@@ -33,6 +33,7 @@ import spacy
33
  import en_core_web_sm
34
  import pipeline
35
  from html2image import Html2Image
 
36
 
37
 
38
  #===config===
@@ -282,9 +283,11 @@ if uploaded_file is not None:
282
  @st.cache_data(ttl=3600, show_spinner=False)
283
  def bertopic_vis(extype):
284
  topic_time = paper.Year.values.tolist()
 
 
285
  cluster_model = KMeans(n_clusters=num_topic)
286
  nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
287
- topic_model = BERTopic(embedding_model=nlp, hdbscan_model=cluster_model, language="multilingual").fit(topic_abs)
288
  topics, probs = topic_model.fit_transform(topic_abs)
289
  return topic_model, topic_time, topics, probs
290
 
 
33
  import en_core_web_sm
34
  import pipeline
35
  from html2image import Html2Image
36
+ from umap import UMAP
37
 
38
 
39
  #===config===
 
283
  @st.cache_data(ttl=3600, show_spinner=False)
284
  def bertopic_vis(extype):
285
  topic_time = paper.Year.values.tolist()
286
+ umap_model = UMAP(n_neighbors=15, n_components=5,
287
+ min_dist=0.0, metric='cosine', random_state=42)
288
  cluster_model = KMeans(n_clusters=num_topic)
289
  nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
290
+ topic_model = BERTopic(embedding_model=nlp, hdbscan_model=cluster_model, language="multilingual", umap_model=umap_model).fit(topic_abs)
291
  topics, probs = topic_model.fit_transform(topic_abs)
292
  return topic_model, topic_time, topics, probs
293