Spaces:
Running
Running
Enable reproducible results using UMAP
Browse files
pages/2 Topic Modeling.py
CHANGED
@@ -33,6 +33,7 @@ import spacy
|
|
33 |
import en_core_web_sm
|
34 |
import pipeline
|
35 |
from html2image import Html2Image
|
|
|
36 |
|
37 |
|
38 |
#===config===
|
@@ -282,9 +283,11 @@ if uploaded_file is not None:
|
|
282 |
@st.cache_data(ttl=3600, show_spinner=False)
def bertopic_vis(extype):
    """Fit a BERTopic model on ``topic_abs`` and return it with its outputs.

    Reads the module-level names ``paper``, ``num_topic`` and ``topic_abs``.
    ``extype`` is not used inside the body.
    NOTE(review): presumably ``extype`` only acts as the Streamlit cache
    key -- confirm against the callers.

    Returns:
        A tuple ``(topic_model, topic_time, topics, probs)``: the fitted
        BERTopic model, ``paper.Year`` as a list, and the two values
        returned by ``fit_transform``.
    """
    topic_time = paper.Year.values.tolist()
    # KMeans is passed in the ``hdbscan_model`` slot, so it is the clusterer
    # BERTopic uses; the number of clusters comes from ``num_topic``.
    cluster_model = KMeans(n_clusters=num_topic)
    # spaCy pipeline used as the embedding model, loaded with the listed
    # components excluded.
    nlp = en_core_web_sm.load(
        exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
    )
    topic_model = BERTopic(
        embedding_model=nlp,
        hdbscan_model=cluster_model,
        language="multilingual",
    )
    # Train exactly once. The previous code chained ``.fit(topic_abs)`` onto
    # the constructor and then called ``fit_transform(topic_abs)`` again,
    # which trained the whole model twice on the same documents.
    topics, probs = topic_model.fit_transform(topic_abs)
    return topic_model, topic_time, topics, probs
|
290 |
|
|
|
33 |
import en_core_web_sm
|
34 |
import pipeline
|
35 |
from html2image import Html2Image
|
36 |
+
from umap import UMAP
|
37 |
|
38 |
|
39 |
#===config===
|
|
|
283 |
@st.cache_data(ttl=3600, show_spinner=False)
def bertopic_vis(extype):
    """Fit a seeded BERTopic model on ``topic_abs`` and return its outputs.

    Reads the module-level names ``paper``, ``num_topic`` and ``topic_abs``.
    ``extype`` is not used inside the body.
    NOTE(review): presumably ``extype`` only acts as the Streamlit cache
    key -- confirm against the callers.

    Returns:
        A tuple ``(topic_model, topic_time, topics, probs)``: the fitted
        BERTopic model, ``paper.Year`` as a list, and the two values
        returned by ``fit_transform``.
    """
    topic_time = paper.Year.values.tolist()
    # Fixed seed so the dimensionality reduction is repeatable across runs.
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    # Seed the clusterer as well: with only UMAP seeded, the KMeans
    # initialisation could still change the topic assignments between runs.
    # NOTE(review): assumes KMeans is scikit-learn's (import not visible
    # here), which accepts ``random_state``.
    cluster_model = KMeans(n_clusters=num_topic, random_state=42)
    # spaCy pipeline used as the embedding model, loaded with the listed
    # components excluded.
    nlp = en_core_web_sm.load(
        exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
    )
    topic_model = BERTopic(
        embedding_model=nlp,
        hdbscan_model=cluster_model,
        language="multilingual",
        umap_model=umap_model,
    )
    # Train exactly once. The previous code chained ``.fit(topic_abs)`` onto
    # the constructor and then called ``fit_transform(topic_abs)`` again,
    # which trained the whole model twice on the same documents.
    topics, probs = topic_model.fit_transform(topic_abs)
    return topic_model, topic_time, topics, probs
|
293 |
|