Spaces:
Running
Running
Adding an option to choose a column.
Browse files- pages/2 Topic Modeling.py +52 -64
pages/2 Topic Modeling.py
CHANGED
@@ -85,46 +85,6 @@ def reset_all():
|
|
85 |
#===avoiding deadlock===
|
86 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
87 |
|
88 |
-
#===clean csv===
|
89 |
-
@st.cache_data(ttl=3600, show_spinner=False)
def clean_csv(extype):
    """Clean the 'Abstract' column of ``papers`` for topic modeling.

    Reads ``papers`` and ``words_to_remove`` from the enclosing script scope.
    Rows with a missing abstract, or whose abstract contains
    "No abstract available" or "STRAIT", are dropped. The remaining text is
    stripped of punctuation, lower-cased, cut at the copyright sign,
    stop-word filtered, lemmatized, and finally purged of the user-supplied
    words.

    Args:
        extype: Not read in the body; it is the only call argument handed to
            the ``st.cache_data`` wrapper.
            NOTE(review): ``papers`` and ``words_to_remove`` come from the
            enclosing scope, so they are presumably not part of the cache
            key - confirm the cache is cleared when they change.

    Returns:
        tuple: ``(topic_abs, paper)`` where ``topic_abs`` is the list of
        cleaned strings and ``paper`` is the filtered DataFrame with the
        added ``Abstract_pre``, ``Abstract_stop`` and ``Abstract_lem`` columns.

    Raises:
        SystemExit: when ``papers`` has no 'Abstract' column (after showing
            an error message through ``st.error``).
    """
    try:
        paper = papers.dropna(subset=['Abstract'])
    except KeyError:
        st.error('Error: Please check your Abstract column.')
        sys.exit(1)

    # Both markers are plain substrings, so skip regex interpretation.
    no_abstract = paper.Abstract.str.contains("No abstract available", regex=False)
    paper = paper[~no_abstract]
    strait = paper.Abstract.str.contains("STRAIT", regex=False)
    # Explicit copy: new columns are assigned below, and writing them onto a
    # boolean-filtered selection triggers pandas' SettingWithCopyWarning.
    paper = paper[~strait].copy()

    #===mapping===
    # NOTE(review): inside the character class `!-?` is a range (ASCII '!'
    # through '?'), so digits, quotes, parentheses, slashes etc. are replaced
    # as well. Behavior kept as-is - confirm this is intended.
    punct = re.compile(r'[,:;\.!-?•=]')
    copyright_tail = re.compile('©.*')
    paper['Abstract_pre'] = paper['Abstract'].map(lambda x: punct.sub(' ', x))
    paper['Abstract_pre'] = paper['Abstract_pre'].str.lower()
    paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: copyright_tail.sub('', x))
    paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)

    #===stopword removal===
    # A set gives O(1) membership tests instead of scanning the list per word.
    stop = set(stopwords.words('english'))
    paper['Abstract_stop'] = paper['Abstract_pre'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop))

    #===lemmatize===
    lemmatizer = WordNetLemmatizer()

    def lemmatize_words(text):
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)

    #===remove user-specified words===
    # The text is already lower-cased, so lower-case the user's words too;
    # otherwise an entry such as "Library" could never match.
    words_rmv = {word.strip().lower() for word in words_to_remove.split(";")}

    def remove_words(text):
        return ' '.join(word for word in text.split() if word not in words_rmv)

    paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)

    topic_abs = paper['Abstract_lem'].values.tolist()
    return topic_abs, paper
|
127 |
-
|
128 |
#===upload file===
|
129 |
@st.cache_data(ttl=3600)
|
130 |
def upload(file):
|
@@ -153,15 +113,58 @@ if uploaded_file is not None:
|
|
153 |
papers = upload(extype)
|
154 |
elif extype.endswith('.txt'):
|
155 |
papers = conv_txt(extype)
|
156 |
-
|
157 |
-
|
|
|
|
|
158 |
method = c1.selectbox(
|
159 |
'Choose method',
|
160 |
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
|
161 |
-
num_cho =
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
d2.info("Don't do anything during the computing", icon="⚠️")
|
166 |
topic_abs, paper=clean_csv(extype)
|
167 |
|
@@ -328,9 +331,6 @@ if uploaded_file is not None:
|
|
328 |
elif method == 'BERTopic':
|
329 |
@st.cache_data(ttl=3600, show_spinner=False)
|
330 |
def bertopic_vis(extype):
|
331 |
-
if 'Publication Year' in paper.columns:
|
332 |
-
paper.rename(columns={'Publication Year': 'Year'}, inplace=True)
|
333 |
-
topic_time = paper.Year.values.tolist()
|
334 |
umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
|
335 |
min_dist=0.0, metric='cosine', random_state=bert_random_state)
|
336 |
cluster_model = KMeans(n_clusters=num_topic)
|
@@ -345,7 +345,7 @@ if uploaded_file is not None:
|
|
345 |
lang = 'multilingual'
|
346 |
topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
|
347 |
topics, probs = topic_model.fit_transform(topic_abs)
|
348 |
-
return topic_model,
|
349 |
|
350 |
@st.cache_data(ttl=3600, show_spinner=False)
|
351 |
def Vis_Topics(extype):
|
@@ -370,21 +370,15 @@ if uploaded_file is not None:
|
|
370 |
|
371 |
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Barchart(extype):
    """Return the bar-chart figure produced by the enclosing-scope ``topic_model``.

    ``extype`` is not read in the body; ``topic_model`` and ``num_topic``
    are taken from the enclosing scope.
    """
    return topic_model.visualize_barchart(top_n_topics=num_topic)
|
375 |
-
|
376 |
-
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_ToT(extype):
    """Return the topics-over-time figure from the enclosing-scope ``topic_model``.

    ``extype`` is not read in the body; ``topic_model``, ``topic_abs`` and
    ``topic_time`` are taken from the enclosing scope.
    """
    per_period = topic_model.topics_over_time(topic_abs, topic_time)
    return topic_model.visualize_topics_over_time(per_period)
|
381 |
|
382 |
tab1, tab2, tab3 = st.tabs(["π Generate visualization", "π Reference", "π Recommended Reading"])
|
383 |
with tab1:
|
384 |
try:
|
385 |
with st.spinner('Performing computations. Please wait ...'):
|
386 |
|
387 |
-
topic_model,
|
388 |
time.sleep(.5)
|
389 |
st.toast('Visualize Topics', icon='π')
|
390 |
fig1 = Vis_Topics(extype)
|
@@ -404,10 +398,6 @@ if uploaded_file is not None:
|
|
404 |
time.sleep(.5)
|
405 |
st.toast('Visualize Terms', icon='π')
|
406 |
fig5 = Vis_Barchart(extype)
|
407 |
-
|
408 |
-
time.sleep(.5)
|
409 |
-
st.toast('Visualize Topics over Time', icon='π')
|
410 |
-
fig6 = Vis_ToT(extype)
|
411 |
|
412 |
with st.expander("Visualize Topics"):
|
413 |
st.write(fig1)
|
@@ -419,9 +409,7 @@ if uploaded_file is not None:
|
|
419 |
st.write(fig3)
|
420 |
with st.expander("Visualize Topic Similarity"):
|
421 |
st.write(fig4)
|
422 |
-
|
423 |
-
st.write(fig6)
|
424 |
-
|
425 |
except ValueError:
|
426 |
st.error('πββοΈ Please raise the number of topics and click submit')
|
427 |
|
|
|
85 |
#===avoiding deadlock===
|
86 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
#===upload file===
|
89 |
@st.cache_data(ttl=3600)
|
90 |
def upload(file):
|
|
|
113 |
papers = upload(extype)
|
114 |
elif extype.endswith('.txt'):
|
115 |
papers = conv_txt(extype)
|
116 |
+
|
117 |
+
coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
118 |
+
|
119 |
+
c1, c2 = st.columns([3,4])
|
120 |
method = c1.selectbox(
|
121 |
'Choose method',
|
122 |
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
|
123 |
+
num_cho = c1.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
124 |
+
ColCho = c2.selectbox(
|
125 |
+
'Choose column',
|
126 |
+
(coldf), on_change=reset_all)
|
127 |
+
words_to_remove = c2.text_input("Remove specific words. Separate words by semicolons (;)")
|
128 |
+
rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
|
129 |
+
rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
|
130 |
+
|
131 |
+
#===clean csv===
|
132 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def clean_csv(extype):
    """Clean the user-selected text column of ``papers`` for topic modeling.

    Reads ``papers``, ``ColCho``, ``rem_punc``, ``rem_copyright`` and
    ``words_to_remove`` from the enclosing script scope. Rows with a missing
    value in the chosen column are dropped; the text is lower-cased,
    optionally stripped of punctuation and of everything from the copyright
    sign onward, stop-word filtered, lemmatized, and finally purged of the
    user-supplied words.

    Args:
        extype: Not read in the body; it is the only call argument handed to
            the ``st.cache_data`` wrapper.
            NOTE(review): the widget values are read from the enclosing
            scope, so they are presumably not part of the cache key -
            confirm the cache is cleared whenever any of them changes.

    Returns:
        tuple: ``(topic_abs, paper)`` where ``topic_abs`` is the list of
        cleaned strings and ``paper`` is the filtered DataFrame with the
        added ``Abstract_pre``, ``Abstract_stop`` and ``Abstract_lem`` columns.
    """
    # Explicit copy so the derived columns are never written into a view of
    # the shared `papers` frame.
    paper = papers.dropna(subset=[ColCho]).copy()

    #===mapping===
    # astype(str) keeps an object column that holds a stray non-string value
    # from raising AttributeError on .lower().
    paper['Abstract_pre'] = paper[ColCho].astype(str).str.lower()
    if rem_punc:
        # NOTE(review): inside the character class `!-?` is a range (ASCII
        # '!' through '?'), so digits, quotes, parentheses, slashes etc. are
        # replaced as well. Behavior kept as-is - confirm this is intended.
        punct = re.compile(r'[,:;\.!-?•=]')
        paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: punct.sub(' ', x))
        # NOTE(review): indentation was lost in the reviewed copy; the
        # curly-quote strip is assumed to belong to the punctuation branch.
        paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('\u201c|\u201d', '', regex=True)
    if rem_copyright:
        copyright_tail = re.compile('©.*')
        paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: copyright_tail.sub('', x))

    #===stopword removal===
    # A set gives O(1) membership tests instead of scanning the list per word.
    stop = set(stopwords.words('english'))
    paper['Abstract_stop'] = paper['Abstract_pre'].apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop))

    #===lemmatize===
    lemmatizer = WordNetLemmatizer()

    def lemmatize_words(text):
        return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)

    #===remove user-specified words===
    # The text is already lower-cased, so lower-case the user's words too;
    # otherwise an entry such as "Library" could never match.
    words_rmv = {word.strip().lower() for word in words_to_remove.split(";")}

    def remove_words(text):
        return ' '.join(word for word in text.split() if word not in words_rmv)

    paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)

    topic_abs = paper['Abstract_lem'].values.tolist()
    return topic_abs, paper
|
166 |
+
|
167 |
+
d1, d2 = st.columns([7,3])
|
168 |
d2.info("Don't do anything during the computing", icon="⚠️")
|
169 |
topic_abs, paper=clean_csv(extype)
|
170 |
|
|
|
331 |
elif method == 'BERTopic':
|
332 |
@st.cache_data(ttl=3600, show_spinner=False)
|
333 |
def bertopic_vis(extype):
|
|
|
|
|
|
|
334 |
umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
|
335 |
min_dist=0.0, metric='cosine', random_state=bert_random_state)
|
336 |
cluster_model = KMeans(n_clusters=num_topic)
|
|
|
345 |
lang = 'multilingual'
|
346 |
topic_model = BERTopic(embedding_model=emb_mod, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, top_n_words=bert_top_n_words)
|
347 |
topics, probs = topic_model.fit_transform(topic_abs)
|
348 |
+
return topic_model, topics, probs
|
349 |
|
350 |
@st.cache_data(ttl=3600, show_spinner=False)
|
351 |
def Vis_Topics(extype):
|
|
|
370 |
|
371 |
@st.cache_data(ttl=3600, show_spinner=False)
def Vis_Barchart(extype):
    """Return the bar-chart figure produced by the enclosing-scope ``topic_model``.

    ``extype`` is not read in the body; ``topic_model`` and ``num_topic``
    are taken from the enclosing scope.
    """
    return topic_model.visualize_barchart(top_n_topics=num_topic)
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
tab1, tab2, tab3 = st.tabs(["π Generate visualization", "π Reference", "π Recommended Reading"])
|
377 |
with tab1:
|
378 |
try:
|
379 |
with st.spinner('Performing computations. Please wait ...'):
|
380 |
|
381 |
+
topic_model, topics, probs = bertopic_vis(extype)
|
382 |
time.sleep(.5)
|
383 |
st.toast('Visualize Topics', icon='π')
|
384 |
fig1 = Vis_Topics(extype)
|
|
|
398 |
time.sleep(.5)
|
399 |
st.toast('Visualize Terms', icon='π')
|
400 |
fig5 = Vis_Barchart(extype)
|
|
|
|
|
|
|
|
|
401 |
|
402 |
with st.expander("Visualize Topics"):
|
403 |
st.write(fig1)
|
|
|
409 |
st.write(fig3)
|
410 |
with st.expander("Visualize Topic Similarity"):
|
411 |
st.write(fig4)
|
412 |
+
|
|
|
|
|
413 |
except ValueError:
|
414 |
st.error('πββοΈ Please raise the number of topics and click submit')
|
415 |
|