Spaces:

faizhalas
/

coconut

Running

App Files Files Community

faizhalas committed on Jul 5, 2023

Commit

35fca6a

•

1 Parent(s): 96e879c

Adding button to prevent crash

Browse files

Files changed (1) hide show

pages/2 Topic Modeling.py +132 -110

pages/2 Topic Modeling.py CHANGED Viewed

@@ -132,49 +132,56 @@ if uploaded_file is not None:
          papers = conv_txt(extype)
     topic_abs, paper=clean_csv(extype)
-    method = st.selectbox(
             'Choose method',
-            ('Choose...', 'pyLDA', 'Biterm','BERTopic'), on_change=reset_all)
     #===topic===
     if method == 'Choose...':
         st.write('')
-    elif method == 'pyLDA':
-         num_topic = st.slider('Choose number of topics', min_value=2, max_value=15, step=1, on_change=reset_all)
-         @st.cache_data(ttl=3600, show_spinner=False)
-         def pylda(extype):
-            topic_abs_LDA = [t.split(' ') for t in topic_abs]
-            id2word = Dictionary(topic_abs_LDA)
-            corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
-            #===LDA===
-            lda_model = LdaModel(corpus=corpus,
-                        id2word=id2word,
-                        num_topics=num_topic,
-                        random_state=0,
-                        chunksize=100,
-                        alpha='auto',
-                        per_word_topics=True)
-            pprint(lda_model.print_topics())
-            doc_lda = lda_model[corpus]
-            #===visualization===
-            coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
-            coherence_lda = coherence_model_lda.get_coherence()
-            vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
-            py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
-            return py_lda_vis_html, coherence_lda
          tab1, tab2, tab3 = st.tabs(["📈 Generate visualization & Calculate coherence", "📃 Reference", "📓 Recommended Reading"])
          with tab1:
          #===visualization===
-             with st.spinner('Calculating and Creating pyLDAvis Visualization ...'):
-              py_lda_vis_html, coherence_lda = pylda(extype)
-              st.write('Coherence: ', (coherence_lda))
-              components.html(py_lda_vis_html, width=1700, height=800)
-              st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
          with tab2:
              st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
@@ -186,10 +193,10 @@ if uploaded_file is not None:
              st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
      #===Biterm===
-    elif method == 'Biterm':
-        num_bitopic = st.slider('Choose number of topics', min_value=2, max_value=20, step=1, on_change=reset_all)
         #===optimize Biterm===
-        @st.cache_data(ttl=3600)
         def biterm_topic(extype):
             X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
             tf = np.array(X.sum(axis=0)).ravel()
@@ -197,7 +204,7 @@ if uploaded_file is not None:
             docs_lens = list(map(len, docs_vec))
             biterms = btm.get_biterms(docs_vec)
             model = btm.BTM(
-              X, vocabulary, seed=12321, T=num_bitopic, M=20, alpha=50/8, beta=0.01)
             model.fit(biterms, iterations=20)
             p_zd = model.transform(docs_vec)
             coherence = model.coherence_
@@ -206,84 +213,85 @@ if uploaded_file is not None:
             totaltop = topics_coords.label.values.tolist()
             return topics_coords, phi, totaltop
-        try:
-          topics_coords, phi, totaltop = biterm_topic(extype)
-          #with st.spinner('Visualizing, please wait ....'):
-          tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
-          with tab1:
-            col1, col2 = st.columns(2)
-            @st.cache_data(ttl=3600)
-            def biterm_map(extype):
-              btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
-              return btmvis_coords
-            @st.cache_data(ttl=3600)
-            def biterm_bar(extype):
-              terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
-              btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
-              return btmvis_probs
-            with col1:
-              numvis = st.selectbox(
-                'Choose topic',
-                (totaltop), on_change=reset_biterm)
-              btmvis_coords = biterm_map(extype)
-              st.altair_chart(btmvis_coords, use_container_width=True)
-            with col2:
-              btmvis_probs = biterm_bar(extype)
-              st.altair_chart(btmvis_probs, use_container_width=True)
-          with tab2:
             st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
-          with tab3:
             st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
             st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
             st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
             st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
-        except ValueError:
-          st.error('Please raise the number of topics')
      #===BERTopic===
     elif method == 'BERTopic':
-        num_btopic = st.slider('Choose number of topics', min_value=4, max_value=20, step=1, on_change=reset_all)
-        @st.cache_data(ttl=3600)
         def bertopic_vis(extype):
           topic_time = paper.Year.values.tolist()
-          cluster_model = KMeans(n_clusters=num_btopic)
           nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
           topic_model = BERTopic(embedding_model=nlp, hdbscan_model=cluster_model, language="multilingual").fit(topic_abs)
           topics, probs = topic_model.fit_transform(topic_abs)
           return topic_model, topic_time, topics, probs
-        @st.cache_data(ttl=3600)
         def Vis_Topics(extype):
           fig1 = topic_model.visualize_topics()
           return fig1
-        @st.cache_data(ttl=3600)
         def Vis_Documents(extype):
           fig2 = topic_model.visualize_documents(topic_abs)
           return fig2
-        @st.cache_data(ttl=3600)
         def Vis_Hierarchy(extype):
-          fig3 = topic_model.visualize_hierarchy(top_n_topics=num_btopic)
           return fig3
-        @st.cache_data(ttl=3600)
         def Vis_Heatmap(extype):
           global topic_model
-          fig4 = topic_model.visualize_heatmap(n_clusters=num_btopic-1, width=1000, height=1000)
           return fig4
-        @st.cache_data(ttl=3600)
         def Vis_Barchart(extype):
-          fig5 = topic_model.visualize_barchart(top_n_topics=num_btopic, n_words=10)
           return fig5
-        @st.cache_data(ttl=3600)
         def Vis_ToT(extype):
           topics_over_time = topic_model.topics_over_time(topic_abs, topic_time)
           fig6 = topic_model.visualize_topics_over_time(topics_over_time)
@@ -291,35 +299,49 @@ if uploaded_file is not None:
         tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
         with tab1:
-          topic_model, topic_time, topics, probs = bertopic_vis(extype)
-          #===visualization===
-          viz = st.selectbox(
-            'Choose visualization',
-            ('Visualize Topics', 'Visualize Documents', 'Visualize Document Hierarchy', 'Visualize Topic Similarity', 'Visualize Terms', 'Visualize Topics over Time'))
-          if viz == 'Visualize Topics':
-                 fig1 = Vis_Topics(extype)
-                 st.write(fig1)
-          elif viz == 'Visualize Documents':
-                 fig2 = Vis_Documents(extype)
-                 st.write(fig2)
-          elif viz == 'Visualize Document Hierarchy':
-                 fig3 = Vis_Hierarchy(extype)
-                 st.write(fig3)
-          elif viz == 'Visualize Topic Similarity':
-                 fig4 = Vis_Heatmap(extype)
-                 st.write(fig4)
-          elif viz == 'Visualize Terms':
-                 fig5 = Vis_Barchart(extype)
-                 st.write(fig5)
-          elif viz == 'Visualize Topics over Time':
-                 fig6 = Vis_ToT(extype)
-                 st.write(fig6)
         with tab2:
           st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')

          papers = conv_txt(extype)
     topic_abs, paper=clean_csv(extype)
+    c1, c2 = st.columns([5,5])
     # c1 holds the method picker and a warning; c2 holds the topic-count input and Submit.
+    method = c1.selectbox(
             'Choose method',
+            ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'), on_change=reset_all)
+    c1.info("Don't do anything during the computing", icon="⚠️")
+    num_cho = c2.number_input('Choose number of topics', min_value=2, max_value=30, value=2)
     # num_topic is assigned only inside this branch. When the branch is not taken the
     # name is not bound by this section, and the tab code further down reports that
     # case through its `except NameError` handlers ('Please click Submit').
     # NOTE(review): presumably the button is truthy only on the run triggered by the
     # click, so other widget interactions would leave num_topic unset -- confirm.
+    if c2.button("Submit", on_click=reset_all):
+         num_topic = num_cho
     #===topic===
     if method == 'Choose...':
         st.write('')
+    elif method == 'pyLDA':
          tab1, tab2, tab3 = st.tabs(["📈 Generate visualization & Calculate coherence", "📃 Reference", "📓 Recommended Reading"])
          with tab1:
          #===visualization===
+              @st.cache_data(ttl=3600, show_spinner=False)
+              def pylda(extype):
                  # Trains an LdaModel on topic_abs and returns a pair:
                  # (pyLDAvis HTML string, c_v coherence value).
                  # topic_abs and num_topic are read from the enclosing script scope;
                  # extype is the only declared argument and is not referenced in the body.
+                 topic_abs_LDA = [t.split(' ') for t in topic_abs]
+                 id2word = Dictionary(topic_abs_LDA)
+                 corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
+                 #===LDA===
+                 lda_model = LdaModel(corpus=corpus,
+                             id2word=id2word,
+                             num_topics=num_topic,
+                             random_state=0,
+                             chunksize=100,
+                             alpha='auto',
+                             per_word_topics=True)
+                 pprint(lda_model.print_topics())
                  # NOTE(review): doc_lda is not used again inside this function.
+                 doc_lda = lda_model[corpus]
+                 #===visualization===
+                 coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
+                 coherence_lda = coherence_model_lda.get_coherence()
+                 vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
+                 py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
+                 return py_lda_vis_html, coherence_lda
+              with st.spinner('Performing computations. Please wait ...'):
+                   try:
+                        py_lda_vis_html, coherence_lda = pylda(extype)
+                        st.write('Coherence: ', (coherence_lda))
+                        components.html(py_lda_vis_html, width=1700, height=800)
+                        st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
+                   except NameError:
+                        st.error('🖱️ Please click Submit')
          with tab2:
              st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
              st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
      #===Biterm===
+    elif method == 'Biterm':
         #===optimize Biterm===
+        @st.cache_data(ttl=3600, show_spinner=False)
         def biterm_topic(extype):
             X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs)
             tf = np.array(X.sum(axis=0)).ravel()
             docs_lens = list(map(len, docs_vec))
             biterms = btm.get_biterms(docs_vec)
             model = btm.BTM(
+              X, vocabulary, seed=12321, T=num_topic, M=20, alpha=50/8, beta=0.01)
             model.fit(biterms, iterations=20)
             p_zd = model.transform(docs_vec)
             coherence = model.coherence_
             totaltop = topics_coords.label.values.tolist()
             return topics_coords, phi, totaltop
+        tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
+        with tab1:
+             try:
+               with st.spinner('Performing computations. Please wait ...'):
+                    topics_coords, phi, totaltop = biterm_topic(extype)
+                    col1, col2 = st.columns([4,6])
+                    @st.cache_data(ttl=3600)
+                    def biterm_map(extype):
                          # Returns the chart built by tmp.plot_scatter_topics for topics_coords,
                          # passing numvis as the `topic` argument. topics_coords and numvis are
                          # read from the enclosing scope; numvis is assigned by the
                          # 'Choose topic' selectbox below, before this function is called.
+                         btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
+                         return btmvis_coords
+                    @st.cache_data(ttl=3600)
+                    def biterm_bar(extype):
                          # Returns the chart built by tmp.plot_terms from the term
                          # probabilities computed for topic numvis (phi and numvis are read
                          # from the enclosing scope; extype is not referenced in the body).
+                         terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
+                         btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
+                         return btmvis_probs
+                    with col1:
+                         numvis = st.selectbox(
+                              'Choose topic',
+                              (totaltop), on_change=reset_biterm)
+                         btmvis_coords = biterm_map(extype)
+                         st.altair_chart(btmvis_coords)
+                    with col2:
+                         btmvis_probs = biterm_bar(extype)
+                         st.altair_chart(btmvis_probs, use_container_width=True)
+             except ValueError:
+                   st.error('🙇‍♂️ Please raise the number of topics and click submit')
+             except NameError:
+                   st.error('🖱️ Please click Submit')
+        with tab2:
             st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
+        with tab3:
             st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
             st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
             st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
             st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
      #===BERTopic===
     elif method == 'BERTopic':
+        @st.cache_data(ttl=3600, show_spinner=False)
         def bertopic_vis(extype):
           # Builds a BERTopic model over topic_abs, passing a KMeans(n_clusters=num_topic)
           # object as hdbscan_model, and returns (topic_model, topic_time, topics, probs).
           # paper, topic_abs and num_topic are read from the enclosing script scope;
           # extype is not referenced in the body.
           topic_time = paper.Year.values.tolist()
+          cluster_model = KMeans(n_clusters=num_topic)
           nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
           topic_model = BERTopic(embedding_model=nlp, hdbscan_model=cluster_model, language="multilingual").fit(topic_abs)
           # NOTE(review): the model was already fitted by .fit(topic_abs) on the previous
           # line and fit_transform(topic_abs) is now called on the same input -- this
           # looks like the fit runs twice; confirm whether the first .fit() can be dropped.
           topics, probs = topic_model.fit_transform(topic_abs)
           return topic_model, topic_time, topics, probs
+        @st.cache_data(ttl=3600, show_spinner=False)
         def Vis_Topics(extype):
           # Each Vis_* helper in this group returns one figure produced by a
           # topic_model.visualize_* call. topic_model (and num_topic / topic_abs where
           # used) are read from the enclosing scope; extype is not referenced in the bodies.
           fig1 = topic_model.visualize_topics()
           return fig1
+        @st.cache_data(ttl=3600, show_spinner=False)
         def Vis_Documents(extype):
           fig2 = topic_model.visualize_documents(topic_abs)
           return fig2
+        @st.cache_data(ttl=3600, show_spinner=False)
         def Vis_Hierarchy(extype):
+          fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic)
           return fig3
+        @st.cache_data(ttl=3600, show_spinner=False)
         def Vis_Heatmap(extype):
           # NOTE(review): topic_model is only read here, never assigned, so the
           # `global` declaration appears unnecessary -- confirm before removing.
           global topic_model
+          fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000)
           return fig4
+        @st.cache_data(ttl=3600, show_spinner=False)
         def Vis_Barchart(extype):
+          fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, n_words=10)
           return fig5
+        @st.cache_data(ttl=3600, show_spinner=False)
         def Vis_ToT(extype):
           topics_over_time = topic_model.topics_over_time(topic_abs, topic_time)
           fig6 = topic_model.visualize_topics_over_time(topics_over_time)
         tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading"])
         with tab1:
+          try:
+               with st.spinner('Performing computations. Please wait ...'):
+                    topic_model, topic_time, topics, probs = bertopic_vis(extype)
+                    #===visualization===
+                    viz = st.selectbox(
+                      'Choose visualization',
+                      ('Visualize Topics', 'Visualize Documents', 'Visualize Document Hierarchy', 'Visualize Topic Similarity', 'Visualize Terms', 'Visualize Topics over Time'))
+                    if viz == 'Visualize Topics':
+                           with st.spinner('Performing computations. Please wait ...'):
+                                fig1 = Vis_Topics(extype)
+                                st.write(fig1)
+                    elif viz == 'Visualize Documents':
+                           with st.spinner('Performing computations. Please wait ...'):
+                                fig2 = Vis_Documents(extype)
+                                st.write(fig2)
+                    elif viz == 'Visualize Document Hierarchy':
+                           with st.spinner('Performing computations. Please wait ...'):
+                                fig3 = Vis_Hierarchy(extype)
+                                st.write(fig3)
+                    elif viz == 'Visualize Topic Similarity':
+                           with st.spinner('Performing computations. Please wait ...'):
+                                fig4 = Vis_Heatmap(extype)
+                                st.write(fig4)
+                    elif viz == 'Visualize Terms':
+                           with st.spinner('Performing computations. Please wait ...'):
+                                fig5 = Vis_Barchart(extype)
+                                st.write(fig5)
+                    elif viz == 'Visualize Topics over Time':
+                           with st.spinner('Performing computations. Please wait ...'):
+                                fig6 = Vis_ToT(extype)
+                                st.write(fig6)
+          except ValueError:
+               st.error('🙇‍♂️ Please raise the number of topics and click submit')
+          except NameError:
+               st.error('🖱️ Please click Submit')
         with tab2:
           st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')