Spaces:

jfataphd
/

OncoDigger

Sleeping

App Files Files Community

jfataphd committed on Apr 20, 2023

Commit

7978486

•

1 Parent(s): e19aaab

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -59

app.py CHANGED Viewed

@@ -13,17 +13,10 @@ import urllib.request
 import random
 import plotly.express as px
-st.set_page_config(
-    page_title="Abstractalytics",
-                page_icon=":microscope:",
-                layout="wide", #centered
-                initial_sidebar_state="auto",
-                menu_items={
-                    'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
-                             " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"
-                }
-                )
 # Define the HTML and CSS styles
 st.markdown("""
@@ -50,24 +43,28 @@ st.markdown("""
 st.header(":red[*Abstractalytics*]")
-st.subheader("*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
-             "by Natural Language Processing (NLP) techniques.*")
 def custom_subheader(text, identifier, font_size):
     """Emit ``text`` as an ``<h3>`` element through ``st.markdown``.
     ``identifier`` is written into the element's ``id`` attribute and
     ``font_size`` into an inline ``font-size`` style (in ``px``). All three
     values are interpolated as-is and raw HTML rendering is switched on.
     """
     heading_html = f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>"
     st.markdown(heading_html, unsafe_allow_html=True)
 custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
-             "within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
-             "you wish to explore within the corpus. Abstractalytics powerful Natural Language "
-             "Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
-             "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
-             "to your input. This advanced text-mining technique enables you to explore and understand complex "
-             "relationships, uncovering new discoveries and connections in your field of research across a massive "
-             "amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.", "unique-id", 18)
 st.markdown("---")
-#Define the correct password
 # CORRECT_PASSWORD = "123"
 # Define a function to check if the password is correct
@@ -82,9 +79,7 @@ st.markdown("---")
 #
 # # If the password is correct, show the app content
 # if authenticate(password):
-opt = st.sidebar.radio("Select a PubMed Corpus",
-                           options=(
-                                    'Breast Cancer corpus', 'Lung Cancer corpus'))
 # if opt == "Clotting corpus":
 #     model_used = ("pubmed_model_clotting")
 #     num_abstracts = 45493
@@ -101,6 +96,14 @@ if opt == "Lung Cancer corpus":
     model_used = ("lung_cancer_pubmed_model")
     num_abstracts = 143886
     database_name = "Lung_cancer"
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
@@ -115,7 +118,8 @@ query = re.sub(" ", "-", query)
 if query:
     bar = st.progress(0)
     time.sleep(.05)
-    st.caption(f"Searching {num_abstracts} {database_name} PubMed primary abstracts covering 1990-2022 (Reviews not included)")
     for i in range(10):
         bar.progress((i + 1) * 10)
@@ -130,6 +134,7 @@ if query:
     # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
     df = pd.DataFrame(X)
     def get_compound_ids(compound_names):
         with concurrent.futures.ThreadPoolExecutor() as executor:
             compound_ids = list(executor.map(get_compound_id, compound_names))
@@ -149,6 +154,7 @@ if query:
                 return compound_id
         return None
     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
@@ -203,12 +209,11 @@ if query:
         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         fig.update_annotations(visible=False)
         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                              texttemplate="<br><span "
-                                           "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
-                                           "<a href='%{customdata[0]}'>PubMed"
-                                           "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
-                                           "</span></a>")
         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
         # st.pyplot(fig2)
@@ -220,10 +225,10 @@ if query:
         csv = table2.head(value_word).to_csv().encode('utf-8')
         st.download_button(label=f"download top {value_word} words (csv)", data=csv,
-                               file_name=f'{database_name}_words.csv', mime='text/csv')
     except:
         st.warning(
-                f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
     # st.markdown("---")
     # # st.write(short_table)
@@ -373,7 +378,7 @@ if query:
         # Define the `text` column for labels and `href` column for links
         df11['text'] = df11.index
         df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
         df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
         assert isinstance(df11, object)
         df11['database'] = database_name
@@ -382,17 +387,17 @@ if query:
         # Create the treemap using `px.treemap`
         fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
-                             hover_name=(df4.head(value_gene)['SIMILARITY']))
         fig.update(layout_coloraxis_showscale=False)
         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         fig.update_annotations(visible=False)
         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
-                                           "<a href='%{customdata[0]}'>PubMed"
-                                           "</a><br><br><a href='%{customdata[2]}'>GeneCard"
-                                           "</span></a>")
         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
         # # display the treemap in Streamlit
         # with treemap2:
@@ -403,18 +408,19 @@ if query:
         # st.caption(
         #         "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
         # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
-        st.caption("Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
         st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
         st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
         csv = df1.head(value_gene).to_csv().encode('utf-8')
         st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
-                               file_name=f'{database_name}_genes.csv', mime='text/csv')
     else:
         st.warning(
-                f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     # print()
     # print("Human genes similar to " + str(query))
@@ -477,7 +483,7 @@ if query:
         df13.set_index('Drugs', inplace=True)
         df13['text'] = df13.index
         df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
         df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
         assert isinstance(df13, object)
         df13['database'] = database_name
@@ -486,17 +492,17 @@ if query:
         # Create the treemap using `px.treemap`
         fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
-                             hover_name=(df6.head(value_drug)['SIMILARITY']))
         fig.update(layout_coloraxis_showscale=False)
         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         fig.update_annotations(visible=False)
         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
-                                           "<a href='%{customdata[0]}'>PubMed"
-                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
-                                           "</span></a>")
         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
         # # display the treemap in Streamlit
         # with treemap2:
@@ -504,17 +510,16 @@ if query:
         # st.pyplot(fig2)
         st.plotly_chart(fig, use_container_width=True)
-        st.caption(
-                "Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
         csv = df1.head(value_drug).to_csv().encode('utf-8')
         st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
-                               file_name=f'{database_name}_drugs.csv', mime='text/csv')
     else:
         st.warning(
-                f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     #
     # st.markdown("---")
@@ -926,9 +931,8 @@ if query:
             f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     # import os
     # from datasets import Dataset
     # # Check if the comments directory exists
@@ -955,9 +959,6 @@ if query:
     #     print('Comment saved to dataset.')
     # st.title("Abstractalytics Web App")
     # st.write("We appreciate your feedback!")

 import random
 import plotly.express as px
+st.set_page_config(page_title="Abstractalytics", page_icon=":microscope:", layout="wide",  # centered
+                   initial_sidebar_state="auto",
+                   menu_items={'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
+                         " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"})
 # Define the HTML and CSS styles
 st.markdown("""
 st.header(":red[*Abstractalytics*]")
+st.subheader(
+    "*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
+    "by Natural Language Processing (NLP) techniques.*")
 def custom_subheader(text, identifier, font_size):
     """Show ``text`` as a level-3 HTML heading by way of ``st.markdown``.
     The heading's ``id`` attribute is set from ``identifier`` and its inline
     ``font-size`` (in ``px``) from ``font_size``; the arguments are inserted
     unmodified and ``unsafe_allow_html=True`` is passed so the tag renders.
     """
     markup = f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>"
     st.markdown(markup, unsafe_allow_html=True)
 custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
+                 "within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
+                 "you wish to explore within the corpus. Abstractalytics powerful Natural Language "
+                 "Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
+                 "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
+                 "to your input. This advanced text-mining technique enables you to explore and understand complex "
+                 "relationships, uncovering new discoveries and connections in your field of research across a massive "
+                 "amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.",
+                 "unique-id", 18)
 st.markdown("---")
+# Define the correct password
 # CORRECT_PASSWORD = "123"
 # Define a function to check if the password is correct
 #
 # # If the password is correct, show the app content
 # if authenticate(password):
+opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus', 'Prostate Cancer corpus'))
 # if opt == "Clotting corpus":
 #     model_used = ("pubmed_model_clotting")
 #     num_abstracts = 45493
     model_used = ("lung_cancer_pubmed_model")
     num_abstracts = 143886
     database_name = "Lung_cancer"
+if opt == "Breast Cancer corpus":
+    model_used = ("pubmed_model_breast_cancer2")
+    num_abstracts = 204381
+    database_name = "Breast_cancer"
+if opt == "Prostate Cancer corpus":
+    model_used = ("prostate_cancer_pubmed_model")
+    num_abstracts = 89000
+    database_name = "Prostate_cancer"
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 if query:
     bar = st.progress(0)
     time.sleep(.05)
+    st.caption(
+        f"Searching {num_abstracts} {database_name} PubMed primary abstracts covering 1990-2022 (Reviews not included)")
     for i in range(10):
         bar.progress((i + 1) * 10)
     # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
     df = pd.DataFrame(X)
     def get_compound_ids(compound_names):
         with concurrent.futures.ThreadPoolExecutor() as executor:
             compound_ids = list(executor.map(get_compound_id, compound_names))
                 return compound_id
         return None
     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         fig.update_annotations(visible=False)
         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
+                                                                                                         "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
+                                                                                                         "<a href='%{customdata[0]}'>PubMed"
+                                                                                                         "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
+                                                                                                         "</span></a>")
         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
         # st.pyplot(fig2)
         csv = table2.head(value_word).to_csv().encode('utf-8')
         st.download_button(label=f"download top {value_word} words (csv)", data=csv,
+                           file_name=f'{database_name}_words.csv', mime='text/csv')
     except:
         st.warning(
+            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
     # st.markdown("---")
     # # st.write(short_table)
         # Define the `text` column for labels and `href` column for links
         df11['text'] = df11.index
         df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
         df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
         assert isinstance(df11, object)
         df11['database'] = database_name
         # Create the treemap using `px.treemap`
         fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df4.head(value_gene)['SIMILARITY']))
         fig.update(layout_coloraxis_showscale=False)
         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         fig.update_annotations(visible=False)
         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>GeneCard"
+                                       "</span></a>")
         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
         # # display the treemap in Streamlit
         # with treemap2:
         # st.caption(
         #         "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
         # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+        st.caption(
+            "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
         st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
         st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
         csv = df1.head(value_gene).to_csv().encode('utf-8')
         st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
+                           file_name=f'{database_name}_genes.csv', mime='text/csv')
     else:
         st.warning(
+            f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     # print()
     # print("Human genes similar to " + str(query))
         df13.set_index('Drugs', inplace=True)
         df13['text'] = df13.index
         df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
         df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
         assert isinstance(df13, object)
         df13['database'] = database_name
         # Create the treemap using `px.treemap`
         fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df6.head(value_drug)['SIMILARITY']))
         fig.update(layout_coloraxis_showscale=False)
         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
         fig.update_annotations(visible=False)
         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</span></a>")
         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
         # # display the treemap in Streamlit
         # with treemap2:
         # st.pyplot(fig2)
         st.plotly_chart(fig, use_container_width=True)
+        st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
         csv = df1.head(value_drug).to_csv().encode('utf-8')
         st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
+                           file_name=f'{database_name}_drugs.csv', mime='text/csv')
     else:
         st.warning(
+            f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     #
     # st.markdown("---")
             f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     # import os
     # from datasets import Dataset
     # # Check if the comments directory exists
     #     print('Comment saved to dataset.')
     # st.title("Abstractalytics Web App")
     # st.write("We appreciate your feedback!")