jfataphd committed on
Commit
7978486
1 Parent(s): e19aaab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -59
app.py CHANGED
@@ -13,17 +13,10 @@ import urllib.request
13
  import random
14
  import plotly.express as px
15
 
16
-
17
- st.set_page_config(
18
- page_title="Abstractalytics",
19
- page_icon=":microscope:",
20
- layout="wide", #centered
21
- initial_sidebar_state="auto",
22
- menu_items={
23
- 'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
24
- " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"
25
- }
26
- )
27
 
28
  # Define the HTML and CSS styles
29
  st.markdown("""
@@ -50,24 +43,28 @@ st.markdown("""
50
 
51
  st.header(":red[*Abstractalytics*]")
52
 
53
- st.subheader("*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
54
- "by Natural Language Processing (NLP) techniques.*")
 
 
55
 
56
  def custom_subheader(text, identifier, font_size):
57
  st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)
58
 
 
59
  custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
60
- "within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
61
- "you wish to explore within the corpus. Abstractalytics powerful Natural Language "
62
- "Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
63
- "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
64
- "to your input. This advanced text-mining technique enables you to explore and understand complex "
65
- "relationships, uncovering new discoveries and connections in your field of research across a massive "
66
- "amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.", "unique-id", 18)
 
67
 
68
  st.markdown("---")
69
 
70
- #Define the correct password
71
  # CORRECT_PASSWORD = "123"
72
 
73
  # Define a function to check if the password is correct
@@ -82,9 +79,7 @@ st.markdown("---")
82
  #
83
  # # If the password is correct, show the app content
84
  # if authenticate(password):
85
- opt = st.sidebar.radio("Select a PubMed Corpus",
86
- options=(
87
- 'Breast Cancer corpus', 'Lung Cancer corpus'))
88
  # if opt == "Clotting corpus":
89
  # model_used = ("pubmed_model_clotting")
90
  # num_abstracts = 45493
@@ -101,6 +96,14 @@ if opt == "Lung Cancer corpus":
101
  model_used = ("lung_cancer_pubmed_model")
102
  num_abstracts = 143886
103
  database_name = "Lung_cancer"
 
 
 
 
 
 
 
 
104
 
105
  st.header(f":blue[{database_name} Pubmed corpus.]")
106
  text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
@@ -115,7 +118,8 @@ query = re.sub(" ", "-", query)
115
  if query:
116
  bar = st.progress(0)
117
  time.sleep(.05)
118
- st.caption(f"Searching {num_abstracts} {database_name} PubMed primary abstracts covering 1990-2022 (Reviews not included)")
 
119
 
120
  for i in range(10):
121
  bar.progress((i + 1) * 10)
@@ -130,6 +134,7 @@ if query:
130
  # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
131
  df = pd.DataFrame(X)
132
 
 
133
  def get_compound_ids(compound_names):
134
  with concurrent.futures.ThreadPoolExecutor() as executor:
135
  compound_ids = list(executor.map(get_compound_id, compound_names))
@@ -149,6 +154,7 @@ if query:
149
  return compound_id
150
  return None
151
 
 
152
  # except:
153
  # st.error("Term occurrence is too low - please try another term")
154
  # st.stop()
@@ -203,12 +209,11 @@ if query:
203
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
204
  fig.update_annotations(visible=False)
205
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
206
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
207
- texttemplate="<br><span "
208
- "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
209
- "<a href='%{customdata[0]}'>PubMed"
210
- "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
211
- "</span></a>")
212
  fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
213
 
214
  # st.pyplot(fig2)
@@ -220,10 +225,10 @@ if query:
220
 
221
  csv = table2.head(value_word).to_csv().encode('utf-8')
222
  st.download_button(label=f"download top {value_word} words (csv)", data=csv,
223
- file_name=f'{database_name}_words.csv', mime='text/csv')
224
  except:
225
  st.warning(
226
- f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
227
 
228
  # st.markdown("---")
229
  # # st.write(short_table)
@@ -373,7 +378,7 @@ if query:
373
  # Define the `text` column for labels and `href` column for links
374
  df11['text'] = df11.index
375
  df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
376
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
377
  df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
378
  assert isinstance(df11, object)
379
  df11['database'] = database_name
@@ -382,17 +387,17 @@ if query:
382
 
383
  # Create the treemap using `px.treemap`
384
  fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
385
- hover_name=(df4.head(value_gene)['SIMILARITY']))
386
 
387
  fig.update(layout_coloraxis_showscale=False)
388
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
389
  fig.update_annotations(visible=False)
390
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
391
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
392
- texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
393
- "<a href='%{customdata[0]}'>PubMed"
394
- "</a><br><br><a href='%{customdata[2]}'>GeneCard"
395
- "</span></a>")
396
  fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
397
  # # display the treemap in Streamlit
398
  # with treemap2:
@@ -403,18 +408,19 @@ if query:
403
  # st.caption(
404
  # "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
405
  # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
406
- st.caption("Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
 
407
  st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
408
  st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
409
 
410
  csv = df1.head(value_gene).to_csv().encode('utf-8')
411
  st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
412
- file_name=f'{database_name}_genes.csv', mime='text/csv')
413
 
414
 
415
  else:
416
  st.warning(
417
- f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
418
  st.markdown("---")
419
  # print()
420
  # print("Human genes similar to " + str(query))
@@ -477,7 +483,7 @@ if query:
477
  df13.set_index('Drugs', inplace=True)
478
  df13['text'] = df13.index
479
  df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
480
- '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
481
  df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
482
  assert isinstance(df13, object)
483
  df13['database'] = database_name
@@ -486,17 +492,17 @@ if query:
486
 
487
  # Create the treemap using `px.treemap`
488
  fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
489
- hover_name=(df6.head(value_drug)['SIMILARITY']))
490
 
491
  fig.update(layout_coloraxis_showscale=False)
492
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
493
  fig.update_annotations(visible=False)
494
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
495
- hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
496
- texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
497
- "<a href='%{customdata[0]}'>PubMed"
498
- "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
499
- "</span></a>")
500
  fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
501
  # # display the treemap in Streamlit
502
  # with treemap2:
@@ -504,17 +510,16 @@ if query:
504
  # st.pyplot(fig2)
505
  st.plotly_chart(fig, use_container_width=True)
506
 
507
- st.caption(
508
- "Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
509
 
510
  csv = df1.head(value_drug).to_csv().encode('utf-8')
511
  st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
512
- file_name=f'{database_name}_drugs.csv', mime='text/csv')
513
 
514
 
515
  else:
516
  st.warning(
517
- f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
518
  st.markdown("---")
519
  #
520
  # st.markdown("---")
@@ -926,9 +931,8 @@ if query:
926
  f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
927
  st.markdown("---")
928
 
929
-
930
  # import os
931
-
932
  # from datasets import Dataset
933
 
934
  # # Check if the comments directory exists
@@ -955,9 +959,6 @@ if query:
955
 
956
  # print('Comment saved to dataset.')
957
 
958
-
959
-
960
-
961
  # st.title("Abstractalytics Web App")
962
  # st.write("We appreciate your feedback!")
963
 
 
13
  import random
14
  import plotly.express as px
15
 
16
+ st.set_page_config(page_title="Abstractalytics", page_icon=":microscope:", layout="wide", # centered
17
+ initial_sidebar_state="auto",
18
+ menu_items={'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
19
+ " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"})
 
 
 
 
 
 
 
20
 
21
  # Define the HTML and CSS styles
22
  st.markdown("""
 
43
 
44
  st.header(":red[*Abstractalytics*]")
45
 
46
+ st.subheader(
47
+ "*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
48
+ "by Natural Language Processing (NLP) techniques.*")
49
+
50
 
51
  def custom_subheader(text, identifier, font_size):
52
  st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)
53
 
54
+
55
  custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
56
+ "within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
57
+ "you wish to explore within the corpus. Abstractalytics powerful Natural Language "
58
+ "Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
59
+ "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
60
+ "to your input. This advanced text-mining technique enables you to explore and understand complex "
61
+ "relationships, uncovering new discoveries and connections in your field of research across a massive "
62
+ "amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.",
63
+ "unique-id", 18)
64
 
65
  st.markdown("---")
66
 
67
+ # Define the correct password
68
  # CORRECT_PASSWORD = "123"
69
 
70
  # Define a function to check if the password is correct
 
79
  #
80
  # # If the password is correct, show the app content
81
  # if authenticate(password):
82
+ opt = st.sidebar.radio("Select a PubMed Corpus", options=('Breast Cancer corpus', 'Lung Cancer corpus', 'Prostate Cancer corpus'))
 
 
83
  # if opt == "Clotting corpus":
84
  # model_used = ("pubmed_model_clotting")
85
  # num_abstracts = 45493
 
96
  model_used = ("lung_cancer_pubmed_model")
97
  num_abstracts = 143886
98
  database_name = "Lung_cancer"
99
+ if opt == "Breast Cancer corpus":
100
+ model_used = ("pubmed_model_breast_cancer2")
101
+ num_abstracts = 204381
102
+ database_name = "Breast_cancer"
103
+ if opt == "Prostate Cancer corpus":
104
+ model_used = ("prostate_cancer_pubmed_model")
105
+ num_abstracts = 89000
106
+ database_name = "Prostate_cancer"
107
 
108
  st.header(f":blue[{database_name} Pubmed corpus.]")
109
  text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 
118
  if query:
119
  bar = st.progress(0)
120
  time.sleep(.05)
121
+ st.caption(
122
+ f"Searching {num_abstracts} {database_name} PubMed primary abstracts covering 1990-2022 (Reviews not included)")
123
 
124
  for i in range(10):
125
  bar.progress((i + 1) * 10)
 
134
  # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
135
  df = pd.DataFrame(X)
136
 
137
+
138
  def get_compound_ids(compound_names):
139
  with concurrent.futures.ThreadPoolExecutor() as executor:
140
  compound_ids = list(executor.map(get_compound_id, compound_names))
 
154
  return compound_id
155
  return None
156
 
157
+
158
  # except:
159
  # st.error("Term occurrence is too low - please try another term")
160
  # st.stop()
 
209
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
210
  fig.update_annotations(visible=False)
211
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
212
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000", texttemplate="<br><span "
213
+ "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
214
+ "<a href='%{customdata[0]}'>PubMed"
215
+ "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
216
+ "</span></a>")
 
217
  fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
218
 
219
  # st.pyplot(fig2)
 
225
 
226
  csv = table2.head(value_word).to_csv().encode('utf-8')
227
  st.download_button(label=f"download top {value_word} words (csv)", data=csv,
228
+ file_name=f'{database_name}_words.csv', mime='text/csv')
229
  except:
230
  st.warning(
231
+ f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
232
 
233
  # st.markdown("---")
234
  # # st.write(short_table)
 
378
  # Define the `text` column for labels and `href` column for links
379
  df11['text'] = df11.index
380
  df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
381
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
382
  df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
383
  assert isinstance(df11, object)
384
  df11['database'] = database_name
 
387
 
388
  # Create the treemap using `px.treemap`
389
  fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
390
+ hover_name=(df4.head(value_gene)['SIMILARITY']))
391
 
392
  fig.update(layout_coloraxis_showscale=False)
393
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
394
  fig.update_annotations(visible=False)
395
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
396
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
397
+ texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
398
+ "<a href='%{customdata[0]}'>PubMed"
399
+ "</a><br><br><a href='%{customdata[2]}'>GeneCard"
400
+ "</span></a>")
401
  fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
402
  # # display the treemap in Streamlit
403
  # with treemap2:
 
408
  # st.caption(
409
  # "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
410
  # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
411
+ st.caption(
412
+ "Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
413
  st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
414
  st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
415
 
416
  csv = df1.head(value_gene).to_csv().encode('utf-8')
417
  st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
418
+ file_name=f'{database_name}_genes.csv', mime='text/csv')
419
 
420
 
421
  else:
422
  st.warning(
423
+ f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
424
  st.markdown("---")
425
  # print()
426
  # print("Human genes similar to " + str(query))
 
483
  df13.set_index('Drugs', inplace=True)
484
  df13['text'] = df13.index
485
  df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
486
+ '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
487
  df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
488
  assert isinstance(df13, object)
489
  df13['database'] = database_name
 
492
 
493
  # Create the treemap using `px.treemap`
494
  fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
495
+ hover_name=(df6.head(value_drug)['SIMILARITY']))
496
 
497
  fig.update(layout_coloraxis_showscale=False)
498
  fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
499
  fig.update_annotations(visible=False)
500
  fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
501
+ hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
502
+ texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
503
+ "<a href='%{customdata[0]}'>PubMed"
504
+ "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
505
+ "</span></a>")
506
  fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
507
  # # display the treemap in Streamlit
508
  # with treemap2:
 
510
  # st.pyplot(fig2)
511
  st.plotly_chart(fig, use_container_width=True)
512
 
513
+ st.caption("Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
 
514
 
515
  csv = df1.head(value_drug).to_csv().encode('utf-8')
516
  st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
517
+ file_name=f'{database_name}_drugs.csv', mime='text/csv')
518
 
519
 
520
  else:
521
  st.warning(
522
+ f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
523
  st.markdown("---")
524
  #
525
  # st.markdown("---")
 
931
  f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
932
  st.markdown("---")
933
 
 
934
  # import os
935
+
936
  # from datasets import Dataset
937
 
938
  # # Check if the comments directory exists
 
959
 
960
  # print('Comment saved to dataset.')
961
 
 
 
 
962
  # st.title("Abstractalytics Web App")
963
  # st.write("We appreciate your feedback!")
964