"""Streamlit app that explores a Word2Vec model trained on clotting abstracts.

The user enters a single term. The app loads the saved Word2Vec model, ranks
the vocabulary by multiplicative cosine similarity to that term, and shows two
treemaps: the ten most similar words, and the ten most similar words that also
appear in the ``symbol`` column of ``Human_Genes.csv``. The top 100 rows of
each ranking can be downloaded as CSV.
"""
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import squarify
import streamlit as st
from gensim.models import Word2Vec

MODEL_PATH = "pubmed_model_clotting"
GENES_CSV_PATH = "Human_Genes.csv"
BACKGROUND_COLOR = "#EBF5FB"
TREEMAP_ROWS = 10
DOWNLOAD_ROWS = 100


def _inverse_rank_sizes(ranks):
    """Return ``1 / (rank + 1)`` for each zero-based rank.

    The ``+ 1`` shift keeps the best hit (rank 0) from dividing by zero.
    """
    return [1.0 / (rank + 1) for rank in ranks]


def _to_csv_bytes(frame):
    """Serialise *frame* (index included) to UTF-8 encoded CSV bytes.

    ``st.download_button`` accepts str/bytes/file-like data, not a DataFrame.
    """
    return frame.to_csv().encode("utf-8")


def _show_treemap(labels, sizes, colormap, fontsize):
    """Draw a treemap on its own figure and render it in the Streamlit page.

    Args:
        labels: Text for each rectangle, in the same order as *sizes*.
        sizes: Relative area of each rectangle.
        colormap: Matplotlib colormap callable (e.g. ``plt.cm.Greens``).
        fontsize: Font size of the rectangle labels.
    """
    fig, ax = plt.subplots()
    shades = colormap(np.linspace(0.05, .5, len(sizes)))
    squarify.plot(
        sizes=sizes,
        label=labels,
        color=list(shades),
        edgecolor=BACKGROUND_COLOR,
        text_kwargs={'fontsize': fontsize},
        ax=ax,
    )
    ax.axis('off')
    fig.patch.set_facecolor(BACKGROUND_COLOR)
    st.pyplot(fig)
    # Close the figure so repeated reruns do not accumulate open figures.
    plt.close(fig)


def _search_and_render(query):
    """Run the similarity search for *query* and render all result widgets."""
    bar = st.progress(0)
    time.sleep(.2)
    st.caption(":LightSkyBlue[searching 40123 PubMed abstracts]")
    # Cosmetic progress animation; it is not tied to the actual model load.
    for i in range(10):
        bar.progress((i + 1) * 10)
        time.sleep(.1)

    model = Word2Vec.load(MODEL_PATH)

    # An unknown term would otherwise surface as an unhandled KeyError.
    if query not in model.wv.key_to_index:
        st.warning(f"'{query}' was not found in the Clotting database vocabulary.")
        return

    table = pd.DataFrame(
        model.wv.most_similar_cosmul(query, topn=10000),
        columns=['Word', 'SIMILARITY'],
    )
    table.index.name = 'Rank'

    # Console diagnostics.
    print()
    print("Similarity to " + str(query))
    pd.set_option('display.max_rows', None)
    print(table.head(50))

    # --- Similar words ---------------------------------------------------
    st.subheader(f"Similar Words to {query}")
    top_words = table.head(TREEMAP_ROWS)
    _show_treemap(
        labels=top_words['Word'].tolist(),
        sizes=_inverse_rank_sizes(top_words.index),
        colormap=plt.cm.Greens,
        fontsize=10,
    )
    st.download_button(
        label="download top 100 words (csv)",
        data=_to_csv_bytes(table.head(DOWNLOAD_ROWS)),
        file_name='clotting_words.csv',
        mime='text/csv')

    # --- Similar human genes ---------------------------------------------
    print("Human genes similar to " + str(query))
    gene_symbols = pd.read_csv(GENES_CSV_PATH)
    # NOTE(review): the match is case-sensitive against the model vocabulary;
    # confirm the case of the symbols stored in the CSV.
    is_gene = table['Word'].isin(gene_symbols['symbol'])
    # Copy so the rename/upper-casing below does not write into a slice of
    # ``table``.
    genes = table[is_gene].rename(columns={'Word': 'Human Gene'}).copy()
    genes['Human Gene'] = genes['Human Gene'].str.upper()
    print(genes.head(50))
    print()

    st.subheader(f"Similar Genes to {query}")
    if genes.empty:
        st.info("No human genes were found among the similar words.")
        return

    top_genes = genes.head(TREEMAP_ROWS)
    # Sizes use each gene's rank in the full similarity table, shifted by one
    # exactly as for the word treemap, so a gene at rank 0 no longer produces
    # an infinite size.
    _show_treemap(
        labels=top_genes['Human Gene'].tolist(),
        sizes=_inverse_rank_sizes(top_genes.index),
        colormap=plt.cm.Blues,
        fontsize=12,
    )
    st.download_button(
        label="download top 100 genes (csv)",
        data=_to_csv_bytes(genes.set_index('Human Gene').head(DOWNLOAD_ROWS)),
        file_name='clotting_genes.csv',
        mime='text/csv')


# Define the HTML and CSS styles (the markup is empty in this source).
st.markdown(
    """ """,
    unsafe_allow_html=True
)

st.header("Word2Vec App for Clotting Pubmed Database.")

text_input_value = st.text_input("Enter one term to search within the Clotting database")
# Normalise the term: surrounding whitespace would never match a vocabulary key.
query = text_input_value.strip().lower()

if query:
    _search_and_render(query)