Spaces:

jfataphd
/

OncoDigger

Sleeping

App Files Files Community

OncoDigger / app.py

jfataphd

Update app.py

c85c4ca over 1 year ago

raw

history blame

No virus

3.6 kB

	import streamlit as st
	import time
	import json
	from gensim.models import Word2Vec
	import pandas as pd
	import matplotlib.pyplot as plt
	import squarify
	import numpy as np

	# Inject a small stylesheet that gives the page a light-blue background.
	# unsafe_allow_html=True is passed so the <style> tag is emitted as HTML.
	# The disabled "color" rules use /* ... */ because "#" is not a comment
	# marker in CSS (the old "# color: ..." lines were malformed declarations
	# that browsers only skipped through error recovery).
	st.markdown(
	    """
	<style>
	body {
	background-color: #EBF5FB;
	/* color: #ffffff; */
	}
	.stApp {
	background-color: #EBF5FB;
	/* color: #ffffff; */
	}
	</style>
	""",
	    unsafe_allow_html=True,
	)

	st.header("Word2Vec App for Clotting Pubmed Database.")

	# Read the search term and normalise it. The term is later used for an
	# exact-match lookup in the model vocabulary, so surrounding whitespace
	# (e.g. from copy/paste) is stripped; it is lower-cased as before
	# (presumably the vocabulary is lower-case -- confirm against the model).
	text_input_value = st.text_input("Enter one term to search within the Clotting database")
	query = text_input_value.strip().lower()

	def _render_treemap(sizes, labels, colormap, fontsize):
	    """Draw one treemap on its own figure and show it in the Streamlit page.

	    sizes    -- list of positive numbers, one per tile (larger = bigger tile).
	    labels   -- list of tile captions, same length as ``sizes``.
	    colormap -- matplotlib colormap callable (e.g. ``plt.cm.Greens``).
	    fontsize -- font size used for the tile captions.
	    """
	    # A dedicated figure (instead of the global pyplot figure) guarantees that
	    # nothing drawn by an earlier treemap or an earlier rerun leaks into this one.
	    fig, ax = plt.subplots()
	    colors = list(colormap(np.linspace(0.05, .5, len(sizes))))
	    squarify.plot(sizes=sizes, label=labels, color=colors, ax=ax,
	                  edgecolor="#EBF5FB", text_kwargs={'fontsize': fontsize})
	    ax.axis('off')
	    fig.patch.set_facecolor('#EBF5FB')
	    st.pyplot(fig)
	    # Release the figure so figures do not accumulate across reruns.
	    plt.close(fig)


	def _show_similar_words(table, query):
	    """Log, export and plot the words most similar to ``query``.

	    ``table`` is the full similarity table (columns 'Word', 'SIMILARITY',
	    0-based rank index). The top 50 rows are printed to stdout, the top 10
	    are written to clotting_sim1.csv and drawn as a treemap.
	    """
	    print()
	    print("Similarity to " + str(query))
	    pd.set_option('display.max_rows', None)
	    print(table.head(50))
	    table.head(10).to_csv("clotting_sim1.csv", index=True)
	    st.subheader(f"Similar Words to {query}")

	    top_words = table.head(10)
	    # Tile size is the reciprocal of the 1-based rank, so the best match
	    # gets the largest tile.
	    sizes = (1 / (top_words.index + 1)).tolist()
	    _render_treemap(sizes, top_words['Word'].tolist(), plt.cm.Greens, 10)


	def _show_similar_genes(table, query):
	    """Log, export and plot the similar words that are human gene symbols.

	    Rows of ``table`` are kept when their 'Word' appears in the ``symbol``
	    column of Human_Genes.csv. The top 50 matches are printed and written to
	    clotting_sim2.csv (with their original 0-based rank as index); the top 10
	    are drawn as a treemap.
	    """
	    print()
	    print("Human genes similar to " + str(query))
	    gene_symbols = pd.read_csv('Human_Genes.csv')
	    # NOTE(review): matching is done before upper-casing, so the CSV symbols
	    # are presumably stored in the same case as the model vocabulary -- confirm.
	    is_gene = table.Word.isin(gene_symbols.symbol)
	    genes = table[is_gene].rename(columns={'Word': 'Human Gene'})
	    genes["Human Gene"] = genes["Human Gene"].str.upper()
	    print(genes.head(50))
	    print()
	    genes.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	    st.subheader(f"Similar Genes to {query}")

	    top_genes = genes.head(10)
	    if top_genes.empty:
	        # squarify cannot lay out an empty list of sizes.
	        st.info(f"No human genes were found among the words similar to '{query}'.")
	        return
	    # The index still holds the 0-based rank from the full table; add 1 before
	    # inverting so a gene at rank 0 does not produce an infinite tile size
	    # (this also matches the sizing used for the word treemap).
	    sizes = (1 / (top_genes.index + 1)).tolist()
	    _render_treemap(sizes, top_genes['Human Gene'].tolist(), plt.cm.Blues, 12)


	# Main search flow: runs only once the user has entered a term.
	if query:
	    # Short cosmetic progress animation (10 steps, about one second in total).
	    bar = st.progress(0)
	    for i in range(10):
	        bar.progress((i + 1) * 10)
	        time.sleep(.1)
	    st.subheader("searching 40123 PubMed abstracts")
	    model = Word2Vec.load("pubmed_model_clotting")

	    if query in model.wv.key_to_index:
	        # Rank the vocabulary by multiplicative-cosine similarity to the query.
	        matches = model.wv.most_similar_cosmul(query, topn=10000)
	        table = pd.DataFrame(matches, columns=['Word', 'SIMILARITY'])
	        table.index.name = 'Rank'
	        _show_similar_words(table, query)
	        _show_similar_genes(table, query)
	    else:
	        # Looking up an unknown term in the model raises KeyError; tell the
	        # user instead of crashing the app.
	        st.warning(f"'{query}' was not found in the Clotting database vocabulary. Please try another term.")



	# findRelationships(query, df)







	# model = gensim.models.KeyedVectors.load_word2vec_format('pubmed_model_clotting', binary=True)
	# similar_words = model.most_similar(word)
	# output = json.dumps({"word": word, "similar_words": similar_words})
	# st.write(output)