Spaces:

emilyalsentzer
/

SHEPHERD

Runtime error

SHEPHERD / app.py

Alsentzer

add instructions

e95d544 about 2 years ago

16.4 kB

	import gradio as gr
	import pandas as pd
	from pathlib import Path
	import ast

	'''
	Causal Gene Discovery Model
	/home/ema30/zaklab/rare_disease_dx/checkpoints/aligner/04_30_22:13:29:55_lr_1e-05_val_simulated_pats.disease_split_val_sim_pats_kg_8.9.21_kg_losstype_gene_multisimilarity/all_udn_patients_kg_8.9.21_kgsolved_manual_baylor_nobgm_distractor_genes_5_candidates_mapped_only_genes

	Patients-Like-Me Model
	/home/ema30/zaklab/rare_disease_dx/checkpoints/patient_NCA/04_26_22:17:38:30_lr_5e-05_val_simulated_pats.disease_split_val_sim_pats_kg_8.9.21_kg_losstype_patient_patient_NCA/mygene2_all_sim_all_udn_patients_kg_8.9.21_kgsolved_with_phenotypes

	Disease Characterization Model
	/home/ema30/zaklab/rare_disease_dx/checkpoints/patient_NCA/05_13_22:08:00:32_lr_1e-05_val_simulated_pats.disease_split_val_sim_pats_kg_8.9.21_kg_losstype_pd_NCA/mygene2_all_sim_all_udn_patients_kg_8.9.21_kgsolved_with_phenotypes
	'''



	# Precomputed SHEPHERD outputs, loaded once at start-up. Each task has a
	# "scores" table (filtered by `patient_id` and ranked by `similarities`
	# below) and an "attn" table (per-phenotype `attention` for a patient).

	# Causal gene discovery over the expert-curated candidate gene lists.
	gene_scores_df, gene_attn_df = (
	    pd.read_csv(csv_name)
	    for csv_name in ('gene_discovery_scores.csv', 'gene_discovery_scores_attn.csv')
	)
	# Causal gene discovery over the variant-filtered (Exomiser) gene lists.
	exomiser_gene_scores_df, exomiser_gene_attn_df = (
	    pd.read_csv(csv_name)
	    for csv_name in ('exomiser_gene_discovery_scores.csv', 'exomiser_gene_discovery_scores_attn.csv')
	)
	# Patients-like-me retrieval.
	patient_scores_df, plm_attn_df = (
	    pd.read_csv(csv_name)
	    for csv_name in ('patients_like_me_scores.csv', 'patients_like_me_scores_attn.csv')
	)
	# Disease characterization.
	dx_scores_df, dx_attn_df = (
	    pd.read_csv(csv_name)
	    for csv_name in ('dx_characterization_scores.csv', 'dx_characterization_scores_attn.csv')
	)

	# Hand-curated display labels for the demo patients. `get_patient` uses
	# these in preference to the labels stored in the score tables.

	# Diagnosis text shown for each demo patient.
	diseases_map = {
	    'UDN-P1': 'POLR3-related leukodystrophy',
	    'UDN-P2': 'Novel Syndrome',
	    'UDN-P3': 'Coffin-Lowry syndrome',
	    'UDN-P4': 'autosomal recessive spastic paraplegia type 76',
	    'UDN-P5': 'atypical presentation of familial cold autoinflammatory syndrome',
	    'UDN-P6': 'GATAD2B-associated syndrome',
	    'UDN-P7': 'AR limb-girdle muscular atrophy type 2D',
	    'UDN-P8': 'ATP5PO-related Leigh syndrome',
	    'UDN-P9': 'Spondyloepimetaphyseal dysplasia, Isidor-Toutain type',
	}

	# Causal gene text shown for each demo patient. UDN-P1 and UDN-P2 are
	# absent on purpose: their causal genes come from `gene_scores_df`.
	genes_map = {
	    'UDN-P3': 'RPS6KA3',
	    'UDN-P4': 'CAPN1',
	    'UDN-P5': 'NLRP12, RAPGEFL1',
	    'UDN-P6': 'GATAD2B',
	    'UDN-P7': 'SGCA',
	    # Letter "O", not digit zero: matches 'ATP5PO-related Leigh syndrome'.
	    'UDN-P8': 'ATP5PO',
	    'UDN-P9': 'RPL13',
	}



	def get_patient(patient_id, attn_df):
	    """Build the summary text displayed above a patient's predictions.

	    The causal gene and disease come from the hand-curated `genes_map` /
	    `diseases_map` when the patient is listed there; otherwise they are the
	    comma-joined entries flagged as correct (label == 1) in `gene_scores_df`
	    / `dx_scores_df`. Phenotypes are the comma-joined `phenotypes` values of
	    the patient's rows in `attn_df`.

	    Args:
	        patient_id: identifier matched against the `patient_id` columns.
	        attn_df: attention table with `patient_id` and `phenotypes` columns.

	    Returns:
	        A string with `<br>`-separated fields: patient, gene, disease, phenotypes.
	    """
	    # Causal gene: curated label first, score-table label as fallback.
	    gene = genes_map.get(patient_id)
	    if gene is None:
	        gene_rows = gene_scores_df.loc[gene_scores_df['patient_id'] == patient_id]
	        causal_genes = gene_rows.loc[gene_rows['correct_gene_label'] == 1, 'genes']
	        gene = ', '.join(causal_genes.tolist())

	    # Disease: same precedence as for the gene.
	    disease = diseases_map.get(patient_id)
	    if disease is None:
	        dx_rows = dx_scores_df.loc[dx_scores_df['patient_id'] == patient_id]
	        true_diseases = dx_rows.loc[dx_rows['correct_label'] == 1, 'diseases']
	        disease = ', '.join(true_diseases.tolist())

	    own_rows = attn_df.loc[attn_df['patient_id'] == patient_id]
	    phenotypes = ', '.join(own_rows['phenotypes'].tolist())

	    return f'''
	Selected Patient: {patient_id}<br>
	Causal Gene: {gene}<br>
	Disease: {disease}<br>
	Phenotypes: {phenotypes}<br><br>
	'''


	def read_file(filename):
	    """Return the entire text content of `filename` (opened in text mode)."""
	    with open(filename, 'r') as handle:
	        return handle.read()


	def causal_gene_discovery(patient_id, prioritization_type):
	    """Assemble the outputs of the "Causal Gene Discovery" tab for one patient.

	    Args:
	        patient_id: identifier matched against the `patient_id` columns.
	        prioritization_type: 'Variant Filtered' selects the Exomiser tables;
	            any other value selects the expert-curated tables.

	    Returns:
	        A 4-tuple `(patient, scores_df, attn_df, kg_html)`: the patient
	        summary from `get_patient`, the ranked candidate-gene table
	        (columns 'Candidate Genes' and 'SHEPHERD Score' plus any other
	        columns in the source table), the phenotype attention table, and
	        an `<iframe>` snippet pointing at the patient's knowledge-graph page.
	    """
	    variant_filtered = prioritization_type == 'Variant Filtered'
	    score_table = exomiser_gene_scores_df if variant_filtered else gene_scores_df
	    attention_table = exomiser_gene_attn_df if variant_filtered else gene_attn_df

	    def _green(text):
	        # Highlight markup used for the patient's causal gene.
	        return f'<span style="color:green">{text}</span>'

	    #############
	    # Gene ranking

	    # Highest similarity first; scores become 3-decimal strings for display.
	    ranked = score_table.loc[score_table['patient_id'] == patient_id]
	    ranked = ranked.sort_values("similarities", ascending=False)
	    ranked['similarities'] = ranked['similarities'].round(3).astype(str)

	    # Each gene becomes an underlined markdown link to its GeneCards page.
	    ranked['genes'] = ranked['genes'].apply(
	        lambda x: f'<u>[{x}](https://www.genecards.org/cgi-bin/carddisp.pl?gene={x})</u>'
	    )

	    # Color both the score and the gene of the causal row(s) green.
	    is_causal = ranked['correct_gene_label'] == 1
	    for column in ('similarities', 'genes'):
	        ranked.loc[is_causal, column] = ranked.loc[is_causal, column].apply(_green)

	    # Drop bookkeeping columns and switch to the display headers.
	    ranked = ranked.drop(columns=['patient_id', 'correct_gene_label'])
	    ranked = ranked.rename(columns={'similarities': 'SHEPHERD Score', 'genes': 'Candidate Genes'})

	    #############
	    # Attention

	    attention = attention_table.loc[attention_table['patient_id'] == patient_id]
	    attention = attention.sort_values("attention", ascending=False)
	    attention['attention'] = attention['attention'].round(4)
	    attention = attention.drop(columns=['patient_id', 'degrees'])

	    #############
	    # KG neighborhood

	    kg_page = f'https://michellemli.github.io/test_html/{patient_id}.html'
	    kg_html = f'''<iframe id="igraph" scrolling="no" style="border:none; width: 100%; height: 600px" seamless="seamless" src="{kg_page}"></iframe>'''

	    # The summary always takes its phenotypes from the expert-curated
	    # attention table, whichever gene list was selected.
	    patient = get_patient(patient_id, gene_attn_df)

	    return patient, ranked, attention, kg_html


	def patients_like_me(patient_id, k=10):
	    """Assemble the outputs of the "Patients Like Me" tab for one patient.

	    Args:
	        patient_id: identifier matched against the `patient_id` columns.
	        k: maximum number of candidate patients to return.

	    Returns:
	        A 3-tuple `(patient, scores_df, attn_df)`: the patient summary from
	        `get_patient`, the top-`k` most similar patients (renamed display
	        columns, matching rows colored green) and the phenotype attention table.
	    """
	    def _green(text):
	        # Highlight markup for candidates flagged with correct_label == 1.
	        return f'<span style="color:green">{text}</span>'

	    similar = patient_scores_df.loc[patient_scores_df['patient_id'] == patient_id]
	    similar = similar.sort_values("similarities", ascending=False)

	    # Disease names become underlined markdown links to Orphanet pages:
	    # the "[name]" half and the "(url)" half are built separately and joined.
	    link_targets = similar['disease_ids'].apply(
	        lambda x: f'(https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert={x})</u>'
	    )
	    link_labels = similar['diseases'].apply(lambda x: f'<u>[{x}]')
	    similar['diseases'] = link_labels + link_targets

	    # Genes link to their GeneCards pages.
	    similar['genes'] = similar['genes'].apply(
	        lambda x: f'<u>[{x}](https://www.genecards.org/cgi-bin/carddisp.pl?gene={x})</u>'
	    )

	    # Color every displayed field of the correct-label candidates green.
	    is_match = similar['correct_label'] == 1
	    for column in ('candidate_patients', 'genes', 'diseases'):
	        similar.loc[is_match, column] = similar.loc[is_match, column].apply(_green)

	    similar = similar.drop(columns=['patient_id', 'similarities', 'correct_label', 'disease_ids'])
	    similar = similar.rename(columns={
	        'candidate_patients': 'Candidate Patient',
	        'genes': 'Candidate Patient\'s Gene',
	        'diseases': 'Candidate Patient\'s Disease',
	    })
	    similar = similar.head(k)

	    # Phenotype attention, strongest first.
	    attention = plm_attn_df.loc[plm_attn_df['patient_id'] == patient_id]
	    attention = attention.sort_values("attention", ascending=False)
	    attention['attention'] = attention['attention'].round(4)
	    attention = attention.drop(columns=['patient_id', 'degrees'])

	    patient = get_patient(patient_id, plm_attn_df)

	    return patient, similar, attention


	def disease_characterization(patient_id, k=10):
	    """Assemble the outputs of the "Disease Characterization" tab for one patient.

	    Args:
	        patient_id: identifier matched against the `patient_id` columns.
	        k: number of top-ranked diseases to keep.

	    Returns:
	        A 3-tuple `(patient, scores_df, attn_df)`: the patient summary from
	        `get_patient`, the top-`k` diseases as markdown links (column
	        'Disease') and the phenotype attention table.
	    """
	    def _single_id(raw):
	        # `disease_ids` cells are Python literals: either a single id or a
	        # list of ids, in which case only the first id is used.
	        parsed = ast.literal_eval(raw)
	        return parsed[0] if isinstance(parsed, list) else parsed

	    top = dx_scores_df.loc[dx_scores_df['patient_id'] == patient_id]
	    top = top.sort_values("similarities", ascending=False)
	    top = top.head(k)

	    # Two entries are matched by name in `disease_ids` and replaced with
	    # fixed ids, which also key the special-case links further down.
	    for disease_name, fixed_id in (
	        ('Coxa vara', '2812'),
	        ('Multiple epiphyseal dysplasia', '2654'),
	    ):
	        top.loc[top['disease_ids'].str.contains(disease_name), 'disease_ids'] = fixed_id

	    # Disease names become underlined markdown links to Orphanet pages.
	    link_targets = top['disease_ids'].apply(_single_id).apply(
	        lambda x: f'(https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert={x})</u>'
	    )
	    top['diseases'] = top['diseases'].apply(lambda x: f'<u>[{x}]') + link_targets

	    # Diseases whose generated Orphanet link is replaced by a fixed link,
	    # selected by the id embedded in the generated URL.
	    for id_fragment, fixed_link in (
	        ('33657', '<u>[leukodystrophy, hypomyelinating, 20](https://www.omim.org/entry/619071)</u>'),
	        ('2654', '<u>[Multiple epiphyseal dysplasia](https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=251)</u>'),
	        ('2812', '<u>[Coxa vara](https://omim.org/entry/122750)</u>'),
	    ):
	        top.loc[link_targets.str.contains(id_fragment), 'diseases'] = fixed_link

	    top = top.drop(columns=['patient_id', 'similarities', 'correct_label', 'disease_ids'])
	    top = top.rename(columns={'diseases': 'Disease'})

	    # Phenotype attention, strongest first.
	    attention = dx_attn_df.loc[dx_attn_df['patient_id'] == patient_id]
	    attention = attention.sort_values("attention", ascending=False)
	    attention['attention'] = attention['attention'].round(4)
	    attention = attention.drop(columns=['patient_id', 'degrees'])

	    patient = get_patient(patient_id, dx_attn_df)

	    return patient, top, attention

	def get_umap(umap_type):
	    """Return an `<embed>` snippet for one of the hosted UMAP pages.

	    Args:
	        umap_type: 'disease' for the disease-characterization UMAP or
	            'patient' for the patient UMAP.

	    Returns:
	        An HTML `<embed>` element whose `src` is the selected page.

	    Raises:
	        NotImplementedError: for any other `umap_type`.
	    """
	    if umap_type not in ('disease', 'patient'):
	        raise NotImplementedError

	    umap_url = (
	        'https://michellemli.github.io/test_html/shepherd_disease_characterization_umap.html'
	        if umap_type == 'disease'
	        else 'https://michellemli.github.io/test_html/shepherd_patient_umap.html'
	    )

	    # The page is embedded by URL; earlier iframe/srcdoc variants were dropped.
	    return f'''<embed style="border: none;" src="{umap_url}" dpi="300" width="100%" height="750px" />'''

	#return f'''<iframe id="igraph" scrolling="no" style="border:none; width: 100%; height: 750px" seamless="seamless" src="{html_file}"></iframe>'''


	# Gradio UI: one tab per SHEPHERD task. Each tab has a patient dropdown, a
	# markdown patient summary, a results table and a phenotype-attention table.
	# NOTE(review): the `with` nesting below was reconstructed from the
	# statement order -- confirm it matches the intended layout.
	# NOTE(review): `max_rows` and `overflow_row_behaviour` are believed to be
	# unavailable in newer Gradio releases -- confirm against the pinned version.
	with gr.Blocks() as demo:
	    gr.Markdown('<center><h1>AI-assisted Rare Disease Diagnosis with SHEPHERD</h1></center>')

	    with gr.Tabs():
	        with gr.TabItem("Causal Gene Discovery"):
	            with gr.Column():
	                gr.Markdown('<center><h2>Select a patient to view SHEPHERD\'s predictions</h2></center>')
	                gene_dropdown = gr.Dropdown(choices=['UDN-P1', 'UDN-P2'], label='Rare Disease Patients', type='value')
	                gene_radio = gr.Radio(choices=['Expert Curated', 'Variant Filtered'], value='Expert Curated', label='Type of Gene List')
	                patient_info = gr.Markdown()

	                with gr.Accordion(label='SHEPHERD\'s Ranking of Patient\'s Candidate Genes', open=True, elem_id='gene_accordion'):
	                    gr.Markdown('Below are SHEPHERD\'s ranking of either all Expert Curated candidate genes or the top 10 Variant Filtered candidate genes. The patient\'s causal gene (i.e. gene harboring a variant that explains the patient\'s symptoms) is colored in green.')
	                    gene_dataframe = gr.Dataframe(elem_id="gene_df", datatype='markdown', headers=['Candidate Genes', 'SHEPHERD Score'], overflow_row_behaviour='paginate')
	                with gr.Accordion(label='SHEPHERD\'s Attention to Patient\'s Phenotypes', open=False, elem_id='gene_attn_accordion'):
	                    gene_attn_dataframe = gr.Dataframe(elem_id="gene_attn_df", headers=['Phenotypes', 'Attention'], overflow_row_behaviour='paginate')
	                with gr.Accordion(label='Visualization of Patient\'s Neighborhood in the Knowledge Graph', open=False, elem_id='kg_neigh_accordion'):
	                    kg_neighborhood_image = gr.HTML(elem_id='kg_patient_neighborhood')

	        with gr.TabItem("Patients Like Me"):
	            gr.HTML(get_umap('patient'))
	            gr.Markdown('<center><h2>Select a patient to view SHEPHERD\'s predictions</h2></center>')
	            patient_dropdown = gr.Dropdown(choices=['UDN-P3', 'UDN-P4', 'UDN-P5', 'UDN-P6'], label='Rare Disease Patients', type='value')
	            p_patient_info = gr.Markdown()
	            with gr.Accordion(label='Top 10 Most Similar Patients according to SHEPHERD', open=True, elem_id='pt_accordion'):
	                patient_dataframe = gr.Dataframe(max_rows=10, datatype='markdown', show_label=False, elem_id="pat_df", headers=['Candidate Patient', 'Candidate Patient\'s Gene', 'Candidate Patient\'s Disease'])
	            with gr.Accordion(label='SHEPHERD\'s Attention to Patient\'s Phenotypes', open=False, elem_id='pt_attn_accordion'):
	                pt_attn_dataframe = gr.Dataframe(elem_id="pt_attn_df", headers=['Phenotypes', 'Attention'], overflow_row_behaviour='paginate')

	        with gr.TabItem("Disease Characterization"):
	            gr.HTML(get_umap('disease'))
	            gr.Markdown('<center><h2>Select a patient to view SHEPHERD\'s predictions</h2></center>')
	            dx_dropdown = gr.Dropdown(choices=['UDN-P7', 'UDN-P8', 'UDN-P9', 'UDN-P2'], label='Rare Disease Patients', type='value')
	            dx_patient_info = gr.Markdown()
	            # elem_id is 'dx_accordion' so it no longer duplicates the
	            # 'pt_accordion' id used in the Patients Like Me tab.
	            with gr.Accordion(label='Top 10 Most Similar Diseases according to SHEPHERD', open=True, elem_id='dx_accordion'):
	                # Header matches the 'Disease' column returned by
	                # disease_characterization.
	                dx_dataframe = gr.Dataframe(max_rows=10, datatype='markdown', show_label=False, elem_id="dx_df", headers=['Disease'])
	            with gr.Accordion(label='SHEPHERD\'s Attention to Patient\'s Phenotypes', open=False, elem_id='dx_attn_accordion'):
	                dx_attn_dataframe = gr.Dataframe(elem_id="dx_attn_df", headers=['Phenotypes', 'Attention'], overflow_row_behaviour='paginate')

	    # Predictions refresh as soon as a selection changes; there is no "Go" button.
	    # Both the patient dropdown and the gene-list radio re-run gene discovery.
	    gene_outputs = [patient_info, gene_dataframe, gene_attn_dataframe, kg_neighborhood_image]
	    gene_dropdown.change(causal_gene_discovery, inputs=[gene_dropdown, gene_radio], outputs=gene_outputs)
	    gene_radio.change(causal_gene_discovery, inputs=[gene_dropdown, gene_radio], outputs=gene_outputs)

	    patient_dropdown.change(patients_like_me, inputs=patient_dropdown, outputs=[p_patient_info, patient_dataframe, pt_attn_dataframe])
	    dx_dropdown.change(disease_characterization, inputs=dx_dropdown, outputs=[dx_patient_info, dx_dataframe, dx_attn_dataframe])

	# Default launch settings; earlier local runs used server_port=50018, share=True.
	demo.launch()