import time
import gradio as gr
import pandas as pd
import torch
from pathlib import Path
from Bio import SeqIO
from dscript.pretrained import get_pretrained
from dscript.language_model import lm_embed
from tqdm.auto import tqdm
from uuid import uuid4
from predict_3di import get_3di_sequences, predictions_to_dict, one_hot_3di_sequence

# Maps the model names offered in the UI to the identifiers that predict()
# hands to get_pretrained().
model_map = {
    "D-SCRIPT": "human_v1",
    "Topsy-Turvy": "human_v2",
    "TT3D": "human_tt3d",
}

# Presentation settings forwarded to gr.Interface when the demo is built.
theme = "Default"
title = "D-SCRIPT: Predicting Protein-Protein Interactions"
description = """ """
# Long-form text passed as the interface's `article` argument; the string
# continues on the following lines of the file.
article = """
D-SCRIPT architecture
D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences. It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact, a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and, since structure is more conserved evolutionarily than sequence, improves generalizability across species.
Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based, multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data. 
""" fold_vocab = { "D": 0, "P": 1, "V": 2, "Q": 3, "A": 4, "W": 5, "K": 6, "E": 7, "I": 8, "T": 9, "L": 10, "F": 11, "G": 12, "S": 13, "M": 14, "H": 15, "C": 16, "R": 17, "Y": 18, "N": 19, "X": 20, } def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()): run_id = uuid4() device = torch.cuda("0") if torch.cuda.is_available() else torch.device("cpu") # gr.Info("Loading model...") _ = lm_embed("M", use_cuda = (device.type == "cuda")) model = get_pretrained(model_map[model_name]).to(device) # gr.Info("Loading files...") try: seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta")) except ValueError as _: raise gr.Error("Invalid FASTA file - duplicate entry") if Path(pairs_file.name).suffix == ".csv": pairs = pd.read_csv(pairs_file.name) elif Path(pairs_file.name).suffix == ".tsv": pairs = pd.read_csv(pairs_file.name, sep="\t") try: pairs.columns = ["protein1", "protein2"] except ValueError as _: raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'") do_foldseek = False if model_name == "TT3D": do_foldseek = True need_to_translate = set(pairs["protein1"]).union(set(pairs["protein2"])) seqs_to_translate = {k: str(seqs[k].seq) for k in need_to_translate if k in seqs} half_precision = False assert not (half_precision and device=="cpu"), print("Running fp16 on CPU is not supported, yet") gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...") predictions = get_3di_sequences( seqs_to_translate, model_dir = "Rostlab/ProstT5", report_fn = gr.Info, error_fn = gr.Error, device=device, ) foldseek_sequences = predictions_to_dict(predictions) foldseek_embeddings = {k: one_hot_3di_sequence(s.upper(), fold_vocab) for k, s in foldseek_sequences.items()} # for k in seqs_to_translate.keys(): # print(seqs_to_translate[k]) # print(len(seqs_to_translate[k])) # print(foldseek_embeddings[k]) # print(foldseek_embeddings[k].shape) progress(0, desc="Starting...") results = [] for i 
in progress.tqdm(range(len(pairs))): r = pairs.iloc[i] prot1 = r["protein1"] prot2 = r["protein2"] seq1 = str(seqs[prot1].seq) seq2 = str(seqs[prot2].seq) fold1 = foldseek_embeddings[prot1] if do_foldseek else None fold2 = foldseek_embeddings[prot2] if do_foldseek else None lm1 = lm_embed(seq1) lm2 = lm_embed(seq2) print(lm1.shape, lm2.shape, fold1.shape, fold2.shape) interaction = model.predict(lm1, lm2, embed_foldseek = do_foldseek, f0 = fold1, f1 = fold2).item() results.append([prot1, prot2, interaction]) results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"]) file_path = f"/tmp/{run_id}.tsv" with open(file_path, "w") as f: results.to_csv(f, sep="\t", index=False, header = True) return results, file_path demo = gr.Interface( fn=predict, inputs = [ gr.Dropdown(label="Model", choices = ["D-SCRIPT", "Topsy-Turvy", "TT3D"], value = "Topsy-Turvy"), gr.File(label="Pairs (.csv/.tsv)", file_types = [".csv", ".tsv"]), gr.File(label="Sequences (.fasta)", file_types = [".fasta"]), ], outputs = [ gr.DataFrame(label='Results', headers=['Protein 1', 'Protein 2', 'Interaction']), gr.File(label="Download results", type="file") ], title = title, description = description, article = article, theme = theme, ) if __name__ == "__main__": demo.queue(max_size=20).launch()