import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Load the fine-tuned DR-BERT checkpoint and put the model in inference mode.
tokenizer = AutoTokenizer.from_pretrained("./checkpoint-final/")
model = AutoModelForTokenClassification.from_pretrained("./checkpoint-final/")
model = model.eval()

examples = [
    ["GSHMSDNEDNFDGDDFDDVEEDEGLDDLENAEEEGQENVEILPSGERPQANQKRITTPYMTKYERARVLGTRALQIAMCAPVMVELEGETDPLLIAMKELKARKIPIIIRRYLPDGSYEDWGVDELIITD"]
]


def get_out(sent):
    """Predict per-residue disorder scores for an amino acid sequence."""
    prefix = ""
    # The model accepts at most 1022 residues (1024 tokens minus the two special tokens).
    if len(sent) > 1022:
        sent = sent[:1022]
        prefix = ("Your protein was longer than 1022 AAs. We are working on including longer "
                  "sequences, but in the meantime, here are the scores for the first 1022 AAs: \n ")
    print(sent)

    encoded = tokenizer(sent, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)

    # Softmax over the label dimension, drop the special tokens at either end,
    # and keep the probability of the "disordered" class (index 1).
    output = F.softmax(torch.squeeze(output["logits"]), dim=-1)[1:-1, 1].numpy()

    # Plot the per-residue DR-BERT score along the sequence.
    fig = plt.figure()
    plt.plot(output)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.xlabel("Sequence position", fontsize=15)
    plt.ylabel("DR-BERT score", fontsize=15)

    output = ",".join(str(x) for x in output)
    return (fig, prefix + output)


gr.Interface(
    get_out,
    [
        gr.components.Textbox(
            label="Input Amino Acid Sequence",
            placeholder="Amino acid sequence here ...",
        )
    ],
    ["plot", "text"],
    examples=examples,
    title="DR-BERT: A Protein Language Model to Predict Disordered Regions",
    description="The app uses DR-BERT to predict disordered regions in proteins. The output is the probability that each residue is disordered.",
).launch()