|
import streamlit as st |
|
import torch |
|
from transformers import AutoTokenizer, AutoModel, pipeline |
|
from torch import nn |
|
|
|
# Page header for the Streamlit app ("classificator" -> "classifier": wording fix).
st.markdown("### Article classifier.")
|
|
|
@st.cache(allow_output_mutation=True)
def get_tokenizer():
    """Build the DeBERTa-v3-small tokenizer, cached across Streamlit reruns.

    allow_output_mutation=True skips hashing the returned tokenizer object,
    which st.cache cannot hash cheaply.
    """
    return AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
|
|
|
# Created once per session; reused by process() below (cached by st.cache).
tokenizer = get_tokenizer()
|
|
|
class devops_model(nn.Module):
    """Article classifier: mean-pooled transformer embeddings + MLP head.

    NOTE: the class name is lowercase on purpose — the checkpoint
    ('model_full.pt') is a fully pickled model, and unpickling resolves
    this exact class name. Do not rename.
    """

    def __init__(self):
        super().__init__()
        # Transformer backbone; left as None here and restored from the
        # pickled checkpoint at load time.
        self.berta = None

        # Classification head: 768-d pooled embedding -> 5 log-probabilities.
        embed_dim, num_classes = 768, 5
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(embed_dim),
            nn.Linear(embed_dim, num_classes),
            nn.LogSoftmax(dim=-1),
        )

    def forward(self, train_batch):
        """Encode the batch, mean-pool over the token axis, classify.

        Returns log-probabilities of shape (batch, 5).
        """
        hidden = self.berta(**train_batch)['last_hidden_state']
        pooled = hidden.mean(dim=1)  # average over tokens -> (batch, 768)
        return self.fc(pooled)
|
|
|
@st.cache(allow_output_mutation=True)
def LoadModel():
    """Load the full pickled model onto CPU, cached across Streamlit reruns.

    allow_output_mutation=True is required (and matches get_tokenizer above):
    the returned module is mutated later (process() calls model.eval()),
    and a bare @st.cache hashes its output on every rerun to detect exactly
    that kind of mutation, which warns/fails on a torch module.

    NOTE(review): torch.load on a full model unpickles arbitrary code —
    only load a trusted checkpoint.
    """
    return torch.load('model_full.pt', map_location=torch.device('cpu'))
|
|
|
# Full model (encoder + classification head) restored from the checkpoint.
model = LoadModel()



# Target labels; order matches the 5 outputs of the classification head.
classes = ['Computer Science', 'Mathematics', 'Physics', 'Quantitative Biology', 'Statistics']
|
|
|
def process(title, summary):
    """Classify an article given its title and summary.

    Returns a list of 'Class: probability' strings for the most probable
    classes, in descending order, stopping once their cumulative probability
    reaches 0.95. Returns an empty list when no text was provided.
    """
    # Join with a space so the last word of the title does not fuse with
    # the first word of the summary during tokenization.
    text = f'{title} {summary}'.strip()
    if not text:
        return []  # consistent list return (original returned '')

    model.eval()
    X = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        out = model(X)

    probs = torch.exp(out[0])  # model emits log-probabilities
    sorted_indexes = torch.argsort(probs, descending=True)

    res = []
    probs_sum = 0.0
    # Bounded by the number of classes — the original while-loop could
    # overrun on float rounding if the threshold were never reached.
    for prob_idx in sorted_indexes.tolist():
        prob = probs[prob_idx].item()
        res.append(f'{classes[prob_idx]}: {prob:.3f}')
        probs_sum += prob
        if probs_sum >= 0.95:
            break
    return res
|
|
|
# --- Input widgets ---
title = st.text_area("Title", height=30)
summary = st.text_area("Summary", height=180)

# --- Output: one markdown line per predicted class ---
for prediction in process(title, summary):
    st.markdown(prediction)