import json
import pickle
import re
import string
import warnings
from time import time
from typing import Tuple

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchmetrics.classification import BinaryAccuracy, F1Score
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

warnings.filterwarnings('ignore')

# Text preprocessing: lowercase, strip punctuation/digits, lemmatize with Mystem,
# drop Russian stopwords.
nltk.download('stopwords', quiet=True)  # make sure the Russian stopword list is available

punc = string.punctuation + '«»—…' + string.digits
sw = stopwords.words('russian') + ['х', 'эх', 'это']
mystem = Mystem()


def clean(text):
    text = text.lower()                                          # lowercase
    text = "".join([c for c in text if c not in punc])           # drop punctuation and digits
    text = mystem.lemmatize(text)                                # lemmatize (returns a token list)
    text = ' '.join([word for word in text if word not in sw])   # drop stopwords
    text = " ".join(text.split())                                # collapse whitespace / remove \n
    return text


with open('vocab_to_int.json', encoding='utf-8') as f:
    vocab_to_int = json.load(f)

VOCAB_SIZE = len(vocab_to_int) + 1  # +1 for the padding index
EMBEDDING_DIM = 64
SEQ_LEN = 350
HIDDEN_SIZE = 64


class BahdanauAttention(nn.Module):
    """Additive attention over the bidirectional LSTM outputs."""

    def __init__(self, hidden_size: int) -> None:
        super().__init__()
        self.W_q = nn.Linear(hidden_size, hidden_size)
        self.W_k = nn.Linear(2 * hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(
        self,
        keys: torch.Tensor,   # (batch, seq_len, 2 * hidden_size) — LSTM outputs
        query: torch.Tensor   # (2, batch, hidden_size) — final hidden state of each direction
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        query = self.W_q(query)
        keys = self.W_k(keys)
        # Combine the forward and backward final states with every timestep.
        energy = self.V(torch.tanh(query[0].unsqueeze(1) + query[1].unsqueeze(1) + keys)).squeeze(-1)
        weights = torch.softmax(energy, -1)
        # (batch, 1, seq_len) x (batch, seq_len, hidden_size) -> (batch, hidden_size)
        context = torch.bmm(weights.unsqueeze(1), keys).squeeze(1)
        return context, weights


with open('embedding_matrix.npy', 'rb') as f:
    embedding_matrix = np.load(f)

embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))


class LSTMConcatAttentionB(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(EMBEDDING_DIM, HIDDEN_SIZE, batch_first=True, bidirectional=True)
        self.attnB = BahdanauAttention(HIDDEN_SIZE)
        self.clf = nn.Sequential(
            nn.Linear(HIDDEN_SIZE, 512),
            nn.Dropout(),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.SELU(),
            nn.Linear(256, 3),
        )

    def forward(self, x):
        embeddings = self.embedding(x)
        outputs, (h_n, _) = self.lstm(embeddings)          # h_n: (2, batch, hidden_size)
        att_hidden, att_weights = self.attnB(outputs, h_n)
        out = self.clf(att_hidden).squeeze()
        return out, att_weights


with open("vectorizer.pickle", "rb") as f:
    vectorizer = pickle.load(f)


class Text_ex:
    """Turns a raw string into a left-padded tensor of token ids for the LSTM."""

    def __init__(self, clean_func, voc, sl) -> None:
        self.clean = clean_func
        self.vocab = voc
        self.seq_len = sl

    def __call__(self, text):
        c_text = self.clean(text)
        review = [self.vocab[word] for word in c_text.split() if self.vocab.get(word)]
        if len(review) <= self.seq_len:
            # left-pad with zeros up to seq_len
            zeros = list(np.zeros(self.seq_len - len(review)))
            new = zeros + review
        else:
            # keep only the last seq_len tokens
            new = review[-self.seq_len:]
        t = torch.from_numpy(np.array(new)).long()
        return t.unsqueeze(0), c_text


class PredMaker:
    """Runs the three models (TF-IDF + LogReg, LSTM with attention, ruBERT + LogReg)
    on one text and returns their predictions, the attention weights and timings."""

    def __init__(self, model1, model2, rubert, model3, vectorizer, texter,
                 clean_func, tokenizer, itc) -> None:
        self.log_reg_vec = model1
        self.lstm = model2
        self.bert = rubert
        self.log_reg_bert = model3
        self.vec = vectorizer
        self.clean = clean_func
        self.texter = texter
        self.ItV = itc
        self.tokenizer = tokenizer
        self.lstm.eval()
        self.bert.eval()

    def __call__(self, text):
        # 1. TF-IDF + logistic regression
        time1 = time()
        res1 = self.log_reg_vec.predict(self.vec.transform([self.clean(text)]))
        time1 = time() - time1

        # 2. LSTM with Bahdanau attention (single forward pass for logits and weights)
        time2 = time()
        t = self.texter(text)
        with torch.no_grad():
            logits, att_weights = self.lstm(t[0])
        res2 = torch.argmax(logits).item()
        time2 = time() - time2
        tt = att_weights.detach().cpu().numpy()[0]

        # 3. ruBERT [CLS] embedding + logistic regression
        time3 = time()
        text3 = self.tokenizer(text, truncation=True, return_tensors='pt', max_length=512)
        input_ids = text3['input_ids']
        mask = text3['attention_mask']
        with torch.no_grad():
            embeddings = torch.nn.functional.normalize(
                self.bert(input_ids.to(self.bert.device),
                          mask.to(self.bert.device)).last_hidden_state[:, 0, :]
            ).cpu().numpy()
        res3 = self.log_reg_bert.predict(embeddings)
        time3 = time() - time3

        return int(res1), int(res2), int(res3), t, tt, time1, time2, time3


log_reg_vec = joblib.load('log_reg_vec.sav')
log_reg_bert = joblib.load('log_reg_bert.sav')

texter = Text_ex(clean, vocab_to_int, SEQ_LEN)

lstm = LSTMConcatAttentionB()
# NOTE: only the embedding layer is pretrained; trained LSTM weights should be loaded
# here (e.g. via load_state_dict) before inference.

PATH = 'tokenzier'
tokenizer = AutoTokenizer.from_pretrained(PATH, local_files_only=True)
bert = torch.load('bert.pt')
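
# Example usage (a minimal sketch): wire the loaded components into PredMaker and
# score one review. `int_to_class` and the sample text are illustrative placeholders,
# not part of the original pipeline; the assumed 3-class mapping may differ.
int_to_class = {0: 'negative', 1: 'neutral', 2: 'positive'}  # assumed label mapping

predictor = PredMaker(
    model1=log_reg_vec,      # TF-IDF + logistic regression
    model2=lstm,             # LSTM with Bahdanau attention
    rubert=bert,             # ruBERT encoder
    model3=log_reg_bert,     # logistic regression over ruBERT [CLS] embeddings
    vectorizer=vectorizer,
    texter=texter,
    clean_func=clean,
    tokenizer=tokenizer,
    itc=int_to_class,
)

sample = 'Отличный отель, обязательно приедем ещё раз!'  # placeholder review text
res1, res2, res3, t, att, t1, t2, t3 = predictor(sample)
print('TF-IDF + LogReg :', int_to_class.get(res1, res1), f'({t1:.3f}s)')
print('LSTM + attention:', int_to_class.get(res2, res2), f'({t2:.3f}s)')
print('ruBERT + LogReg :', int_to_class.get(res3, res3), f'({t3:.3f}s)')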