Spaces:

DanilO0o
/

nlp_group_project

Running

+from PIL import Image, ImageFilter, ImageDraw
+import streamlit as st
+import pickle
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+from dataclasses import dataclass
+from typing import Union
+import re
+import string
+import pymorphy3
+from nltk.corpus import stopwords
+stop_words = set(stopwords.words("english"))
+# ------------------------------------------------------------#
+# Упрощенный метод создания класса
+@dataclass
+class ConfigRNN:
+    vocab_size: int  # сколько слов - столько embedding-ов; для инициализации embedding параметров
+    device: str
+    n_layers: int
+    embedding_dim: int  # чем больше, тем сложнее можно закодировать слово
+    hidden_size: int
+    seq_len: int
+    bidirectional: Union[bool, int]
+net_config = ConfigRNN(
+    vocab_size=17259 + 1,  # -> hand
+    device="cpu",
+    n_layers=1,
+    embedding_dim=8,  # не лучшее значение, но в рамках задачи сойдет
+    hidden_size=16,
+    seq_len=30,  # -> hand
+    bidirectional=False,
+)
+# ------------------------------------------------------------#
+class LSTMClassifier(nn.Module):
+    def __init__(self, rnn_conf=net_config) -> None:
+        super().__init__()
+        self.embedding_dim = rnn_conf.embedding_dim
+        self.hidden_size = rnn_conf.hidden_size
+        self.bidirectional = rnn_conf.bidirectional
+        self.n_layers = rnn_conf.n_layers
+        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
+        self.lstm = nn.LSTM(
+            input_size=self.embedding_dim,
+            hidden_size=self.hidden_size,
+            bidirectional=self.bidirectional,
+            batch_first=True,
+            num_layers=self.n_layers,
+            dropout=0.5
+        )
+        self.bidirect_factor = 2 if self.bidirectional else 1
+        self.clf = nn.Sequential(
+            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
+            nn.Dropout(),
+            nn.Tanh(),
+            nn.Dropout(),
+            nn.Linear(32, 5)  # len(df['label'].unique())
+        )
+    def model_description(self):
+        direction = "bidirect" if self.bidirectional else "onedirect"
+        return f"lstm_{direction}_{self.n_layers}"
+    def forward(self, x: torch.Tensor):
+        embeddings = self.embedding(x)
+        out, _ = self.lstm(embeddings)
+        # print(out.shape)
+        # [все элементы батча, последний h_n, все элементы последнего h_n]
+        out = out[:, -1, :]
+        # print(out.shape)
+        out = self.clf(out)
+        return out
+# ------------------------------------------------------------#
+# Загрузка модели
+@st.cache_resource
+def load_model():
+    model = LSTMClassifier(net_config)
+    model.load_state_dict(torch.load(
+        "models/lstm_weights.pth", map_location=torch.device("cpu")))
+    model.eval()
+    return model
+model_lstm = load_model()
+# ------------------------------------------------------------#
+def padding(text_int: list, seq_len: int) -> np.ndarray:
+    """Make left-sided padding for input list of tokens
+    Args:
+        review_int (list): input list of tokens
+        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
+    Returns:
+        np.array: padded sequences
+    """
+    features = np.zeros((len(text_int), seq_len), dtype=int)
+    for i, review in enumerate(text_int):
+        if len(review) <= seq_len:
+            zeros = list(np.zeros(seq_len - len(review)))
+            new = zeros + review
+        else:
+            new = review[:seq_len]
+        features[i, :] = np.array(new)
+    return features
+morph = pymorphy3.MorphAnalyzer()
+def lemmatize(text):
+    # Разбиваем текст на слова
+    words = text.split()
+    # Лемматизируем каждое слово и убираем стоп-слова
+    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
+    # Собираем текст из лемматизированных слов
+    lemmatized_text = ' '.join(lemmatized_words)
+    return lemmatized_text
+def data_preprocessing(text):
+    # From Phase 1
+    text = re.sub(r':[a-zA-Z]+:', '', text)  # Убираем смайлики
+    text = text.lower()  # Переводим текст в нижний регистр
+    text = re.sub(r'@[\w_-]+', '', text)  # Убираем упоминания пользователей
+    text = re.sub(r'#(\w+)', '', text)  # Убираем хэштеги
+    text = re.sub(r'\d+', '', text)  # Убираем цифры
+    # Убираем ссылки
+    text = re.sub(
+        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
+    text = re.sub(r'\s+', ' ', text)  # Убираем лишние пробелы
+    # Удаление английских слов
+    text = ' '.join(re.findall(r'\b[а-яА-ЯёЁ]+\b', text))
+    # From Phase 2
+    text = re.sub("<.*?>", "", text)  # html tags
+    text = "".join([c for c in text if c not in string.punctuation])
+    splitted_text = [word for word in text.split() if word not in stop_words]
+    text = " ".join(splitted_text)
+    return text.strip()
+def preprocess_single_string(
+    input_string: str,
+    seq_len: int,
+    vocab_to_int: dict,
+    verbose: bool = False
+) -> Tensor:
+    """Function for all preprocessing steps on a single string
+    Args:
+        input_string (str): input single string for preprocessing
+        seq_len (int): max length of sequence, it len(review_int[i]) > seq_len it will be trimmed, else it will be padded by zeros
+        vocab_to_int (dict, optional): word corpus {'word' : int index}. Defaults to vocab_to_int.
+    Returns:
+        list: preprocessed string
+    """
+    preprocessed_string = lemmatize(input_string)
+    preprocessed_string = data_preprocessing(input_string)
+    result_list = []
+    for word in preprocessed_string.split():
+        try:
+            result_list.append(vocab_to_int[word])
+        except KeyError as e:
+            if verbose:
+                print(f'{e}: not in dictionary!')
+            pass
+    result_padded = padding([result_list], seq_len)[0]
+    return Tensor(result_padded)
+# ------------------------------------------------------------#
+st.title("Классификация тематики новостей из телеграм каналов")
+# st.write('Model summary:')
+text = st.text_input('Input some news')
+text_4_test = text
+# Загрузка словаря из файла
+with open('model/vocab_to_int.pkl', 'rb') as f:
+    vocab_to_int = pickle.load(f)
+if text != '':
+    test_review = preprocess_single_string(
+        text_4_test, net_config.seq_len, vocab_to_int)
+    test_review = torch.tensor(test_review, dtype=torch.int64)
+    result = torch.sigmoid(model_lstm(test_review.unsqueeze(0)))
+    num = result.argmax().item()
+st.write('---')
+st.write('Initial text:')
+st.write(text)
+st.write('---')
+st.write('Preprocessing:')
+st.write(data_preprocessing(text))
+st.write('---')
+st.write('Classes:')
+classes = ['крипта', 'мода', 'спорт', 'технологии', 'финансы']
+st.write('крипта *', 'мода *', 'спорт *', 'технологии *', 'финансы')
+st.write('---')
+st.write('Predict:')
+if text != '':
+    st.write('Classification: ', classes[num])
+    st.write('Label num: ', num)
+# Загружаем изображение через PIL
+image = Image.open("images/tg_metrics.png")
+# Отображение
+st.image(image, caption="Кошмареус переобучения", use_column_width=True)