import re
import string

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Normalizer

# Fetch the NLTK resources at import time (no-op when already present).
nltk.download('stopwords')
nltk.download('punkt')


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalizes raw text.

    Each document is stripped of disallowed characters, URLs and ASCII
    punctuation, lower-cased, split on whitespace, filtered against the
    NLTK Russian stop-word list and lemmatized with pymorphy2.
    """

    # Compiled once per class instead of once per document.
    # Keeps Latin/Cyrillic letters, digits, whitespace and the marks . , ! ? ; :
    _CLEAN_PATTERN = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    # NOTE: the `https\S+` alternative is already covered by `http\S+`;
    # the pattern is kept as-is so the produced features do not change.
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = set(stopwords.words('russian'))
        self.morph = MorphAnalyzer()

    def preprocess_text(self, text):
        """Return the cleaned, lemmatized form of a single document.

        Args:
            text: The raw document. Missing values (``None``, ``NaN``,
                ``pd.NA``) are treated as an empty document.

        Returns:
            A string of space-separated lemmas; empty when nothing survives
            cleaning or when ``text`` is a missing value.
        """
        # Text columns frequently contain missing values; the regex calls
        # below would raise TypeError on them, so map them to "" up front.
        if not isinstance(text, str):
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ''

        # Drop every character outside the allowed set. Runs before URL
        # removal: a URL loses characters such as "/" here but stays one
        # whitespace-free run, so the URL pattern still removes it.
        text = self._CLEAN_PATTERN.sub('', text)
        text = self._URL_PATTERN.sub('', text)
        # Remove the punctuation marks that survived the first step.
        text = text.translate(self._PUNCT_TABLE)
        text = text.lower()

        # Stop words are matched on the lower-cased surface form, i.e.
        # before lemmatization; the lemma is taken from the first parse
        # returned by the analyzer.
        lemmas = [
            self.morph.parse(word)[0].normal_form
            for word in text.split()
            if word not in self.stop_words
        ]
        return ' '.join(lemmas)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`preprocess_text` to every document in ``X``.

        Args:
            X: A pandas Series of documents, or any other iterable of
                documents (list, tuple, array), which is wrapped in a
                Series first.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            The result of ``X.apply(self.preprocess_text)`` - for a Series
            input, a Series of cleaned strings with the same index.
        """
        if not hasattr(X, 'apply'):
            # Plain iterables have no .apply(); object dtype keeps an empty
            # input from being inferred as a numeric Series.
            X = pd.Series(list(X), dtype=object)
        return X.apply(self.preprocess_text)