"""Text cleaning transformer for Russian-language input to BERT-style models."""

import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin

# Fetch the stop-word corpus at import time so the lookups below succeed.
nltk.download('stopwords')

stop_words = set(stopwords.words("russian"))

# Patterns are compiled once at import instead of on every clean_text() call.
# Matches any whitespace-delimited token fragment starting at "http" or "www"
# ("http\S+" already covers "https...").
_URL_RE = re.compile(r'http\S+|www\S+')
# Matches every character that is NOT a Latin/Cyrillic letter, a digit,
# one of . , ! ? ; : or whitespace.
_DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')


class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Stateless transformer that strips URLs, symbols and Russian stop words.

    Attributes:
        stop_words: Set of words dropped by ``clean_text``. Initialised to the
            NLTK Russian stop-word list; may be replaced on the instance.
    """

    def __init__(self):
        # Copy the module-level set so per-instance edits do not leak into
        # other instances or the module global.
        self.stop_words = set(stop_words)

    def clean_text(self, text: str) -> str:
        """Return ``text`` with URLs, disallowed characters and stop words removed.

        Args:
            text: A single document.

        Returns:
            The cleaned document, with tokens separated by single spaces and
            no leading or trailing whitespace.
        """
        # Strip URLs first, while they are still intact; running the character
        # filter first would delete "/" and similar and mangle them.
        text = _URL_RE.sub('', text)
        text = _DISALLOWED_CHARS_RE.sub('', text)
        # split() with no arguments splits on any whitespace run, so joining
        # with a single space also normalises whitespace.
        # The stop-word list is lowercase, so compare case-insensitively to
        # also drop capitalised stop words (e.g. at the start of a sentence).
        kept_words = [
            word for word in text.split()
            if word.lower() not in self.stop_words
        ]
        return " ".join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state.

        Args:
            text: Ignored.
            y: Ignored. Accepted so that ``fit_transform(X, y)`` and
                supervised pipelines can call this method.

        Returns:
            This instance.
        """
        return self

    def transform(self, text):
        """Clean a single document or a collection of documents.

        Args:
            text: Either one ``str``, a ``pandas.Series`` of strings, or any
                other iterable of strings.

        Returns:
            A ``str`` for ``str`` input, a ``pandas.Series`` for Series input
            (index preserved), otherwise a ``list`` of cleaned strings.
        """
        if isinstance(text, str):
            return self.clean_text(text)
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return [self.clean_text(document) for document in text]