import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.corpus import stopwords


# Import-time side effect: request the NLTK "stopwords" corpus so that the
# `stopwords.words(...)` lookups in this module can succeed. This runs on every
# import of the module.
nltk.download('stopwords')
# Module-level set of the Russian stop words from that corpus (a set gives
# O(1) membership tests).
stop_words = set(stopwords.words("russian"))

class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans raw text.

    Cleaning consists of stripping characters outside an allow-list, removing
    URL-like tokens, collapsing whitespace and dropping Russian stop words.
    The transformer learns nothing from data; ``fit`` is a no-op.

    Attributes:
        stop_words: Set of tokens removed by ``clean_text``. Populated in
            ``__init__`` from the NLTK Russian stop-word list; may be replaced
            on the instance to customise filtering.
    """

    # Patterns are compiled once at class-definition time rather than on
    # every call to ``clean_text``.
    # Matches anything that is NOT a Latin/Cyrillic letter, a digit, one of
    # ``. , ! ? ; :`` or whitespace.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    # Matches whitespace-free tokens starting with "http"/"https"/"www".
    _URL = re.compile(r'http\S+|www\S+|https\S+')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # Requires the NLTK "stopwords" corpus to be available locally.
        self.stop_words = set(stopwords.words('russian'))

    def clean_text(self, text):
        """Return a cleaned copy of a single string.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned string: allowed characters only, URL-like tokens
            removed, whitespace-separated tokens joined by single spaces and
            tokens found in ``self.stop_words`` dropped.

        Note:
            Stop-word matching is an exact, case-sensitive comparison of each
            whitespace-separated token, so a token with attached punctuation
            (e.g. ``"и,"``) is not treated as a stop word.
        """
        text = self._DISALLOWED_CHARS.sub('', text)
        # Characters such as "/" are already gone at this point, but what is
        # left of a URL is still a single whitespace-free token beginning with
        # "http"/"www", so the pattern below still removes it.
        text = self._URL.sub('', text)
        text = self._WHITESPACE.sub(' ', text)
        # Use the instance's stop words (not a module global) so that the
        # attribute set in ``__init__`` is what actually drives filtering.
        kept_words = [word for word in text.split() if word not in self.stop_words]
        return " ".join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``; present for the estimator API.

        Args:
            text: Ignored.
            y: Ignored. Accepted so that ``fit(X, y)`` / ``fit_transform(X, y)``
                calls do not fail.

        Returns:
            This transformer instance.
        """
        return self

    def transform(self, text):
        """Clean a single string or a collection of strings.

        Args:
            text: A ``str``, a ``pandas.Series`` of strings, or any other
                iterable of strings.

        Returns:
            A cleaned ``str`` for ``str`` input, a ``pandas.Series`` for
            Series input, otherwise a ``list`` of cleaned strings.
        """
        if isinstance(text, str):
            return self.clean_text(text)
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return [self.clean_text(item) for item in text]