File size: 1,100 Bytes
ecbd4e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.corpus import stopwords
import string

# Fetch the NLTK stop-word corpus only when it is not already installed, so
# that merely importing this module does not contact the download server
# every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Module-level set of Russian stop words.
stop_words = set(stopwords.words("russian"))

class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Stateless text-cleaning transformer with a scikit-learn interface.

    ``clean_text`` applies, in order:

    1. removal of every character that is not a Latin or Cyrillic letter,
       a digit, one of ``. , ! ? ; :`` or whitespace;
    2. removal of tokens that start with ``http`` or ``www``;
    3. collapsing of whitespace runs into a single space;
    4. removal of Russian stop words (whitespace-delimited tokens).
    """

    # Patterns are compiled once for the class rather than on every call.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _URL = re.compile(r'http\S+|www\S+|https\S+')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        self.stop_words = set(stopwords.words('russian'))

    def clean_text(self, text: str) -> str:
        """Return a cleaned copy of a single string.

        Args:
            text: Raw input string.

        Returns:
            The string with disallowed characters, URL-like tokens and
            stop words removed, and tokens joined by single spaces.
        """
        # Keep only letters, digits, basic punctuation and whitespace.
        text = self._DISALLOWED_CHARS.sub('', text)
        # The step above never removes whitespace, so a URL is still one
        # whitespace-delimited token here and is dropped as a whole.
        text = self._URL.sub('', text)
        text = self._WHITESPACE.sub(' ', text)
        # NOTE(review): the membership test is case-sensitive and tokens keep
        # attached punctuation, so e.g. a capitalised stop word is retained
        # if the corpus list is lower-case -- confirm this is intended.
        kept_words = [
            word for word in text.split() if word not in self.stop_words
        ]
        return " ".join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``; the transformer learns no state.

        ``y`` is accepted (and ignored) so the transformer can be fitted
        inside a scikit-learn ``Pipeline`` that passes targets along.
        """
        return self

    def transform(self, text):
        """Clean a single string or a collection of strings.

        Args:
            text: A ``str``, a ``pandas.Series`` of strings, or any other
                iterable of strings.

        Returns:
            A cleaned ``str`` for ``str`` input, a ``pandas.Series`` for
            ``Series`` input, otherwise a ``list`` of cleaned strings.
        """
        if isinstance(text, str):
            return self.clean_text(text)
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return [self.clean_text(item) for item in text]