Spaces:
Sleeping
Sleeping
File size: 1,100 Bytes
ecbd4e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.corpus import stopwords
import string
# Request the NLTK "stopwords" resource at import time; the stopwords.words()
# call below reads from it. NOTE: this is a module-level side effect.
nltk.download('stopwords')
# Russian stop words, materialized as a set so membership tests are O(1).
stop_words = set(stopwords.words("russian"))
class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans raw text.

    Cleaning steps, in order:

    1. drop every character that is not a Latin/Cyrillic letter, a digit,
       one of ``. , ! ? ; :`` or whitespace;
    2. drop tokens that start with ``http`` / ``www`` (URL remnants);
    3. collapse whitespace runs into a single space;
    4. drop whitespace-separated tokens found in ``self.stop_words``.

    Attributes:
        stop_words: set of Russian stop words used by ``clean_text``. It may
            be replaced or extended on an instance to customize filtering.
    """

    # Patterns are compiled once at class creation rather than on every call.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _URL = re.compile(r'http\S+|www\S+|https\S+')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        self.stop_words = set(stopwords.words('russian'))

    def clean_text(self, text):
        """Return a cleaned copy of a single string.

        Args:
            text: the string to clean.

        Returns:
            The cleaned string with tokens joined by single spaces.
        """
        # Remove everything except letters, digits, basic punctuation and
        # whitespace. This runs before URL removal, so a URL loses its
        # slashes here but stays one whitespace-free token for the next step.
        text = self._DISALLOWED_CHARS.sub('', text)
        text = self._URL.sub('', text)
        text = self._WHITESPACE.sub(' ', text)
        # NOTE: matching is exact, so it is case-sensitive and a token with
        # punctuation attached (e.g. a trailing comma) is never filtered out.
        kept_words = [
            word for word in text.split() if word not in self.stop_words
        ]
        return ' '.join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``; the transformer is stateless.

        ``y`` is accepted and ignored so that ``fit_transform(X, y)`` and
        supervised pipelines can call this method without a ``TypeError``.
        """
        return self

    def transform(self, text):
        """Clean a single string or a collection of strings.

        Args:
            text: a ``str``, a ``pandas.Series`` of strings, or any other
                iterable of strings.

        Returns:
            A cleaned ``str`` for ``str`` input, a ``pandas.Series`` for
            ``Series`` input, otherwise a ``list`` of cleaned strings.
        """
        if isinstance(text, str):
            return self.clean_text(text)
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return [self.clean_text(item) for item in text]