Spaces:
Sleeping
Sleeping
import re | |
import pandas as pd | |
import numpy as np | |
from sklearn.base import BaseEstimator, TransformerMixin | |
import nltk | |
from nltk.corpus import stopwords | |
import string | |
# Fetch the NLTK stop-word corpus only when it is not installed yet, so that
# importing this module does not require network access on every start.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Module-level set of Russian stop words (set membership tests are O(1)).
stop_words = set(stopwords.words("russian"))
class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible text cleaner.

    Cleaning keeps only Latin/Cyrillic letters, digits, basic punctuation
    (``. , ! ? ; :``) and whitespace, strips URL-like tokens, collapses
    whitespace and drops Russian stop words.
    """

    # Patterns are compiled once for the class instead of on every call.
    # Everything outside letters, digits, basic punctuation and whitespace.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    # Tokens that start like a URL, up to the next whitespace.
    _URL = re.compile(r'http\S+|www\S+|https\S+')

    def __init__(self):
        # Stop words are stored per instance so they can be inspected or
        # adjusted without touching module-level state.
        self.stop_words = set(stopwords.words('russian'))

    def clean_text(self, text):
        """Return a cleaned copy of a single string.

        Args:
            text: The raw input string.

        Returns:
            The string with disallowed characters and URL-like tokens
            removed, stop words dropped, and the remaining words joined by
            single spaces.
        """
        text = self._DISALLOWED_CHARS.sub('', text)
        text = self._URL.sub('', text)
        # split() with no arguments splits on any run of whitespace, so the
        # split/join pair below also normalizes whitespace to single spaces.
        kept_words = [
            word for word in text.split() if word not in self.stop_words
        ]
        return " ".join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``; the cleaner has nothing to learn.

        ``y`` is accepted (and ignored) so the transformer can be used in
        scikit-learn pipelines that pass targets to ``fit``.
        """
        return self

    def transform(self, text):
        """Clean a single string or a collection of strings.

        Args:
            text: A ``str``, a ``pandas.Series`` of strings, or any other
                iterable of strings.

        Returns:
            A cleaned ``str`` for ``str`` input, a ``pandas.Series`` for
            ``Series`` input, otherwise a ``list`` of cleaned strings.
        """
        if isinstance(text, str):
            return self.clean_text(text)
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return [self.clean_text(item) for item in text]