nlp-bert-team / bot /preprocess_text.py
VerVelVel's picture
bot and new weights
cdb0abe
raw
history blame
No virus
1.09 kB
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus only when it is not already available
# locally, so importing this module does not trigger a download attempt
# (and its console output) on every start-up.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Module-level set of Russian stop words, built once at import time.
stop_words = set(stopwords.words("russian"))
class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans raw text.

    Cleaning consists of three steps, applied in this order:

    1. drop every character that is not a Latin or Cyrillic letter, a digit,
       one of ``. , ! ? ; :`` or whitespace;
    2. drop URL-like tokens (anything starting at ``http`` or ``www`` up to
       the next whitespace);
    3. drop Russian stop words and normalise whitespace to single spaces.

    Stop-word matching is case-sensitive: tokens are compared as-is, without
    lowercasing.
    NOTE(review): if the stop-word list is all lowercase, capitalised stop
    words survive the filter -- confirm whether that is intended before
    changing it, since it alters the text any downstream model receives.
    """

    # Patterns are compiled once for the class rather than on every call.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    _URL = re.compile(r'http\S+|www\S+|https\S+')

    def __init__(self):
        # Per-instance copy of the Russian stop-word list used by clean_text.
        self.stop_words = set(stopwords.words('russian'))

    def clean_text(self, text: str) -> str:
        """Return ``text`` with unwanted characters, URLs and stop words removed.

        Args:
            text: A single raw string.

        Returns:
            The cleaned string; remaining tokens are joined by single spaces
            and the result has no leading or trailing whitespace.
        """
        # Remove everything that is not a letter, digit, basic punctuation
        # mark or whitespace.
        text = self._DISALLOWED_CHARS.sub('', text)
        # URLs are removed after the character filter, so characters such as
        # '/' are already gone; the pattern still matches because it only
        # keys on the 'http' / 'www' prefix and runs to the next whitespace.
        text = self._URL.sub('', text)
        # split() with no separator splits on runs of whitespace, so joining
        # the kept tokens with ' ' also collapses repeated whitespace.
        kept_words = [
            word for word in text.split() if word not in self.stop_words
        ]
        return " ".join(kept_words)

    def fit(self, text, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state.

        Args:
            text: Ignored. Present for scikit-learn API compatibility.
            y: Ignored. Accepted so the transformer can be used where a
                target is passed to ``fit`` (e.g. inside a pipeline).

        Returns:
            This transformer instance.
        """
        return self

    def transform(self, text):
        """Clean a single string or every element of a pandas Series.

        Args:
            text: Either one string or a ``pandas.Series`` of strings.

        Returns:
            A cleaned string for string input, or a new Series with
            ``clean_text`` applied element-wise for Series input.
        """
        if isinstance(text, pd.Series):
            return text.apply(self.clean_text)
        return self.clean_text(text)