# File: nlp-bert-team / models / model1 / Custom_class.py
# Committed by: VerVelVel
# Commit message: for logreg custom class
# Commit: 087390d
# File size: 1.5 kB
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
import joblib
import nltk
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import string
# Fetch the NLTK resources at import time, in the same order as before.
# 'stopwords' backs the Russian stop-word list used by TextPreprocessor;
# 'punkt' is not referenced in this part of the file (presumably needed
# elsewhere -- verify before removing).
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that cleans and lemmatizes text.

    Each document is stripped of characters outside a Latin/Cyrillic/digit/
    basic-punctuation whitelist, stripped of URL-like tokens and punctuation,
    lower-cased, split on whitespace, filtered against the NLTK Russian
    stop-word list and lemmatized with pymorphy2.

    The transformer is stateless with respect to the data: ``fit`` learns
    nothing and simply returns ``self``.
    """

    # Built once at class creation instead of on every preprocess_text call.
    # Keeps Latin and Cyrillic letters, digits, whitespace and ``.,!?;:``.
    _CLEAN_PATTERN = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
    # Any whitespace-delimited run that starts with "http" or "www".
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
    # Deletion table for every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        # Russian stop words from the NLTK corpus, as a set for O(1) lookups.
        self.stop_words = set(stopwords.words('russian'))
        # pymorphy2 analyzer used to obtain the normal form of each token.
        self.morph = MorphAnalyzer()

    def preprocess_text(self, text):
        """Clean a single document and return its lemmatized form.

        Args:
            text: The raw document. Missing values (``None``, ``NaN``,
                ``pd.NA``) are treated as an empty document.

        Returns:
            A string of space-separated lemmas with stop words removed;
            an empty string when nothing survives the cleaning.

        Raises:
            TypeError: If ``text`` is neither a string nor a missing value.
        """
        # Missing entries would otherwise make the regex raise TypeError and
        # abort the whole transform.
        if (
            not isinstance(text, str)
            and pd.api.types.is_scalar(text)
            and pd.isna(text)
        ):
            return ''

        # Drop everything that is not a letter, digit, whitespace or one of
        # ``.,!?;:``. Characters are deleted, not replaced by a space, so
        # e.g. hyphenated words are merged into one token.
        # NOTE(review): the cleaning order is kept exactly as it was so the
        # output for string input is unchanged -- presumably a persisted
        # model depends on it; confirm before reordering.
        text = self._CLEAN_PATTERN.sub('', text)
        text = self._URL_PATTERN.sub('', text)
        text = text.translate(self._PUNCT_TABLE).lower()

        # Stop words are matched against the lower-cased surface form,
        # i.e. before lemmatization; the first pymorphy2 parse is used.
        lemmas = [
            self.morph.parse(token)[0].normal_form
            for token in text.split()
            if token not in self.stop_words
        ]
        return ' '.join(lemmas)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for the estimator API."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`preprocess_text` to every document in ``X``.

        Args:
            X: A pandas Series of documents, or any one-dimensional
                sequence (list, tuple, numpy array) of documents.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            A pandas Series of preprocessed strings. When ``X`` is a
            Series its index is preserved.
        """
        # Plain sequences have no ``.apply``; wrap them so the transformer
        # also works outside a pandas workflow. pandas objects take the
        # same path as before.
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            X = pd.Series(X, dtype=object)
        return X.apply(self.preprocess_text)