Spaces:

Project-nlp
/

nlp-bert-team

Sleeping

nlp-bert-team / models /model2 /preprocess_text.py

second page

ecbd4e2 4 months ago

1.1 kB

	import re
	import pandas as pd
	import numpy as np
	from sklearn.base import BaseEstimator, TransformerMixin
	import nltk
	from nltk.corpus import stopwords
	import string

	# Load the Russian stop-word list, downloading the NLTK corpus only when it
	# is not installed yet. The corpus reader raises LookupError for a missing
	# resource, so an already-provisioned environment never touches the network
	# at import time.
	try:
	    stop_words = set(stopwords.words("russian"))
	except LookupError:
	    nltk.download('stopwords')
	    stop_words = set(stopwords.words("russian"))

	class TextPreprocessorBERT(BaseEstimator, TransformerMixin):
	    """Text-cleaning transformer exposing the scikit-learn fit/transform API.

	    Cleaning steps, in order:
	      1. remove URLs (tokens starting with ``http``/``https``/``www``);
	      2. drop every character that is not a Latin or Cyrillic letter, a
	         digit, one of ``. , ! ? ; :`` or whitespace;
	      3. collapse runs of whitespace into a single space;
	      4. remove Russian stop words (exact, case-sensitive token match, so a
	         stop word with attached punctuation or in upper case is kept).
	    """

	    # Compiled once at class creation instead of on every clean_text call.
	    # URLs are matched with real alternation ('|'); the previous pattern
	    # escaped the pipes and therefore only matched literal '|' characters.
	    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
	    # Everything outside letters, digits, basic punctuation and whitespace.
	    _DISALLOWED_PATTERN = re.compile(r'[^a-zA-Zа-яА-ЯёЁ0-9.,!?;:\s]')
	    _WHITESPACE_PATTERN = re.compile(r'\s+')

	    def __init__(self):
	        self.stop_words = set(stopwords.words('russian'))

	    def clean_text(self, text):
	        """Return a cleaned copy of a single string.

	        Args:
	            text: The raw string to clean.

	        Returns:
	            The cleaned string with tokens separated by single spaces.
	        """
	        # Strip URLs first, while they are still intact; the character
	        # filter below would otherwise delete '/' and similar symbols.
	        text = self._URL_PATTERN.sub('', text)
	        text = self._DISALLOWED_PATTERN.sub('', text)
	        text = self._WHITESPACE_PATTERN.sub(' ', text)
	        # Use the instance's stop-word set rather than a module global so
	        # the transformer is self-contained.
	        kept_words = [
	            word for word in text.split() if word not in self.stop_words
	        ]
	        return " ".join(kept_words)

	    def fit(self, text, y=None):
	        """Do nothing and return ``self``; the transformer is stateless.

	        ``y`` is accepted (and ignored) so the estimator can be used with
	        ``fit_transform(X, y)`` and inside supervised pipelines.
	        """
	        return self

	    def transform(self, text):
	        """Clean a single string or a collection of strings.

	        Args:
	            text: A ``str``, a ``pandas.Series`` of strings, or any other
	                iterable of strings.

	        Returns:
	            A cleaned ``str`` for ``str`` input, a ``pandas.Series`` for
	            Series input, otherwise a ``list`` of cleaned strings.
	        """
	        if isinstance(text, str):
	            return self.clean_text(text)
	        if isinstance(text, pd.Series):
	            return text.apply(self.clean_text)
	        return [self.clean_text(item) for item in text]