import re | |
import warnings | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from bs4 import BeautifulSoup | |
import nltk | |
class TextCleaner:
    """Normalize free-form text into a space-separated string of unique lemmas.

    Constructing an instance downloads the NLTK corpora the cleaner relies on,
    then builds the English stop-word set and a WordNet lemmatizer.
    """

    # Alternatives, in order: http(s) URLs, "www." URLs, @mentions, #hashtags,
    # and finally any single character that is not an ASCII letter.
    _NOISE_PATTERN = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    # 'stopwords' backs the stop-word filter; 'wordnet' backs the lemmatizer,
    # which raises LookupError on first use when that corpus is not installed.
    _REQUIRED_CORPORA = ('stopwords', 'wordnet')

    def __init__(self):
        """Fetch the required NLTK corpora and prepare the cleaning helpers."""
        # NOTE(review): this silences every warning process-wide, not only the
        # ones triggered by this class -- confirm that is intended.
        warnings.filterwarnings("ignore")
        for corpus in self._REQUIRED_CORPORA:
            nltk.download(corpus)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text) -> str:
        """Return *text* reduced to unique, lemmatized, lower-case words.

        Steps, in order: strip HTML markup, lower-case, replace URLs, mentions,
        hashtags and every non-letter character with a space, drop one-letter
        words and English stop words, lemmatize what is left, and remove
        repeated words while keeping first-occurrence order.

        Args:
            text: The raw input. Anything that is not a non-empty ``str``
                (``None``, numbers, lists, ...) is treated as "no text".

        Returns:
            The cleaned words joined by single spaces, or ``''`` when the
            input is not a non-empty string or nothing survives the filters.
        """
        # Check the type first so a non-string input is never truth-tested;
        # some objects (e.g. NumPy arrays) raise when evaluated as a boolean.
        if not isinstance(text, str) or not text:
            return ''

        plain = BeautifulSoup(text, "html.parser").get_text()
        letters_only = self._NOISE_PATTERN.sub(' ', plain.lower())
        lemmas = (
            self.lemmatizer.lemmatize(word)
            for word in letters_only.split()
            if len(word) > 1 and word not in self.stop_words
        )
        # dict.fromkeys de-duplicates while preserving insertion order.
        return ' '.join(dict.fromkeys(lemmas))
if __name__ == "__main__":
    # Demo: build a cleaner and print the cleaned form of one sample sentence.
    sample = "I feel bullied online."
    cleaner = TextCleaner()
    cleaned = cleaner.cleaning_text(sample)
    print(cleaned)