File size: 947 Bytes
bd9870c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
class TextCleaner:
    """Turn raw, possibly HTML-laden text into a compact string of unique lemmas.

    The cleaning pipeline strips markup, lowercases the text, removes URLs,
    ``@mentions``, ``#hashtags`` and every non-letter character, drops
    one-character tokens and English stop words, lemmatizes what is left and
    finally removes repeated words while keeping first-occurrence order.

    Attributes are documented inline in ``__init__``.
    """

    # Compiled once for the class instead of on every call. Everything the
    # pattern matches (URLs, www-links, @mentions, #hashtags, and any single
    # character that is not an ASCII letter) is replaced by a space.
    _NOISE_PATTERN = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        """Load the English stop-word set and create the WordNet lemmatizer."""
        # NOTE(review): this installs a process-wide "ignore" filter for every
        # warning category, not just warnings raised by this class. Kept as-is
        # so existing callers see the same side effect; consider narrowing it.
        warnings.filterwarnings("ignore")
        # A set gives O(1) membership tests during token filtering.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        """Return the cleaned, deduplicated, lemmatized form of *text*.

        Args:
            text: The raw input. Anything that is falsy or not a ``str``
                is treated as "no text".

        Returns:
            A single space-separated string of lowercase lemmas in order of
            first appearance, or ``''`` when *text* is empty or not a string.
        """
        if not text or not isinstance(text, str):
            return ''

        # Use an explicit separator so text from adjacent elements
        # (e.g. "<p>stop</p><p>bullying</p>") is not glued into one token.
        plain = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        plain = self._NOISE_PATTERN.sub(' ', plain.lower())

        # Stop-word and length filtering apply to the surface token, i.e.
        # before lemmatization, matching the original order of operations.
        lemmas = (
            self.lemmatizer.lemmatize(word)
            for word in plain.split()
            if len(word) > 1 and word not in self.stop_words
        )

        # dict.fromkeys drops duplicates while preserving insertion order.
        return ' '.join(dict.fromkeys(lemmas))
if __name__ == "__main__":
    # Quick manual demo: clean one sample sentence and print the result.
    sample_sentence = "I feel bullied online."
    print(TextCleaner().cleaning_text(sample_sentence))
|