Spaces:
Sleeping
Sleeping
File size: 998 Bytes
3f4aa97 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from imports import *
def clean(text):
    """Normalize raw text to lowercase, letters-only, single-spaced form.

    Steps, in order: coerce the input to ``str``, remove ``http://`` and
    ``https://`` URLs, delete every character that is not an ASCII letter
    or whitespace, lowercase the result, and collapse whitespace runs.

    Args:
        text: Value to clean; it is passed through ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    text = str(text)
    # URLs must be removed before the letters-only filter: once ':', '/'
    # and '.' are stripped the URL pattern can no longer match, and the
    # address would survive as one long run-together word.
    text = re.sub(r'https?://\S+', '', text)
    # Keep only ASCII letters and whitespace (digits and punctuation go).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Collapse whitespace runs and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Function to lemmatize text
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Args:
        text: String to split on whitespace and lemmatize token by token.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    split_words = nltk.tokenize.WhitespaceTokenizer().tokenize
    to_lemma = nltk.stem.WordNetLemmatizer().lemmatize
    return " ".join(map(to_lemma, split_words(text)))
# Function to preprocess texts for training
def preprocess_texts_train(texts, vectorizer=None):
    """Vectorize training texts, creating a TF-IDF vectorizer if needed.

    Args:
        texts: Iterable of training documents.
        vectorizer: Optional vectorizer to use. When ``None``, a new
            ``TfidfVectorizer`` is created and fitted on ``texts``.

    Returns:
        A ``(vectorizer, transformed_texts)`` tuple: the vectorizer that
        was used and the result of ``vectorizer.transform(texts)``.
    """
    # NOTE(review): indentation was lost in the reviewed copy of this file.
    # This assumes fit() belongs to the "no vectorizer supplied" branch, so
    # a caller-provided vectorizer is used as-is -- confirm against callers.
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(texts)
    return vectorizer, vectorizer.transform(texts)
# Function to preprocess new texts
def preprocess_new_texts(texts, vectorizer):
    """Vectorize unseen texts with an existing vectorizer.

    Args:
        texts: Iterable of documents to transform.
        vectorizer: Object exposing ``transform``; it is only used here,
            never fitted.

    Returns:
        Whatever ``vectorizer.transform(texts)`` returns.
    """
    return vectorizer.transform(texts)