# NOTE(review): the three lines below look like an export artifact (e.g. a
# Hugging Face "Spaces: Sleeping" status banner scraped along with the code),
# not Python; commented out so the module parses.
# Spaces:
# Sleeping
# Sleeping
from imports import * | |
def clean(text):
    """Normalize raw text for downstream NLP.

    Removes URLs, strips everything except letters and whitespace,
    lowercases, and collapses whitespace runs into single spaces.

    Parameters
    ----------
    text : any
        Input value; coerced with ``str()`` so non-string input is tolerated.

    Returns
    -------
    str
        Cleaned, lowercased, single-spaced text.
    """
    text = str(text)
    # Remove URLs BEFORE stripping punctuation: the previous order deleted
    # ':' and '/' first, so the URL pattern could never match and residue
    # like "httpsexamplecom" leaked into the cleaned text.
    text = re.sub(r'https?://\S+', '', text)
    # Keep only ASCII letters and whitespace.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lower case.
    text = text.lower()
    # Collapse repeated whitespace and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Function to lemmatize text | |
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of *text* with WordNet.

    Tokens are produced by NLTK's WhitespaceTokenizer, lemmatized with the
    default (noun) part of speech, and re-joined with single spaces.
    """
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text)]
    return " ".join(lemmas)
# Function to preprocess texts for training | |
def preprocess_texts_train(texts, vectorizer=None):
    """Fit a TF-IDF vectorizer on the training corpus and transform it.

    Parameters
    ----------
    texts : iterable of str
        Training documents.
    vectorizer : optional
        A vectorizer exposing ``fit_transform``; a fresh ``TfidfVectorizer``
        is created when None. NOTE: a vectorizer passed in is (re)fitted
        here — use ``preprocess_new_texts`` for inference-time data.

    Returns
    -------
    tuple
        ``(vectorizer, transformed_texts)`` — the fitted vectorizer and the
        transformed training matrix.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
    # Single-pass idiom: fit_transform replaces separate fit() + transform()
    # over the same corpus.
    transformed_texts = vectorizer.fit_transform(texts)
    return vectorizer, transformed_texts
# Function to preprocess new texts | |
def preprocess_new_texts(texts, vectorizer):
    """Transform *texts* with an already-fitted vectorizer.

    Unlike ``preprocess_texts_train``, this never refits: it only applies
    the vectorizer's existing vocabulary/weights to new documents.
    """
    return vectorizer.transform(texts)