import re

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

def clean(text):
    # Remove URLs first, before punctuation and digits are stripped
    text = re.sub(r'https?://\S+', '', str(text))
    # Keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lower case
    text = text.lower()
    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to lemmatize text (requires the NLTK 'wordnet' corpus,
# e.g. nltk.download('wordnet'))
def lemmatize_text(text):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))

# Function to preprocess texts for training: fits a TF-IDF vectorizer if one
# is not supplied, then transforms the texts with it
def preprocess_texts_train(texts, vectorizer=None):
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(texts)
    transformed_texts = vectorizer.transform(texts)
    return vectorizer, transformed_texts

# Function to preprocess new texts with an already-fitted vectorizer
def preprocess_new_texts(texts, vectorizer):
    transformed_texts = vectorizer.transform(texts)
    return transformed_texts