# Spaces: Paused
import string | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
import re | |
from torchtext.vocab import build_vocab_from_iterator, GloVe | |
import numpy as np | |
from sklearn.base import TransformerMixin | |
from sklearn.metrics import ConfusionMatrixDisplay | |
from keras.preprocessing.text import Tokenizer | |
import nltk | |
nltk.download('stopwords') | |
nltk.download('punkt') | |
from nltk.corpus import stopwords | |
from nltk.stem import WordNetLemmatizer | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import wordnet | |
import torch | |
import torch.nn as nn | |
import torch.optim as optim | |
import torch.nn.functional as F | |
from torchtext.data.utils import get_tokenizer | |
def download_if_non_existent(res_path, res_name):
    """Download an NLTK resource only when it cannot be located locally.

    Args:
        res_path: Resource path probed with ``nltk.data.find``
            (e.g. ``'corpora/stopwords'``).
        res_name: Package identifier passed to ``nltk.download`` when the
            probe raises ``LookupError``.
    """
    try:
        nltk.data.find(res_path)
    except LookupError:
        is_missing = True
    else:
        is_missing = False

    if is_missing:
        print(f'resource {res_path} not found. Downloading now...')
        nltk.download(res_name)
# Ensure the required NLTK resources are present at import time; each call
# downloads only when the lookup fails.
download_if_non_existent(res_path='corpora/stopwords', res_name='stopwords')
download_if_non_existent(
    res_path='taggers/averaged_perceptron_tagger',
    res_name='averaged_perceptron_tagger',
)
download_if_non_existent(res_path='corpora/wordnet', res_name='wordnet')
def fit_model(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline`` on the training split and plot its confusion matrix.

    Args:
        pipeline: Estimator exposing ``fit``; it is fitted in place.
        x_train: Training inputs passed to ``pipeline.fit``.
        y_train: Training targets passed to ``pipeline.fit``.
        x_test: Held-out inputs used for the confusion matrix.
        y_test: Held-out targets used for the confusion matrix.

    Returns:
        The ``ConfusionMatrixDisplay`` produced by ``from_estimator`` with
        ``normalize="true"``.
    """
    pipeline.fit(x_train, y_train)
    display = ConfusionMatrixDisplay.from_estimator(
        pipeline,
        x_test,
        y_test,
        normalize="true",
    )
    return display
class LinguisticPreprocessor(TransformerMixin):
    """Stateless text-cleaning transformer.

    ``transform`` runs these steps in order over an iterable of strings:
    strip HTML, delete punctuation, collapse runs of spaces, lemmatize each
    token, then drop stop words. ``fit`` learns nothing and returns ``self``.
    """

    def __init__(self):
        super().__init__()
        self.lemmatizer = WordNetLemmatizer()
        # Not used by any method of this class; kept because it is a public
        # attribute that outside code may read.
        self.tokenizer = Tokenizer()
        # ``stop`` keeps the corpus order as a list; ``stop_words`` is the
        # set used for membership tests in ``_remove_stopwords``.
        self.stop = stopwords.words('english')
        self.stop_words = set(self.stop)

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a list with every document in ``X`` cleaned.

        ``y`` is accepted and ignored.
        """
        stages = (
            self._remove_html_tags,
            self._remove_all_punctuations,
            self._remove_double_spaces,
            self._lemmatize,
            self._remove_stopwords,
        )
        for stage in stages:
            X = stage(X)
        return X

    def _remove_html_tags(self, X):
        """Replace each document by the text content of its parsed HTML."""
        return [BeautifulSoup(doc, 'html.parser').get_text() for doc in X]

    def _remove_all_punctuations(self, X):
        """Delete every ``string.punctuation`` character from each document."""
        # Characters are removed, not replaced by a space, so "a,b" -> "ab".
        punctuation = re.compile('[%s]' % re.escape(string.punctuation))
        return [punctuation.sub('', doc) for doc in X]

    def _remove_double_spaces(self, X):
        """Collapse every run of space characters into a single space."""
        space_run = re.compile(" +")
        return [space_run.sub(" ", doc) for doc in X]

    def _remove_stopwords(self, X):
        """Drop whitespace-separated words found in ``self.stop_words``."""
        # NOTE(review): the comparison is case-sensitive and runs after
        # punctuation was stripped, so capitalised or apostrophe-bearing
        # stop words presumably survive -- confirm this is intended.
        cleaned = []
        for doc in X:
            kept = [word for word in doc.split() if word not in self.stop_words]
            cleaned.append(" ".join(kept))
        return cleaned

    def _lemmatize(self, X):
        """Lemmatize every document in ``X``."""
        return [self._lemmatize_one_sentence(doc) for doc in X]

    def _lemmatize_one_sentence(self, sentence):
        """Tokenize ``sentence``, lemmatize each token, re-join with spaces."""
        tokens = nltk.word_tokenize(sentence)
        return " ".join(self.lemmatizer.lemmatize(token) for token in tokens)
def training_data(dataset_1, dataset_2, dataset_3):
    """Assemble a de-duplicated training set that shares no text with the test set.

    The training splits of the three datasets are concatenated, duplicate
    texts are dropped (the first occurrence and its label win), and every
    row whose text also appears in ``dataset_1``'s test split is removed.

    Args:
        dataset_1: Mapping with ``'train'`` and ``'test'`` splits, each
            providing ``'text'`` and ``'label'`` columns. Its test split is
            the one returned.
        dataset_2: Mapping with a ``'train'`` split (``'text'``/``'label'``).
        dataset_3: Mapping with a ``'train'`` split (``'text'``/``'label'``).

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where ``X_train`` and
        ``X_test`` are NumPy arrays, ``y_train`` is a list, and ``y_test``
        is the test label column exactly as supplied by ``dataset_1``.
    """
    sources = (dataset_1, dataset_2, dataset_3)

    X_test = dataset_1['test']['text']
    y_test = dataset_1['test']['label']

    # Flatten element by element rather than with ``+``: ``+`` concatenates
    # only plain lists, while pandas Series are added element-wise and
    # ndarrays raise, silently corrupting or aborting the merge.
    train_texts = [text for ds in sources for text in ds['train']['text']]
    train_labels = [label for ds in sources for label in ds['train']['label']]

    combined_train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
    combined_train_df = combined_train_df.drop_duplicates(subset=['text'])

    # Anti-join against the test texts so no test example leaks into training.
    leaks_into_test = combined_train_df['text'].isin(list(X_test))
    result_df = combined_train_df[~leaks_into_test]

    X_train = np.array(result_df['text'].tolist())
    y_train = result_df['label'].tolist()
    X_test = np.array(X_test)
    return X_train, y_train, X_test, y_test
class CNN(nn.Module):
    """Convolutional text classifier with one conv branch per filter size.

    Token indices are embedded, each branch slides a
    ``(filter_size, embed_size)`` kernel over the sequence, the branch
    outputs are max-pooled over time, concatenated, passed through dropout
    and projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (also the kernel width).
        n_filters: Output channels of every convolution branch.
        filter_sizes: Iterable of kernel heights, one branch per entry.
        dropout: Dropout probability applied to the pooled features.
        num_classes: Size of the output layer.
    """

    def __init__(self, vocab_size, embed_size, n_filters, filter_sizes, dropout, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.convs = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=1,
                    out_channels=n_filters,
                    kernel_size=(fs, embed_size),
                )
                for fs in filter_sizes
            ]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes) * n_filters, num_classes)

    def forward(self, text):
        """Return class logits for a batch of token-index sequences.

        Args:
            text: Integer tensor of token indices; the convolutions require
                it to be batched as ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # Add the single input channel expected by Conv2d.
        embedded = self.embedding(text).unsqueeze(1)

        # A kernel taller than the sequence makes Conv2d raise, so sequences
        # shorter than the largest filter are zero-padded at the end. The
        # height is read from the layers themselves so previously saved
        # instances need no extra attribute.
        tallest_kernel = max(
            (conv.kernel_size[0] for conv in self.convs), default=0
        )
        shortfall = tallest_kernel - embedded.shape[2]
        if shortfall > 0:
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        pooled = []
        for conv in self.convs:
            # The kernel spans the full embedding width, so the last
            # dimension collapses to 1 and can be squeezed away.
            feature_map = F.leaky_relu(conv(embedded)).squeeze(3)
            # Max over the whole time axis.
            pooled.append(
                F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2)
            )

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc1(cat)
def build_vocab(data_iter):
    """Build a torchtext vocabulary from the cleaned ``'text'`` of each example.

    Args:
        data_iter: Iterable of mappings with a ``'text'`` entry. It is
            consumed once.

    Returns:
        Tuple ``(vocab, tokenizer)``: the vocabulary, with ``"<unk>"`` and
        ``"<pad>"`` as special tokens and ``"<unk>"`` as the default index
        for unknown tokens, and the ``"basic_english"`` tokenizer used to
        build it.
    """
    tokenizer = get_tokenizer("basic_english")
    token_lists = (
        tokenizer(clean_text(record['text'])) for record in data_iter
    )
    vocab = build_vocab_from_iterator(token_lists, specials=["<unk>", "<pad>"])
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)
    return vocab, tokenizer
def clean_text(text):
    """Normalise one string for vocabulary building.

    Lower-cases the text, removes digit runs, removes every character that
    is neither a word character nor whitespace, drops stop words and
    lemmatizes the remaining words.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmatized words joined by single spaces.
    """
    # NOTE(review): ``lemmatizer`` and ``stop_words`` are read as module-level
    # names; they are not defined in the part of the file visible here --
    # confirm they exist before this function is called.
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)

    kept = []
    for word in without_punctuation.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)