|
--- |
|
license: apache-2.0 |
|
metrics: |
|
- accuracy |
|
pipeline_tag: text-classification |
|
tags: |
|
- CNN |
|
- NLP |
|
- Yelp |
|
- Reviews |
|
- pre_trained |
|
language: |
|
- en |
|
datasets: |
|
- yassiracharki/Yelp_Reviews_for_Binary_Senti_Analysis |
|
library_name: keras
|
--- |
|
# Model Card for a Pre-Trained Binary CNN for Yelp Review Sentiment

A convolutional neural network (CNN) text classifier for binary (positive/negative) sentiment analysis of English Yelp reviews, built with TensorFlow/Keras. The snippets below show how to install the dependencies, load the pre-trained model, and preprocess the Yelp review data.
|
|
|
# Downloads |
|
```python
!pip install contractions
!pip install textsearch
!pip install tqdm

import nltk
nltk.download('punkt')
```
|
|
|
```python
# Fundamental classes
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

# Time
import time
import datetime

# Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import contractions
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

seed = 3541
np.random.seed(seed)
```
|
|
|
```python
# Define a dummy loss to bypass the error during model loading
# (not strictly needed here, since the model is loaded with compile=False)
def dummy_loss(y_true, y_pred):
    return tf.reduce_mean(y_pred - y_true)

# Loading the model trained on Yelp reviews
modelYelp = keras.models.load_model(
    '/kaggle/input/pre-trained-model-binary-cnn-nlp-yelpreviews/tensorflow1/pre-trained-model-binary-cnn-nlp-yelp-reviews/1/Binary_Classification_90_Yelp_Reviews_CNN.h5',
    compile=False
)

# Compile the model with the correct loss function and reduction
modelYelp.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
    metrics=['accuracy']
)
```
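
Before building the tokenizer pipeline, it can help to confirm the input sequence length the network expects, since the padded review sequences must match it. A quick check on the model loaded above (the printed shape is illustrative, not a documented value):

```python
# Inspect the architecture; the input shape shows the sequence length
# that padded review sequences must match.
modelYelp.summary()
print(modelYelp.input_shape)
```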
|
|
|
```python
# Loading Yelp test data
dataset_test_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/test.csv')

# Loading Yelp train data (to be used on the label encoder)
dataset_train_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/train.csv')

# Shuffling the data
test_Yelp = dataset_test_Yelp.sample(frac=1)
train_Yelp = dataset_train_Yelp.sample(frac=1)

# Taking a tiny portion of the training set (it is only used to fit the label encoder)
train_Yelp = train_Yelp.iloc[:100, :]

# Taking only the necessary columns
y_test_Yelp = test_Yelp['class_index'].values
X_train_Yelp = train_Yelp['review_text'].values
y_train_Yelp = train_Yelp['class_index'].values
```
|
|
|
```python
# Preprocess corpus function
def pre_process_corpus(corpus):
    processed_corpus = []
    for doc in tqdm.tqdm(corpus):
        doc = contractions.fix(doc)                          # expand contractions (e.g. "don't" -> "do not")
        doc = BeautifulSoup(doc, "html.parser").get_text()   # strip HTML tags
        doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')  # drop non-ASCII characters
        doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)  # keep letters and whitespace only
        doc = doc.lower()
        doc = doc.strip()
        processed_corpus.append(doc)
    return processed_corpus

# Preprocessing the data
X_test_Yelp = pre_process_corpus(test_Yelp['review_text'].values)
X_train_Yelp = pre_process_corpus(X_train_Yelp)
```
|
|
|
# Creating and Fitting the Tokenizer

The tokenizer, label-encoding, padding, and evaluation steps are not reproduced in this card; the full pipeline is available on the Kaggle model page linked below.
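
For completeness, here is a minimal sketch of what that remaining step might look like with the standard Keras `Tokenizer` / `pad_sequences` workflow. The tokenizer settings, the `max_len` value, and fitting the tokenizer on the small `X_train_Yelp` sample are assumptions for illustration only; they are not guaranteed to match the exact pipeline the checkpoint was trained with (see the Kaggle page for that).

```python
# Sketch only: max_len and the tokenizer settings are assumptions, not the
# values used to train the checkpoint; check the Kaggle notebook for those.
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(X_train_Yelp)

# Encode the class labels (e.g. 1/2 -> 0/1)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_Yelp)
y_test_enc = le.transform(y_test_Yelp)

# Convert the preprocessed reviews to padded integer sequences
max_len = 200  # assumption: must match the input length reported by modelYelp.summary()
X_test_seq = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test_Yelp), maxlen=max_len)

# Evaluate and predict with the pre-trained model
loss, acc = modelYelp.evaluate(X_test_seq, y_test_enc, batch_size=128)
print('Test accuracy:', acc)
probabilities = modelYelp.predict(X_test_seq[:5])
```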
|
|
|
# More info on the model page on Kaggle
|
|
|
https://www.kaggle.com/models/yacharki/pre-trained-model-binary-cnn-nlp-yelpreviews |