---
license: apache-2.0
metrics:
- accuracy
pipeline_tag: text-classification
tags:
- CNN
- NLP
- Yelp
- Reviews
- pre_trained
language:
- en
datasets:
- yassiracharki/Yelp_Reviews_for_Binary_Senti_Analysis
library_name: fasttext
---

# Model Card for Model ID

# Downloads

```python
!pip install contractions
!pip install textsearch
!pip install tqdm

import nltk
nltk.download('punkt')

# Fundamental classes
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

# Time
import time
import datetime

# Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import contractions
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

# Fix NumPy's global RNG so the DataFrame shuffles below are reproducible
# (pandas .sample falls back to np.random when no random_state is given).
seed = 3541
np.random.seed(seed)

# NOTE(review): dummy_loss is never used -- the model is loaded with
# compile=False and then recompiled with a real BinaryCrossentropy loss
# below. Kept only for reference; it can safely be deleted.
def dummy_loss(y_true, y_pred):
    return tf.reduce_mean(y_pred - y_true)

# Load the pre-trained CNN for Yelp reviews. compile=False skips
# deserializing the saved training configuration (which errors on this
# checkpoint); the model is recompiled explicitly right after.
modelYelp = keras.models.load_model(
    '/kaggle/input/pre-trained-model-binary-cnn-nlp-yelpreviews/tensorflow1/pre-trained-model-binary-cnn-nlp-yelp-reviews/1/Binary_Classification_90_Yelp_Reviews_CNN.h5',
    compile=False
)

# Compile the model with the correct loss function and reduction.
modelYelp.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
    metrics=['accuracy']
)

# Loading Yelp test data
dataset_test_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/test.csv')

# Loading Yelp train data (to be used on the label encoder)
dataset_train_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/train.csv')

# Shuffle both splits (frac=1 returns all rows in random order;
# deterministic thanks to the np.random.seed call above).
test_Yelp = dataset_test_Yelp.sample(frac=1)
train_Yelp = dataset_train_Yelp.sample(frac=1)

# Taking a tiny portion of the database (because it will only be used on the label encoder)
```
```python
# Take a tiny portion of the training data (it is only used to fit the
# label encoder). FIX: slice the shuffled frame `train_Yelp`, not the raw
# `dataset_train_Yelp` -- the original discarded the shuffle and always
# took the first 100 rows of the unshuffled CSV (which may be a single
# class if the file is sorted by label).
train_Yelp = train_Yelp.iloc[:100, :]

# Taking only necessary columns
y_test_Yelp = test_Yelp['class_index'].values
X_train_Yelp = train_Yelp['review_text'].values
y_train_Yelp = train_Yelp['class_index'].values

def pre_process_corpus(corpus):
    """Normalize raw review texts for the tokenizer.

    For each document: expand contractions, strip HTML tags, fold
    accented characters to ASCII, remove everything except letters and
    whitespace, lowercase, and trim. Returns a list of cleaned strings.
    """
    processed_corpus = []
    for doc in tqdm.tqdm(corpus):
        doc = contractions.fix(doc)
        doc = BeautifulSoup(doc, "html.parser").get_text()
        doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # FIX: pass the regex flags via the `flags=` keyword. The original
        # passed re.I|re.A as re.sub's positional `count` argument, which
        # silently capped the substitution at the first 258 matches
        # instead of setting the flags.
        doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
        doc = doc.lower()
        doc = doc.strip()
        processed_corpus.append(doc)
    return processed_corpus

# Preprocessing the Data
X_test_Yelp = pre_process_corpus(test_Yelp['review_text'].values)
X_train_Yelp = pre_process_corpus(X_train_Yelp)
```

# Creating and Fitting the Tokenizer

etc ...

# More info on the Model page on Kaggle : https://www.kaggle.com/models/yacharki/pre-trained-model-binary-cnn-nlp-yelpreviews