Upload 4 files
- app.py +43 -0
- functions_preprocess.py +108 -0
- main.py +44 -0
- sentiment_model.pkl +3 -0
app.py
ADDED
@@ -0,0 +1,43 @@
import streamlit as st
from functions_preprocess import LinguisticPreprocessor
import pickle


#################################################################### Streamlit interface
st.title("Movie Reviews: An NLP Sentiment Analysis")

st.markdown("### NLP processing utilizing various ML approaches")
st.markdown("##### This initial approach merges multiple datasets, processed through a TF-IDF vectorizer with unigrams and bigrams and fed into a Stochastic Gradient Descent model.")
st.markdown("Give it a go by writing a positive or negative text, and analyze it!")


#################################################################### Cache the model loading
@st.cache_resource  # cache_resource keeps one instance of the loaded model across reruns
def load_model():
    model_pkl_file = "sentiment_model.pkl"
    with open(model_pkl_file, 'rb') as file:
        model = pickle.load(file)
    return model

model = load_model()
processor = LinguisticPreprocessor()

def predict_sentiment(text, model):
    # transform() expects an iterable of documents; use its result rather than discarding it
    processed = processor.transform([text])
    prediction = model.predict(processed)
    return prediction


############################################################# Text input
user_input = st.text_area("Enter text here...")

if st.button('Analyze'):
    # Display the output
    result = predict_sentiment(user_input, model)
    if result[0] >= 0.5:
        st.write('The sentiment is: Positive 😀')
    else:
        st.write('The sentiment is: Negative 😞')


st.caption("By @efeperro with ❤️. Credits to 🤗")
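
The pickled pipeline can also be exercised outside Streamlit for a quick sanity check. A minimal sketch, assuming sentiment_model.pkl sits in the working directory and holds the fitted Pipeline produced by main.py; the sample review strings are illustrative:

import pickle

with open("sentiment_model.pkl", "rb") as file:
    model = pickle.load(file)

# The pipeline handles preprocessing and vectorizing internally,
# so raw strings can be passed straight to predict()
print(model.predict(["What a wonderful, heartfelt movie!"]))
print(model.predict(["A dull, lifeless waste of two hours."]))
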
functions_preprocess.py
ADDED
@@ -0,0 +1,108 @@
import string
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics import ConfusionMatrixDisplay
from keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def download_if_non_existent(res_path, res_name):
    try:
        nltk.data.find(res_path)
    except LookupError:
        print(f'resource {res_path} not found. Downloading now...')
        nltk.download(res_name)

# The class below needs these NLTK resources (stopwords at construction,
# wordnet and punkt during transform)
download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('corpora/wordnet', 'wordnet')
download_if_non_existent('tokenizers/punkt', 'punkt')

def fit_model(pipeline, x_train, y_train, x_test, y_test):
    pipeline.fit(x_train, y_train)
    return ConfusionMatrixDisplay.from_estimator(pipeline, x_test, y_test, normalize="true")

class LinguisticPreprocessor(TransformerMixin):
    def __init__(self):
        super().__init__()
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = Tokenizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = self._remove_html_tags(X)
        X = self._remove_all_punctuations(X)
        X = self._remove_double_spaces(X)
        X = self._lemmatize(X)
        X = self._remove_stopwords(X)
        return X

    def _remove_html_tags(self, X):
        X = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))
        return X

    def _remove_all_punctuations(self, X):
        X = list(
            map(
                lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text),
                X
            )
        )
        return X

    def _remove_double_spaces(self, X):
        X = list(map(lambda text: re.sub(" +", " ", text), X))
        return X

    def _remove_stopwords(self, X):
        X = list(map(
            lambda text: " ".join(
                [word for word in text.split() if word not in self.stop_words]
            ),
            X
        ))
        return X

    def _lemmatize(self, X):
        X = list(map(lambda text: self._lemmatize_one_sentence(text), X))
        return X

    def _lemmatize_one_sentence(self, sentence):
        sentence = nltk.word_tokenize(sentence)
        sentence = list(map(lambda word: self.lemmatizer.lemmatize(word), sentence))
        return " ".join(sentence)

def training_data(dataset_1, dataset_2, dataset_3):
    X_test = dataset_1['test']['text']
    y_test = dataset_1['test']['label']

    test_df = pd.DataFrame({
        'text': X_test,
        'label': y_test
    })

    combined_train_df = pd.DataFrame({
        'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
        'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
    })

    combined_train_df.drop_duplicates(subset=['text'], inplace=True)

    # Drop training rows whose text also appears in the test split, to avoid leakage
    merged_df = pd.merge(combined_train_df, test_df, on="text", how='left', indicator=True)
    result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    X_train = result_df['text'].tolist()
    y_train = result_df['label_x'].tolist()
    X_test = np.array(X_test)
    X_train = np.array(X_train)

    return X_train, y_train, X_test, y_test
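
To see what the transformer actually does, LinguisticPreprocessor can be run on its own. A small illustrative snippet (the sample documents are made up):

from functions_preprocess import LinguisticPreprocessor

processor = LinguisticPreprocessor()
docs = ["<p>This movie was GREAT!!</p>", "I     hated   every minute..."]
# Strips HTML and punctuation, collapses spaces, lemmatizes, and drops stopwords
print(processor.transform(docs))
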
main.py
ADDED
@@ -0,0 +1,44 @@
from functions_preprocess import LinguisticPreprocessor, fit_model, training_data
from datasets import load_dataset
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def main():
    ##### Load the datasets
    dataset_1 = load_dataset("rotten_tomatoes")
    dataset_2 = load_dataset('sst2')
    dataset_2 = dataset_2.rename_column('sentence', 'text')
    dataset_3 = load_dataset('imdb')

    X_train, y_train, X_test, y_test = training_data(dataset_1, dataset_2, dataset_3)

    pipeline = Pipeline(
        steps=[
            ("processor", LinguisticPreprocessor()),
            ("vectorizer", TfidfVectorizer(ngram_range=(1, 2))),
            ("model", SGDClassifier(loss="log_loss", n_jobs=-1, alpha=0.000001, penalty='elasticnet'))])

    ####### Fit the model and save the results
    fit_model(pipeline, X_train, y_train, X_test, y_test)
    predictions = pipeline.predict(X_test)

    # Create a DataFrame with index and predictions
    results_df = pd.DataFrame({
        "index": range(len(predictions)),
        "pred": predictions})

    # Save the DataFrame to a CSV file
    results_df.to_csv("results.csv", index=False)


if __name__ == "__main__":
    main()


# To persist the fitted pipeline for app.py, run inside main() (requires `import pickle`):
# model_pkl_file = "sentiment_model.pkl"
#
# with open(model_pkl_file, 'wb') as file:
#     pickle.dump(pipeline, file)
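
fit_model only renders a normalized confusion matrix; a hedged sketch of also reporting a headline accuracy, reusing the variables from main() (accuracy_score is the standard scikit-learn helper):

from sklearn.metrics import accuracy_score

# Inside main(), after fit_model(...):
predictions = pipeline.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test, predictions):.3f}")
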
sentiment_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6bf5286ceecdd7bfcee30a629414a0685d6871470d13cce06eacc9b952729551
size 74212716
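
The model file itself lives in Git LFS; this pointer records only its SHA-256 and size (about 74 MB), so cloning the Space without LFS fetches the pointer rather than the pickled pipeline.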