efeperro committed on
Commit
93eddbd
1 Parent(s): f824a4f

Upload 4 files

Files changed (4)
  1. app.py +43 -0
  2. functions_preprocess.py +108 -0
  3. main.py +44 -0
  4. sentiment_model.pkl +3 -0
app.py ADDED
@@ -0,0 +1,43 @@
+ import streamlit as st
+ from functions_preprocess import LinguisticPreprocessor  # needed so pickle can resolve the class
+ import pickle
+
+
+ #################################################################### Streamlit interface
+ st.title("Movie Reviews: An NLP Sentiment Analysis")
+
+ st.markdown("### NLP processing utilizing various ML approaches")
+ st.markdown("##### This initial approach merges multiple datasets, processes them with a TF-IDF vectorizer over unigrams and bigrams, and feeds the result into a Stochastic Gradient Descent classifier.")
+ st.markdown("Give it a go by writing a positive or negative text, and analyze it!")
+
+
+ #################################################################### Cache the model loading
+ @st.cache_resource  # cache_resource suits unhashable objects such as a fitted pipeline
+ def load_model():
+     model_pkl_file = "sentiment_model.pkl"
+     with open(model_pkl_file, 'rb') as file:
+         model = pickle.load(file)
+     return model
+
+ model = load_model()
+
+ def predict_sentiment(text, model):
+     # The pickled model is the full pipeline from main.py (preprocessor,
+     # vectorizer, classifier), so raw text goes straight to predict().
+     prediction = model.predict([text])
+     return prediction
+
+
+ ############################################################# Text input
+ user_input = st.text_area("Enter text here...")
+
+ if st.button('Analyze'):
+     # Displaying output
+     result = predict_sentiment(user_input, model)
+     if result[0] >= 0.5:
+         st.write('The sentiment is: Positive 😀')
+     else:
+         st.write('The sentiment is: Negative 😞')
+
+
+ st.caption("By @efeperro with ❤️. Credits to 🤗")
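For a quick check outside Streamlit, a minimal smoke test of the exported pipeline might look like the sketch below. It assumes `sentiment_model.pkl` sits in the working directory and that `functions_preprocess` is importable so unpickling can resolve `LinguisticPreprocessor`; the sample reviews are illustrative.

```python
import pickle

from functions_preprocess import LinguisticPreprocessor  # noqa: F401 (needed by pickle)

# Load the fitted pipeline saved by main.py.
with open("sentiment_model.pkl", "rb") as file:
    model = pickle.load(file)

samples = [
    "An absolute delight from start to finish.",
    "Two hours of my life I will never get back.",
]
# predict() returns 0/1 labels: 1 = positive, 0 = negative.
for text, label in zip(samples, model.predict(samples)):
    print(label, "-", text)
```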
functions_preprocess.py ADDED
@@ -0,0 +1,108 @@
+ import string
+ import re
+ import numpy as np
+ import pandas as pd
+ from bs4 import BeautifulSoup
+ from sklearn.base import TransformerMixin
+ from sklearn.metrics import ConfusionMatrixDisplay
+ from keras.preprocessing.text import Tokenizer
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+
+
+ def download_if_non_existent(res_path, res_name):
+     try:
+         nltk.data.find(res_path)
+     except LookupError:
+         print(f'resource {res_path} not found. Downloading now...')
+         nltk.download(res_name)
+
+
+ def fit_model(pipeline, x_train, y_train, x_test, y_test):
+     pipeline.fit(x_train, y_train)
+     return ConfusionMatrixDisplay.from_estimator(pipeline, x_test, y_test, normalize="true")
+
+
+ class LinguisticPreprocessor(TransformerMixin):
+     def __init__(self):
+         super().__init__()
+         self.lemmatizer = WordNetLemmatizer()
+         self.tokenizer = Tokenizer()
+         self.stop_words = set(stopwords.words('english'))
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X, y=None):
+         # X is an iterable of raw documents; each step maps over the batch.
+         X = self._remove_html_tags(X)
+         X = self._remove_all_punctuations(X)
+         X = self._remove_double_spaces(X)
+         X = self._lemmatize(X)
+         X = self._remove_stopwords(X)
+         return X
+
+     def _remove_html_tags(self, X):
+         return list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))
+
+     def _remove_all_punctuations(self, X):
+         return list(map(lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text), X))
+
+     def _remove_double_spaces(self, X):
+         return list(map(lambda text: re.sub(" +", " ", text), X))
+
+     def _remove_stopwords(self, X):
+         return list(map(lambda text: " ".join(word for word in text.split() if word not in self.stop_words), X))
+
+     def _lemmatize(self, X):
+         return list(map(self._lemmatize_one_sentence, X))
+
+     def _lemmatize_one_sentence(self, sentence):
+         tokens = nltk.word_tokenize(sentence)
+         tokens = [self.lemmatizer.lemmatize(word) for word in tokens]
+         return " ".join(tokens)
+
+
+ def training_data(dataset_1, dataset_2, dataset_3):
+     X_test = dataset_1['test']['text']
+     y_test = dataset_1['test']['label']
+
+     test_df = pd.DataFrame({'text': X_test, 'label': y_test})
+
+     combined_train_df = pd.DataFrame({
+         'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
+         'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
+     })
+     combined_train_df.drop_duplicates(subset=['text'], inplace=True)
+
+     # Drop training rows whose text also appears in the test split,
+     # so the merged corpora do not leak test examples into training.
+     merged_df = pd.merge(combined_train_df, test_df, on="text", how='left', indicator=True)
+     result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
+
+     X_train = np.array(result_df['text'].tolist())
+     y_train = result_df['label_x'].tolist()
+     X_test = np.array(X_test)
+
+     return X_train, y_train, X_test, y_test
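The transformer leans on NLTK data that is not installed with the package. A minimal setup-and-usage sketch, assuming the standard NLTK resource identifiers (`stopwords`, `wordnet`, `punkt`) and an illustrative input:

```python
from functions_preprocess import LinguisticPreprocessor, download_if_non_existent

# Fetch the NLTK resources the preprocessor uses, if they are missing.
download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('corpora/wordnet', 'wordnet')
download_if_non_existent('tokenizers/punkt', 'punkt')

processor = LinguisticPreprocessor()
cleaned = processor.transform(["<b>The movie was great!!</b> Loved the acting."])
print(cleaned)  # HTML tags, punctuation and stopwords removed; tokens lemmatized
```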
main.py ADDED
@@ -0,0 +1,44 @@
+ from functions_preprocess import LinguisticPreprocessor, fit_model, training_data
+ from datasets import load_dataset
+ import pandas as pd
+ from sklearn.linear_model import SGDClassifier
+ from sklearn.pipeline import Pipeline
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ def main():
+     ##### load datasets
+     dataset_1 = load_dataset("rotten_tomatoes")
+     dataset_2 = load_dataset('sst2')
+     dataset_2 = dataset_2.rename_column('sentence', 'text')
+     dataset_3 = load_dataset('imdb')
+
+     X_train, y_train, X_test, y_test = training_data(dataset_1, dataset_2, dataset_3)
+
+     pipeline = Pipeline(
+         steps=[
+             ("processor", LinguisticPreprocessor()),
+             ("vectorizer", TfidfVectorizer(ngram_range=(1, 2))),
+             ("model", SGDClassifier(loss="log_loss", n_jobs=-1, alpha=0.000001, penalty='elasticnet'))])
+
+     ####### fit model and save the results
+     fit_model(pipeline, X_train, y_train, X_test, y_test)
+     predictions = pipeline.predict(X_test)
+
+     # Create a DataFrame with index and predictions
+     results_df = pd.DataFrame({
+         "index": range(len(predictions)),
+         "pred": predictions})
+
+     # Save the DataFrame to a CSV file
+     results_df.to_csv("results.csv", index=False)
+
+
+ if __name__ == "__main__":
+     main()
+
+
+ # To export the fitted pipeline for app.py, run inside main() after fitting:
+ # import pickle
+ # with open("sentiment_model.pkl", 'wb') as file:
+ #     pickle.dump(pipeline, file)
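`fit_model` returns a `ConfusionMatrixDisplay` that `main()` currently discards. A hedged sketch of persisting it next to `results.csv` (the matplotlib handling and the output filename are assumptions, not part of this commit):

```python
import matplotlib.pyplot as plt

# Inside main(): keep the display returned by fit_model and save its figure.
display = fit_model(pipeline, X_train, y_train, X_test, y_test)
display.figure_.savefig("confusion_matrix.png", dpi=150)
plt.close(display.figure_)
```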
sentiment_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bf5286ceecdd7bfcee30a629414a0685d6871470d13cce06eacc9b952729551
+ size 74212716