Upload 4 files
- app.py +43 -0
- functions_preprocess.py +108 -0
- main.py +44 -0
- sentiment_model.pkl +3 -0
app.py
ADDED
@@ -0,0 +1,43 @@
import streamlit as st
from functions_preprocess import LinguisticPreprocessor
import pickle


#################################################################### Streamlit interface
st.title("Movie Reviews: An NLP Sentiment Analysis")

st.markdown("### NLP processing utilizing various ML approaches")
st.markdown("##### This initial approach merges multiple datasets, processed through a TF-IDF vectorizer with unigrams and bigrams and fed into a Stochastic Gradient Descent model.")
st.markdown("Give it a go by writing a positive or negative text, and analyze it!")


#################################################################### Cache the model loading
@st.cache_resource  # cache_resource keeps one instance of the loaded model across reruns
def load_model():
    model_pkl_file = "sentiment_model.pkl"
    with open(model_pkl_file, 'rb') as file:
        model = pickle.load(file)
    return model

model = load_model()
processor = LinguisticPreprocessor()

def predict_sentiment(text, model):
    # transform() expects an iterable of documents; use its result rather than discarding it
    processed = processor.transform([text])
    prediction = model.predict(processed)
    return prediction


############################################################# Text input
user_input = st.text_area("Enter text here...")

if st.button('Analyze'):
    # Display the output
    result = predict_sentiment(user_input, model)
    if result[0] >= 0.5:
        st.write('The sentiment is: Positive 😀')
    else:
        st.write('The sentiment is: Negative 😞')


st.caption("By @efeperro with ❤️. Credits to 🤗")
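
The pickled pipeline can also be exercised outside Streamlit for a quick sanity check. A minimal sketch, assuming sentiment_model.pkl sits in the working directory and holds the fitted Pipeline produced by main.py; the sample review strings are illustrative:

import pickle

with open("sentiment_model.pkl", "rb") as file:
    model = pickle.load(file)

# The pipeline handles preprocessing and vectorizing internally,
# so raw strings can be passed straight to predict()
print(model.predict(["What a wonderful, heartfelt movie!"]))
print(model.predict(["A dull, lifeless waste of two hours."]))
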
functions_preprocess.py
ADDED
@@ -0,0 +1,108 @@
import string
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics import ConfusionMatrixDisplay
from keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def download_if_non_existent(res_path, res_name):
    try:
        nltk.data.find(res_path)
    except LookupError:
        print(f'resource {res_path} not found. Downloading now...')
        nltk.download(res_name)

# The class below needs these NLTK resources (stopwords at construction,
# wordnet and punkt during transform)
download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('corpora/wordnet', 'wordnet')
download_if_non_existent('tokenizers/punkt', 'punkt')

def fit_model(pipeline, x_train, y_train, x_test, y_test):
    pipeline.fit(x_train, y_train)
    return ConfusionMatrixDisplay.from_estimator(pipeline, x_test, y_test, normalize="true")

class LinguisticPreprocessor(TransformerMixin):
    def __init__(self):
        super().__init__()
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = Tokenizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = self._remove_html_tags(X)
        X = self._remove_all_punctuations(X)
        X = self._remove_double_spaces(X)
        X = self._lemmatize(X)
        X = self._remove_stopwords(X)
        return X

    def _remove_html_tags(self, X):
        X = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))
        return X

    def _remove_all_punctuations(self, X):
        X = list(
            map(
                lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text),
                X
            )
        )
        return X

    def _remove_double_spaces(self, X):
        X = list(map(lambda text: re.sub(" +", " ", text), X))
        return X

    def _remove_stopwords(self, X):
        X = list(map(
            lambda text: " ".join(
                [word for word in text.split() if word not in self.stop_words]
            ),
            X
        ))
        return X

    def _lemmatize(self, X):
        X = list(map(lambda text: self._lemmatize_one_sentence(text), X))
        return X

    def _lemmatize_one_sentence(self, sentence):
        sentence = nltk.word_tokenize(sentence)
        sentence = list(map(lambda word: self.lemmatizer.lemmatize(word), sentence))
        return " ".join(sentence)

def training_data(dataset_1, dataset_2, dataset_3):
    X_test = dataset_1['test']['text']
    y_test = dataset_1['test']['label']

    test_df = pd.DataFrame({
        'text': X_test,
        'label': y_test
    })

    combined_train_df = pd.DataFrame({
        'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
        'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
    })

    combined_train_df.drop_duplicates(subset=['text'], inplace=True)

    # Drop training rows whose text also appears in the test split, to avoid leakage
    merged_df = pd.merge(combined_train_df, test_df, on="text", how='left', indicator=True)
    result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    X_train = result_df['text'].tolist()
    y_train = result_df['label_x'].tolist()
    X_test = np.array(X_test)
    X_train = np.array(X_train)

    return X_train, y_train, X_test, y_test
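
To see what the transformer actually does, LinguisticPreprocessor can be run on its own. A small illustrative snippet (the sample documents are made up):

from functions_preprocess import LinguisticPreprocessor

processor = LinguisticPreprocessor()
docs = ["<p>This movie was GREAT!!</p>", "I     hated   every minute..."]
# Strips HTML and punctuation, collapses spaces, lemmatizes, and drops stopwords
print(processor.transform(docs))
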
main.py
ADDED
@@ -0,0 +1,44 @@
from functions_preprocess import LinguisticPreprocessor, fit_model, training_data
from datasets import load_dataset
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def main():
    ##### Load the datasets
    dataset_1 = load_dataset("rotten_tomatoes")
    dataset_2 = load_dataset('sst2')
    dataset_2 = dataset_2.rename_column('sentence', 'text')
    dataset_3 = load_dataset('imdb')

    X_train, y_train, X_test, y_test = training_data(dataset_1, dataset_2, dataset_3)

    pipeline = Pipeline(
        steps=[
            ("processor", LinguisticPreprocessor()),
            ("vectorizer", TfidfVectorizer(ngram_range=(1, 2))),
            ("model", SGDClassifier(loss="log_loss", n_jobs=-1, alpha=0.000001, penalty='elasticnet'))])

    ####### Fit the model and save the results
    fit_model(pipeline, X_train, y_train, X_test, y_test)
    predictions = pipeline.predict(X_test)

    # Create a DataFrame with index and predictions
    results_df = pd.DataFrame({
        "index": range(len(predictions)),
        "pred": predictions})

    # Save the DataFrame to a CSV file
    results_df.to_csv("results.csv", index=False)


if __name__ == "__main__":
    main()


# To persist the fitted pipeline for app.py, run inside main() (requires `import pickle`):
# model_pkl_file = "sentiment_model.pkl"
#
# with open(model_pkl_file, 'wb') as file:
#     pickle.dump(pipeline, file)
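
fit_model only renders a normalized confusion matrix; a hedged sketch of also reporting a headline accuracy, reusing the variables from main() (accuracy_score is the standard scikit-learn helper):

from sklearn.metrics import accuracy_score

# Inside main(), after fit_model(...):
predictions = pipeline.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test, predictions):.3f}")
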
sentiment_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6bf5286ceecdd7bfcee30a629414a0685d6871470d13cce06eacc9b952729551
size 74212716
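
The model file itself lives in Git LFS; this pointer records only its SHA-256 and size (about 74 MB), so cloning the Space without LFS fetches the pointer rather than the pickled pipeline.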