"""ytcomments.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1IAkt_1sG94cjURWKBvghkoK2KlZiYzX9
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the YouTube comments dataset (Colab path).
df = pd.read_csv("/content/comments.csv")

df.head()

# Keep only the comment text and sentiment label columns.
df = df.iloc[:, [2, 4]]

df.head(2)

df.info()

df.isnull().sum()

# Drop rows with missing values.
df.dropna(axis=0, how='any', inplace=True)

df.columns

# Labels: 0 = negative, 1 = neutral, 2 = positive.
df['Sentiment'] = df['Sentiment'].astype('int')
# Lowercase all comments before further cleaning.
df['Comment'] = df['Comment'].str.lower()

import string

string.punctuation

exclude = string.punctuation
def remove_punc(text):
    """Strip every punctuation character from the text."""
    for char in exclude:
        text = text.replace(char, '')
    return text

df['Comment'] = df['Comment'].apply(remove_punc)
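
# Equivalent, faster sketch using str.translate, which drops all punctuation
# in one pass instead of one replace() per character. remove_punc_fast is a
# hypothetical helper and is not applied above.
def remove_punc_fast(text):
    # str.maketrans('', '', chars) maps each punctuation character to deletion.
    return text.translate(str.maketrans('', '', string.punctuation))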

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

stopwords.words('english')
# Build the stopword set once; querying stopwords.words() per word is slow.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop English stopwords; keep the remaining words, space-joined."""
    return " ".join(word for word in text.split() if word not in stop_words)

df['Comment'] = df['Comment'].apply(remove_stopwords)

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(text):
    """Reduce each word to its Porter stem (e.g. 'loving' -> 'love')."""
    return " ".join(ps.stem(word) for word in text.split())

df['Comment'] = df['Comment'].apply(stem_words)
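
# Alternative sketch: WordNet lemmatization returns dictionary words instead of
# truncated stems. lemmatize_words is a hypothetical helper and is not applied
# to df above; it needs the extra 'wordnet' corpus download.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

def lemmatize_words(text):
    # lemmatize() defaults to noun POS; unmapped words are returned unchanged.
    return " ".join(lem.lemmatize(word) for word in text.split())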

# Class balance. sort_index() keeps slices in 0/1/2 order so the labels match;
# value_counts() alone orders by frequency, which could mislabel the slices.
plt.pie(df['Sentiment'].value_counts().sort_index(),
        labels=['negative', 'neutral', 'positive'], autopct="%0.2f")
plt.show()

nltk.download('punkt')

# Simple length features per comment. Punctuation was stripped earlier, so
# sent_tokenize will usually see a single sentence.
df['total_characters'] = df['Comment'].apply(len)

df.head(2)

df['total_words'] = df['Comment'].apply(lambda x: len(nltk.word_tokenize(x)))

df.head(2)

df['total_sentences'] = df['Comment'].apply(lambda x: len(nltk.sent_tokenize(x)))

df.head(2)

df[['total_characters', 'total_sentences', 'total_words']].describe()

# Per-class summary statistics.
mask0 = df['Sentiment'] == 0
mask1 = df['Sentiment'] == 1
mask2 = df['Sentiment'] == 2

df[mask0][['total_sentences', 'total_words', 'total_characters']].describe()

df[mask1][['total_sentences', 'total_words', 'total_characters']].describe()

df[mask2][['total_sentences', 'total_words', 'total_characters']].describe()

# Character-count distributions per class.
plt.figure(figsize=(12, 6))
sns.histplot(df[df['Sentiment'] == 0]['total_characters'], color='green')
sns.histplot(df[df['Sentiment'] == 1]['total_characters'], color='red')
sns.histplot(df[df['Sentiment'] == 2]['total_characters'], color='pink')

# Word-count distributions per class.
plt.figure(figsize=(10, 4))
sns.histplot(df[df['Sentiment'] == 0]['total_words'], color='green')
sns.histplot(df[df['Sentiment'] == 1]['total_words'], color='red')
sns.histplot(df[df['Sentiment'] == 2]['total_words'], color='pink')

sns.pairplot(df, hue='Sentiment')

# Correlations among the numeric feature columns; numeric_only avoids errors
# from the text column in newer pandas versions.
sns.heatmap(df.corr(numeric_only=True), annot=True)

from wordcloud import WordCloud

# Build one WordCloud per class: generate() mutates and returns the same
# object, so reusing a single instance would leave all three variables
# pointing at the last cloud generated.
def make_wordcloud(text):
    return WordCloud(width=500, height=500, min_font_size=10,
                     background_color='white').generate(text)

negative_wc = make_wordcloud(df[df['Sentiment'] == 0]['Comment'].str.cat(sep=" "))

neutral_wc = make_wordcloud(df[df['Sentiment'] == 1]['Comment'].str.cat(sep=" "))

positive_wc = make_wordcloud(df[df['Sentiment'] == 2]['Comment'].str.cat(sep=" "))

plt.figure(figsize=(6, 6))
plt.imshow(negative_wc)

plt.figure(figsize=(6, 6))
plt.imshow(neutral_wc)

plt.figure(figsize=(6, 6))
plt.imshow(positive_wc)

# Flatten each class's comments into a flat word list.
negative_corpus = []
for msg in df[df['Sentiment'] == 0]['Comment'].tolist():
    for word in msg.split():
        negative_corpus.append(word)

neutral_corpus = []
for msg in df[df['Sentiment'] == 1]['Comment'].tolist():
    for word in msg.split():
        neutral_corpus.append(word)

# Positive class is Sentiment == 2 (== 1 would duplicate the neutral corpus).
positive_corpus = []
for msg in df[df['Sentiment'] == 2]['Comment'].tolist():
    for word in msg.split():
        positive_corpus.append(word)

print(len(negative_corpus))
print(len(neutral_corpus))
print(len(positive_corpus))

from collections import Counter

# The 30 most frequent words in the negative corpus.
pd.DataFrame(Counter(negative_corpus).most_common(30))
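
# A small sketch extending the frequency table above: bar-plot the top 30
# negative-corpus words. top_negative is a hypothetical name introduced here.
top_negative = pd.DataFrame(Counter(negative_corpus).most_common(30),
                            columns=['word', 'count'])
plt.figure(figsize=(12, 5))
sns.barplot(data=top_negative, x='word', y='count')
plt.xticks(rotation=90)
plt.show()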
# Bag-of-words and TF-IDF representations, each capped at 3,000 features.
cv = CountVectorizer(lowercase=True, stop_words='english', max_features=3000)

tfidf = TfidfVectorizer(max_features=3000)

features_cv = cv.fit_transform(df['Comment']).toarray()

features_tfidf = tfidf.fit_transform(df['Comment']).toarray()
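
# Sketch: peek at a few learned vocabulary terms. get_feature_names_out()
# exists in scikit-learn >= 1.0; older releases use get_feature_names().
print(cv.get_feature_names_out()[:20])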

type(features_cv)

features_cv

# DataFrame views of the two feature matrices.
features_cv_df = pd.DataFrame(features_cv)
features_tfidf_df = pd.DataFrame(features_tfidf)

features_cv_df.shape

df.columns

x_cv = features_cv_df.iloc[:, :]
# Use a 1-D Series target; a one-column DataFrame triggers shape warnings
# in scikit-learn estimators.
y_cv = df['Sentiment']

y_cv.name

np.unique(y_cv)

# TF-IDF features pair with the same Sentiment target; selecting the last
# column positionally would pick total_sentences instead.
x_tfidf = features_tfidf_df.iloc[:, :]
y_tfidf = df['Sentiment']

print(x_cv.shape)
print(x_tfidf.shape)

print(y_cv.shape)
print(y_tfidf.shape)

# random_state pins the split so the scores below are reproducible.
x_cv_train, x_cv_test, y_cv_train, y_cv_test = train_test_split(
    x_cv, y_cv, test_size=0.2, random_state=2)

y_cv_test.shape

x_tfidf_train, x_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(
    x_tfidf, y_tfidf, test_size=0.2, random_state=2)

print(x_cv_train.shape)
print(y_cv_test.shape)
print(x_tfidf_train.shape)
print(y_tfidf_test.shape)

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score

svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}

def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit clf on the given split and return (accuracy, micro precision)."""
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # For single-label multiclass data, micro-averaged precision equals
    # accuracy; it is reported separately to mirror the table below.
    precision = precision_score(y_test, y_pred, average='micro')
    return accuracy, precision

np.unique(y_cv_test)

accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, x_cv_train, y_cv_train, x_cv_test, y_cv_test)

    print("For ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

performance_df = pd.DataFrame({'Algorithm': clfs.keys(),
                               'Accuracy': accuracy_scores,
                               'Precision': precision_scores}).sort_values('Precision', ascending=False)

performance_df
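
# Sketch: visualize the comparison table as a grouped bar chart.
# performance_melted is a hypothetical name introduced here.
performance_melted = performance_df.melt(id_vars='Algorithm',
                                         var_name='Metric', value_name='Score')
plt.figure(figsize=(12, 5))
sns.barplot(data=performance_melted, x='Algorithm', y='Score', hue='Metric')
plt.ylim(0, 1)
plt.show()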

from sklearn.ensemble import VotingClassifier

# probability=True is required for soft voting with SVC.
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)

mnb = MultinomialNB()

etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],
                          voting='soft')

voting.fit(x_cv_train, y_cv_train)

y_pred = voting.predict(x_cv_test)

print("Accuracy", accuracy_score(y_cv_test, y_pred))
print("Precision", precision_score(y_cv_test, y_pred, average='micro'))

import pickle

# Persist the fitted vectorizer and the logistic-regression model for reuse.
pickle.dump(cv, open('vectorizer.pkl', 'wb'))

pickle.dump(lrc, open('model.pkl', 'wb'))
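
# A minimal inference sketch: reload the pickled artifacts and score a new
# comment. The sample text and loaded_* names are illustrative; new text must
# go through the same cleaning steps used for training.
loaded_cv = pickle.load(open('vectorizer.pkl', 'rb'))
loaded_model = pickle.load(open('model.pkl', 'rb'))

sample = "this video was really helpful"
sample = stem_words(remove_stopwords(remove_punc(sample.lower())))

sample_features = loaded_cv.transform([sample]).toarray()
print(loaded_model.predict(sample_features))  # 0 = negative, 1 = neutral, 2 = positive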
# Compare the three naive Bayes variants on the bag-of-words features.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

gnb.fit(x_cv_train, y_cv_train)

pred_gnb = gnb.predict(x_cv_test)

print(accuracy_score(y_cv_test, pred_gnb))
print(precision_score(y_cv_test, pred_gnb, average='micro'))

mnb.fit(x_cv_train, y_cv_train)

pred_mnb = mnb.predict(x_cv_test)

print(accuracy_score(y_cv_test, pred_mnb))
print(precision_score(y_cv_test, pred_mnb, average='micro'))

bnb.fit(x_cv_train, y_cv_train)

pred_bnb = bnb.predict(x_cv_test)

print(accuracy_score(y_cv_test, pred_bnb))
print(precision_score(y_cv_test, pred_bnb, average='micro'))