## Importing Libraries

In [1]:
# Importing required Libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Downloading NLTK Packages

import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) make word_tokenize load the Punkt model
# from the separate 'punkt_tab' resource; without it tokenization raises
# LookupError on a fresh environment.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Creating set of stop words

stop_words = set(stopwords.words('english'))

## Importing and Pre-processing Training Dataset

In [4]:
# Importing Training Dataset

train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [5]:
# Getting Info about Train Data

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54000 entries, 0 to 53999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54000 non-null  int64 
 1   movie_name  54000 non-null  object
 2   synopsis    54000 non-null  object
 3   genre       54000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


In [6]:
# Checking for Null Values

train_data.isnull().sum()

id            0
movie_name    0
synopsis      0
genre         0
dtype: int64

In [7]:
# Getting Number of Classes and their Distribution in Train Data

train_data['genre'].value_counts()

fantasy      5400
horror       5400
family       5400
scifi        5400
action       5400
crime        5400
adventure    5400
mystery      5400
romance      5400
thriller     5400
Name: genre, dtype: int64

In [8]:
# Method to pre-process text from column: movie_name

def preprocessMovieName(movieNames):
  """
  Normalize movie titles before they are merged with the synopsis text.

  Each title is lowercased, stripped of leading/trailing whitespace and has every
  internal run of whitespace collapsed to a single space. Missing values
  (None or NaN) are mapped to an empty string instead of raising.

  movieNames: iterable of titles (e.g. the movie_name column).
  Returns: list of cleaned titles, same order and length as the input.
  """
  cleanedMovieNames = []

  for movie in movieNames:
      # NaN is the only float value that is not equal to itself
      isMissing = movie is None or (isinstance(movie, float) and movie != movie)
      if isMissing:
          cleanedMovieNames.append('')
      else:
          # split()/join trims and collapses all whitespace, tabs and newlines included
          cleanedMovieNames.append(' '.join(str(movie).lower().split()))

  return cleanedMovieNames

In [9]:
# Transforming movie_name column using preprocessMovieName method

movieNames = train_data['movie_name']
train_data['movie_name'] = preprocessMovieName(movieNames)
train_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,super me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,entity project,A director and her friends renting a haunted h...,horror
2,34131,behavioral family therapy for serious psychiat...,This is an educational video for families and ...,family
3,78522,blood glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [10]:
# Method to pre-process text from column: synopsis

def preprocessSynopsis(synopsis):
  """
  Clean every synopsis: lowercase it, replace each non-letter character with a space,
  tokenize the result and drop the tokens found in stop_words.

  Returns a list holding one space-joined string of the remaining tokens per input synopsis.
  """
  def _cleanOne(rawText):
      # Digits, punctuation and symbols all become spaces, leaving letters only
      lettersOnly = re.sub(r'[^a-zA-Z]', ' ', rawText.lower()).strip('  ')
      keptTokens = (tok for tok in word_tokenize(lettersOnly) if tok not in stop_words)
      return ' '.join(keptTokens)

  return [_cleanOne(entry) for entry in synopsis]

In [11]:
# Transforming synopsis column using preprocessSynopsis method

synopsis = train_data['synopsis']
train_data['synopsis'] = preprocessSynopsis(synopsis)
train_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,super me,young scriptwriter starts bringing valuable ob...,fantasy
1,50185,entity project,director friends renting haunted house capture...,horror
2,34131,behavioral family therapy for serious psychiat...,educational video families family therapists d...,family
3,78522,blood glacier,scientists working austrian alps discover glac...,scifi
4,2206,apat na anino,buy day four men widely apart life night shado...,action


In [12]:
# Method to combine text values from movie_name and synopsis columns

def mergeText(df):
  """
  Build the combined text movie_name+' '+synopsis for every row of df.

  The two columns are paired positionally, row by row, so the result is correct
  even when df.index holds repeated or non-default labels (a label lookup such as
  df['movie_name'][ind] returns a whole Series for a repeated label).

  df: DataFrame with 'movie_name' and 'synopsis' columns.
  Returns: list of str, one entry per row, in row order.
  """
  return [
      str(name) + ' ' + str(synop)
      for name, synop in zip(df['movie_name'], df['synopsis'])
  ]

In [13]:
# Applying mergeText method and storing values in new column: movie_synopsis

train_data['movie_synopsis'] = mergeText(train_data)
train_data.head()

Unnamed: 0,id,movie_name,synopsis,genre,movie_synopsis
0,44978,super me,young scriptwriter starts bringing valuable ob...,fantasy,super me young scriptwriter starts bringing va...
1,50185,entity project,director friends renting haunted house capture...,horror,entity project director friends renting haunte...
2,34131,behavioral family therapy for serious psychiat...,educational video families family therapists d...,family,behavioral family therapy for serious psychiat...
3,78522,blood glacier,scientists working austrian alps discover glac...,scifi,blood glacier scientists working austrian alps...
4,2206,apat na anino,buy day four men widely apart life night shado...,action,apat na anino buy day four men widely apart li...


## Label Encoding Target Classes

In [14]:
# Using Label Encoder to encode classes from genre

le_genre = LabelEncoder()
train_data['genre'] = le_genre.fit_transform(train_data['genre'])
train_data.head()

Unnamed: 0,id,movie_name,synopsis,genre,movie_synopsis
0,44978,super me,young scriptwriter starts bringing valuable ob...,4,super me young scriptwriter starts bringing va...
1,50185,entity project,director friends renting haunted house capture...,5,entity project director friends renting haunte...
2,34131,behavioral family therapy for serious psychiat...,educational video families family therapists d...,3,behavioral family therapy for serious psychiat...
3,78522,blood glacier,scientists working austrian alps discover glac...,8,blood glacier scientists working austrian alps...
4,2206,apat na anino,buy day four men widely apart life night shado...,0,apat na anino buy day four men widely apart li...


In [15]:
# Retrieving list of classes from Label Encoder

le_genre.classes_

array(['action', 'adventure', 'crime', 'family', 'fantasy', 'horror',
       'mystery', 'romance', 'scifi', 'thriller'], dtype=object)

## Vectorizing Textual Data

In [16]:
# Vectorizing textual data i.e. converting each text into a sparse vector of TF-IDF weights (floats) using TF-IDF Vectorizer

cv = TfidfVectorizer()
vectorized_synopsis = cv.fit_transform(train_data['movie_synopsis'])
vectorized_synopsis[0]

<1x60085 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [17]:
# Separating X: Features and Y: Target columns

X = vectorized_synopsis
Y = train_data['genre'].values

print("Features Shape: ",X.shape)
print("Target Shape: ",Y.shape)

Features Shape:  (54000, 60085)
Target Shape:  (54000,)


## Splitting data into Train and Validation Sets

In [18]:
# Splitting into Training and Validation Sets with 25% validation split

# random_state fixes the shuffle so the same split is produced on every run;
# stratify=Y keeps the genre proportions equal in the training and validation sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y
)
X_train[0]

<1x60085 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

## Model Building: Training, Prediction and Metric Evaluation

In [19]:
# Training model using Multinomial Naive Bayes, Getting predictions on Validation set, Calculating Metric: Accuracy

from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

mnb.fit(X_train, y_train)

y_pred = mnb.predict(X_test)

print("Val Acc using MultinomialNB: ", accuracy_score(y_test, y_pred))

Val Acc using MultinomialNB:  0.3622222222222222


In [20]:
# Training model using Decision Tree Classifier, Getting predictions on Validation set, Calculating Metric: Accuracy

from sklearn.tree import DecisionTreeClassifier

# Features are randomly permuted at each split, so the fitted tree can differ
# between runs unless random_state is fixed.
dt_clf = DecisionTreeClassifier(random_state=42)

dt_clf.fit(X_train, y_train)

y_pred = dt_clf.predict(X_test)

print("Val Acc using Decision Tree: ", accuracy_score(y_test, y_pred))

Val Acc using Decision Tree:  0.18748148148148147


In [21]:
# Training model using KNN (K-Nearest Neighbours Classifier), Getting predictions on Validation set, Calculating Metric: Accuracy

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("Val Acc using KNN: ", accuracy_score(y_test, y_pred))

Val Acc using KNN:  0.23837037037037037


As our principal metric is Accuracy, we select Multinomial Naive Bayes as our Final Model. <br>
Multinomial Naive Bayes outperforms all the other models considered, hence we use it for Test Data Prediction.

### Test Data Prediction

In [22]:
test_data = pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action
1,48456,Intermedio,A group of four teenage friends become trapped...,action
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action
3,84007,Paranoiac,A man long believed dead returns to the family...,action
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action


In [23]:
movieNames = test_data['movie_name']
test_data['movie_name'] = preprocessMovieName(movieNames)

synopsis = test_data['synopsis']
test_data['synopsis'] = preprocessSynopsis(synopsis)

test_data.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,16863,a death sentence,ida dad die without dkk operation ida plans st...,action
1,48456,intermedio,group four teenage friends become trapped mexi...,action
2,41383,30 chua phai tet,guy left home years till came back claim fathe...,action
3,84007,paranoiac,man long believed dead returns family estate c...,action
4,40269,ordinary happiness,deadly accident paolo comes back earth minutes...,action


In [24]:
test_data['movie_synopsis'] = mergeText(test_data)
# Non-mutating drop with errors='ignore' keeps this cell re-runnable: an in-place
# drop raises KeyError on a second run because 'genre' is already gone.
test_data = test_data.drop(columns=['genre'], errors='ignore')

test_data.head()

Unnamed: 0,id,movie_name,synopsis,movie_synopsis
0,16863,a death sentence,ida dad die without dkk operation ida plans st...,a death sentence ida dad die without dkk opera...
1,48456,intermedio,group four teenage friends become trapped mexi...,intermedio group four teenage friends become t...
2,41383,30 chua phai tet,guy left home years till came back claim fathe...,30 chua phai tet guy left home years till came...
3,84007,paranoiac,man long believed dead returns family estate c...,paranoiac man long believed dead returns famil...
4,40269,ordinary happiness,deadly accident paolo comes back earth minutes...,ordinary happiness deadly accident paolo comes...


In [25]:
# The test features get their own name so the training matrix held in
# `vectorized_synopsis` (the source of X) is not overwritten.
test_vectorized_synopsis = cv.transform(test_data['movie_synopsis'])

predictions = mnb.predict(test_vectorized_synopsis)

# Map the integer class ids back to the original genre names
genre_predictions = le_genre.inverse_transform(predictions)

In [26]:
# A single DataFrame constructor is enough; the dict already defines both columns.
submission = pd.DataFrame({'id': test_data['id'], 'genre': genre_predictions})
submission.head()

Unnamed: 0,id,genre
0,16863,crime
1,48456,horror
2,41383,scifi
3,84007,mystery
4,40269,fantasy


In [27]:
submission.to_csv('submission_ShalakaThorat.csv', index=False)