{
"cells": [
{
"cell_type": "markdown",
"source": [
"## Importing Libraries"
],
"metadata": {
"id": "jiNlo56ax2Us"
},
"id": "jiNlo56ax2Us"
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e0c77096",
"metadata": {
"id": "e0c77096"
},
"outputs": [],
"source": [
"# Importing required Libraries\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import TfidfVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b5fbe912",
"metadata": {
"scrolled": true,
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b5fbe912",
"outputId": "a8b49040-2f4b-429b-d174-aead00979ed5"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"# Downloading NLTK Packages\n",
"\n",
"import nltk\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fcb105bd",
"metadata": {
"id": "fcb105bd"
},
"outputs": [],
"source": [
"# Creating set of stop words\n",
"\n",
"stop_words = set(stopwords.words('english'))"
]
},
{
"cell_type": "markdown",
"source": [
"## Importing and Pre-processing Training Dataset"
],
"metadata": {
"id": "8mICKWhquvro"
},
"id": "8mICKWhquvro"
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f5965adb",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "f5965adb",
"outputId": "42ca0d77-46cb-4032-9071-37bad3d4ce61"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 44978 Super Me \n",
"1 50185 Entity Project \n",
"2 34131 Behavioral Family Therapy for Serious Psychiat... \n",
"3 78522 Blood Glacier \n",
"4 2206 Apat na anino \n",
"\n",
" synopsis genre \n",
"0 A young scriptwriter starts bringing valuable ... fantasy \n",
"1 A director and her friends renting a haunted h... horror \n",
"2 This is an educational video for families and ... family \n",
"3 Scientists working in the Austrian Alps discov... scifi \n",
"4 Buy Day - Four Men Widely - Apart in Life - By... action "
],
"text/html": [
"\n",
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 44978 \n",
" Super Me \n",
" A young scriptwriter starts bringing valuable ... \n",
" fantasy \n",
" \n",
" \n",
" 1 \n",
" 50185 \n",
" Entity Project \n",
" A director and her friends renting a haunted h... \n",
" horror \n",
" \n",
" \n",
" 2 \n",
" 34131 \n",
" Behavioral Family Therapy for Serious Psychiat... \n",
" This is an educational video for families and ... \n",
" family \n",
" \n",
" \n",
" 3 \n",
" 78522 \n",
" Blood Glacier \n",
" Scientists working in the Austrian Alps discov... \n",
" scifi \n",
" \n",
" \n",
" 4 \n",
" 2206 \n",
" Apat na anino \n",
" Buy Day - Four Men Widely - Apart in Life - By... \n",
" action \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 4
}
],
"source": [
"# Importing Tranining Dataset\n",
"\n",
"train_data = pd.read_csv(\"train.csv\")\n",
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "20d4b346",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "20d4b346",
"outputId": "7e787488-2023-4854-e1f3-e5e1d5e237c2"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"RangeIndex: 54000 entries, 0 to 53999\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 54000 non-null int64 \n",
" 1 movie_name 54000 non-null object\n",
" 2 synopsis 54000 non-null object\n",
" 3 genre 54000 non-null object\n",
"dtypes: int64(1), object(3)\n",
"memory usage: 1.6+ MB\n"
]
}
],
"source": [
"# Getting Info about Train Data\n",
"\n",
"train_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fc1a181f",
"metadata": {
"scrolled": false,
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fc1a181f",
"outputId": "594fa4a9-9456-489c-eea1-4e5b160f850f"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"id 0\n",
"movie_name 0\n",
"synopsis 0\n",
"genre 0\n",
"dtype: int64"
]
},
"metadata": {},
"execution_count": 6
}
],
"source": [
"# Checking for Null Values\n",
"\n",
"train_data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ccbc40da",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ccbc40da",
"outputId": "27766f2e-df95-49b1-c6bf-dbf281b3bf21"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"fantasy 5400\n",
"horror 5400\n",
"family 5400\n",
"scifi 5400\n",
"action 5400\n",
"crime 5400\n",
"adventure 5400\n",
"mystery 5400\n",
"romance 5400\n",
"thriller 5400\n",
"Name: genre, dtype: int64"
]
},
"metadata": {},
"execution_count": 7
}
],
"source": [
"# Getting Number of Classes and their Distribution in Train Data\n",
"\n",
"train_data['genre'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b630de43",
"metadata": {
"id": "b630de43"
},
"outputs": [],
"source": [
"# Method to pre-process text from column: movie_name\n",
"\n",
"def preprocessMovieName(movieNames):\n",
" \"\"\"\n",
" Converting text to lowercase and Removing extra spaces from movie_name column values\n",
" \"\"\"\n",
" cleanedMovieNames = []\n",
"\n",
" for movie in movieNames:\n",
" text = movie.lower()\n",
" text = text.strip(' ')\n",
" cleanedMovieNames.append(text)\n",
"\n",
" return cleanedMovieNames"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6bc44949",
"metadata": {
"scrolled": true,
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "6bc44949",
"outputId": "8765cf23-b184-4c66-fe6b-23709fcaa5c8"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 44978 super me \n",
"1 50185 entity project \n",
"2 34131 behavioral family therapy for serious psychiat... \n",
"3 78522 blood glacier \n",
"4 2206 apat na anino \n",
"\n",
" synopsis genre \n",
"0 A young scriptwriter starts bringing valuable ... fantasy \n",
"1 A director and her friends renting a haunted h... horror \n",
"2 This is an educational video for families and ... family \n",
"3 Scientists working in the Austrian Alps discov... scifi \n",
"4 Buy Day - Four Men Widely - Apart in Life - By... action "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 44978 \n",
" super me \n",
" A young scriptwriter starts bringing valuable ... \n",
" fantasy \n",
" \n",
" \n",
" 1 \n",
" 50185 \n",
" entity project \n",
" A director and her friends renting a haunted h... \n",
" horror \n",
" \n",
" \n",
" 2 \n",
" 34131 \n",
" behavioral family therapy for serious psychiat... \n",
" This is an educational video for families and ... \n",
" family \n",
" \n",
" \n",
" 3 \n",
" 78522 \n",
" blood glacier \n",
" Scientists working in the Austrian Alps discov... \n",
" scifi \n",
" \n",
" \n",
" 4 \n",
" 2206 \n",
" apat na anino \n",
" Buy Day - Four Men Widely - Apart in Life - By... \n",
" action \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"# Transforming movie_name column using preprocessMovieName method\n",
"\n",
"movieNames = train_data['movie_name']\n",
"train_data['movie_name'] = preprocessMovieName(movieNames)\n",
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "bebcffc9",
"metadata": {
"id": "bebcffc9"
},
"outputs": [],
"source": [
"# Method to pre-process text from column: synopsis\n",
"\n",
"def preprocessSynopsis(synopsis):\n",
" \"\"\"\n",
" Converting text to lowercase, Removing extra spaces, digits, symbols, stop words from synopsis column values\n",
" \"\"\"\n",
" cleanedSynopses = []\n",
"\n",
" for synop in synopsis:\n",
" text = re.sub(r'[^a-zA-Z]', ' ', synop.lower())\n",
" text = text.strip(' ')\n",
" word_tokens = word_tokenize(text)\n",
" cleanedText = [w for w in word_tokens if w not in stop_words]\n",
" cleanedSynop = ' '.join(cleanedText)\n",
" cleanedSynopses.append(cleanedSynop)\n",
"\n",
" return cleanedSynopses"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "5994c4ee",
"metadata": {
"scrolled": true,
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "5994c4ee",
"outputId": "84e4f903-491b-4313-c7b0-287b0748e2bb"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 44978 super me \n",
"1 50185 entity project \n",
"2 34131 behavioral family therapy for serious psychiat... \n",
"3 78522 blood glacier \n",
"4 2206 apat na anino \n",
"\n",
" synopsis genre \n",
"0 young scriptwriter starts bringing valuable ob... fantasy \n",
"1 director friends renting haunted house capture... horror \n",
"2 educational video families family therapists d... family \n",
"3 scientists working austrian alps discover glac... scifi \n",
"4 buy day four men widely apart life night shado... action "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 44978 \n",
" super me \n",
" young scriptwriter starts bringing valuable ob... \n",
" fantasy \n",
" \n",
" \n",
" 1 \n",
" 50185 \n",
" entity project \n",
" director friends renting haunted house capture... \n",
" horror \n",
" \n",
" \n",
" 2 \n",
" 34131 \n",
" behavioral family therapy for serious psychiat... \n",
" educational video families family therapists d... \n",
" family \n",
" \n",
" \n",
" 3 \n",
" 78522 \n",
" blood glacier \n",
" scientists working austrian alps discover glac... \n",
" scifi \n",
" \n",
" \n",
" 4 \n",
" 2206 \n",
" apat na anino \n",
" buy day four men widely apart life night shado... \n",
" action \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"# Transforming synopsis column using preprocessSynopsis method\n",
"\n",
"synopsis = train_data['synopsis']\n",
"train_data['synopsis'] = preprocessSynopsis(synopsis)\n",
"train_data.head()"
]
},
{
"cell_type": "code",
"source": [
"# Method to combine text values from movie_name and synopsis columns\n",
"\n",
"def mergeText(df):\n",
" \"\"\"\n",
" Combining text from movie_name and synopsis i.e. resulting values will be of the form: movie_name+' '+synopsis\n",
" \"\"\"\n",
" movieSynposis=[]\n",
"\n",
" for ind in df.index:\n",
" ms_text = str(df['movie_name'][ind]) + ' ' + str(df['synopsis'][ind])\n",
" movieSynposis.append(ms_text)\n",
"\n",
" return movieSynposis"
],
"metadata": {
"id": "SuSa8M9yvemY"
},
"id": "SuSa8M9yvemY",
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Applying mergeText method and storing values in new column: movie_synopsis\n",
"\n",
"train_data['movie_synopsis'] = mergeText(train_data)\n",
"train_data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "6Ag-7TllvLm4",
"outputId": "2d851bd2-c64a-4a4a-f97b-a98f24df8ac9"
},
"id": "6Ag-7TllvLm4",
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 44978 super me \n",
"1 50185 entity project \n",
"2 34131 behavioral family therapy for serious psychiat... \n",
"3 78522 blood glacier \n",
"4 2206 apat na anino \n",
"\n",
" synopsis genre \\\n",
"0 young scriptwriter starts bringing valuable ob... fantasy \n",
"1 director friends renting haunted house capture... horror \n",
"2 educational video families family therapists d... family \n",
"3 scientists working austrian alps discover glac... scifi \n",
"4 buy day four men widely apart life night shado... action \n",
"\n",
" movie_synopsis \n",
"0 super me young scriptwriter starts bringing va... \n",
"1 entity project director friends renting haunte... \n",
"2 behavioral family therapy for serious psychiat... \n",
"3 blood glacier scientists working austrian alps... \n",
"4 apat na anino buy day four men widely apart li... "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" movie_synopsis \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 44978 \n",
" super me \n",
" young scriptwriter starts bringing valuable ob... \n",
" fantasy \n",
" super me young scriptwriter starts bringing va... \n",
" \n",
" \n",
" 1 \n",
" 50185 \n",
" entity project \n",
" director friends renting haunted house capture... \n",
" horror \n",
" entity project director friends renting haunte... \n",
" \n",
" \n",
" 2 \n",
" 34131 \n",
" behavioral family therapy for serious psychiat... \n",
" educational video families family therapists d... \n",
" family \n",
" behavioral family therapy for serious psychiat... \n",
" \n",
" \n",
" 3 \n",
" 78522 \n",
" blood glacier \n",
" scientists working austrian alps discover glac... \n",
" scifi \n",
" blood glacier scientists working austrian alps... \n",
" \n",
" \n",
" 4 \n",
" 2206 \n",
" apat na anino \n",
" buy day four men widely apart life night shado... \n",
" action \n",
" apat na anino buy day four men widely apart li... \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "markdown",
"source": [
"## Label Encoding Target Classes"
],
"metadata": {
"id": "YpVgEjifxrCB"
},
"id": "YpVgEjifxrCB"
},
{
"cell_type": "code",
"source": [
"# Using Label Encoder to encode classes from genre\n",
"\n",
"le_genre = LabelEncoder()\n",
"train_data['genre'] = le_genre.fit_transform(train_data['genre'])\n",
"train_data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "iNAZim2poDBz",
"outputId": "65e6a4fb-8570-42c1-83cd-ac713e003657"
},
"id": "iNAZim2poDBz",
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 44978 super me \n",
"1 50185 entity project \n",
"2 34131 behavioral family therapy for serious psychiat... \n",
"3 78522 blood glacier \n",
"4 2206 apat na anino \n",
"\n",
" synopsis genre \\\n",
"0 young scriptwriter starts bringing valuable ob... 4 \n",
"1 director friends renting haunted house capture... 5 \n",
"2 educational video families family therapists d... 3 \n",
"3 scientists working austrian alps discover glac... 8 \n",
"4 buy day four men widely apart life night shado... 0 \n",
"\n",
" movie_synopsis \n",
"0 super me young scriptwriter starts bringing va... \n",
"1 entity project director friends renting haunte... \n",
"2 behavioral family therapy for serious psychiat... \n",
"3 blood glacier scientists working austrian alps... \n",
"4 apat na anino buy day four men widely apart li... "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" movie_synopsis \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 44978 \n",
" super me \n",
" young scriptwriter starts bringing valuable ob... \n",
" 4 \n",
" super me young scriptwriter starts bringing va... \n",
" \n",
" \n",
" 1 \n",
" 50185 \n",
" entity project \n",
" director friends renting haunted house capture... \n",
" 5 \n",
" entity project director friends renting haunte... \n",
" \n",
" \n",
" 2 \n",
" 34131 \n",
" behavioral family therapy for serious psychiat... \n",
" educational video families family therapists d... \n",
" 3 \n",
" behavioral family therapy for serious psychiat... \n",
" \n",
" \n",
" 3 \n",
" 78522 \n",
" blood glacier \n",
" scientists working austrian alps discover glac... \n",
" 8 \n",
" blood glacier scientists working austrian alps... \n",
" \n",
" \n",
" 4 \n",
" 2206 \n",
" apat na anino \n",
" buy day four men widely apart life night shado... \n",
" 0 \n",
" apat na anino buy day four men widely apart li... \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"source": [
"# Retrieving list of classes from Label Encoder\n",
"\n",
"le_genre.classes_"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jU8Xu7PmoFc1",
"outputId": "3e739ff8-6733-4985-c15e-bfb081c32802"
},
"id": "jU8Xu7PmoFc1",
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['action', 'adventure', 'crime', 'family', 'fantasy', 'horror',\n",
" 'mystery', 'romance', 'scifi', 'thriller'], dtype=object)"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"source": [
"## Vectorizing Textual Data"
],
"metadata": {
"id": "9ha441zmxtih"
},
"id": "9ha441zmxtih"
},
{
"cell_type": "code",
"source": [
"# Vectorizing textual data i.e. converting each text token into integers using TF-IDF Vectorizer\n",
"\n",
"cv = TfidfVectorizer()\n",
"vectorized_synopsis = cv.fit_transform(train_data['movie_synopsis'])\n",
"vectorized_synopsis[0]"
],
"metadata": {
"id": "pcC4sDFUbuDv",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d37267d4-441d-4bed-8aa3-1973e200b5d2"
},
"id": "pcC4sDFUbuDv",
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<1x60085 sparse matrix of type ''\n",
"\twith 16 stored elements in Compressed Sparse Row format>"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"source": [
"# Separating X: Features and Y: Target columns\n",
"\n",
"X = vectorized_synopsis\n",
"Y = train_data['genre'].values\n",
"\n",
"print(\"Features Shape: \",X.shape)\n",
"print(\"Target Shape: \",Y.shape)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5n0JM8UHmq5s",
"outputId": "77361117-be79-4e7a-928d-ed78c4cda003"
},
"id": "5n0JM8UHmq5s",
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Features Shape: (54000, 60085)\n",
"Target Shape: (54000,)\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Splitting data into Train and Validation Sets"
],
"metadata": {
"id": "gs1QN3aUx-Jj"
},
"id": "gs1QN3aUx-Jj"
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7d5005b3",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7d5005b3",
"outputId": "ede369e4-af55-467f-8525-20a1ed145615"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<1x60085 sparse matrix of type ''\n",
"\twith 25 stored elements in Compressed Sparse Row format>"
]
},
"metadata": {},
"execution_count": 18
}
],
"source": [
"# Splitting into Training and Validation Sets with 25% validation split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)\n",
"X_train[0]"
]
},
{
"cell_type": "markdown",
"source": [
"## Model Building: Training, Prediction and Metric Evaluation"
],
"metadata": {
"id": "XXWXGR_iyH-G"
},
"id": "XXWXGR_iyH-G"
},
{
"cell_type": "code",
"source": [
"# Training model using Multinomial Naive Bayes, Getting predictions on Validation set, Calculating Metric: Accuracy\n",
"\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"mnb = MultinomialNB()\n",
"\n",
"mnb.fit(X_train, y_train)\n",
"\n",
"y_pred = mnb.predict(X_test)\n",
"\n",
"print(\"Val Acc using MultinomialNB: \", accuracy_score(y_test, y_pred))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bJXTw8qxOEIV",
"outputId": "0ee8688b-60a0-42a8-d657-0327475f439c"
},
"id": "bJXTw8qxOEIV",
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Val Acc using MultinomialNB: 0.3622222222222222\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Training model using Decision Tree Classifier, Getting predictions on Validation set, Calculating Metric: Accuracy\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"dt_clf = DecisionTreeClassifier()\n",
"\n",
"dt_clf.fit(X_train, y_train)\n",
"\n",
"y_pred = dt_clf.predict(X_test)\n",
"\n",
"print(\"Val Acc using Decision Tree: \", accuracy_score(y_test, y_pred))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1ytk2S35RooF",
"outputId": "6b9f3f83-c832-4f10-edc5-a56ee6f62d7c"
},
"id": "1ytk2S35RooF",
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Val Acc using Decision Tree: 0.18748148148148147\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Training model using KNN (K-Nearest Neighbours Classifier), Getting predictions on Validation set, Calculating Metric: Accuracy\n",
"\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"knn = KNeighborsClassifier(n_neighbors=7)\n",
"\n",
"knn.fit(X_train, y_train)\n",
"\n",
"y_pred = knn.predict(X_test)\n",
"\n",
"print(\"Val Acc using KNN: \", accuracy_score(y_test, y_pred))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6MOxLHJVe9um",
"outputId": "c3b8154c-2923-4a34-d474-0d8796d3c957"
},
"id": "6MOxLHJVe9um",
"execution_count": 21,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Val Acc using KNN: 0.23837037037037037\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"As our principle metric to consider is Accuracy, we finalize Multinomial Naive Bayes as our Final Model. \n",
"Multinomial Naives Bayes outperforms among all the considered models, hence using it for Test Data Prediction."
],
"metadata": {
"id": "-5VNI6OVyVXW"
},
"id": "-5VNI6OVyVXW"
},
{
"cell_type": "markdown",
"source": [
"### Test Data Prediction"
],
"metadata": {
"id": "6hZGPKFgT5X_"
},
"id": "6hZGPKFgT5X_"
},
{
"cell_type": "code",
"source": [
"test_data = pd.read_csv(\"test.csv\")\n",
"test_data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "kKZySdPbT04P",
"outputId": "b525b13a-4f69-4eba-faaf-f1390788bfc8"
},
"id": "kKZySdPbT04P",
"execution_count": 22,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 16863 A Death Sentence \n",
"1 48456 Intermedio \n",
"2 41383 30 Chua Phai Tet \n",
"3 84007 Paranoiac \n",
"4 40269 Ordinary Happiness \n",
"\n",
" synopsis genre \n",
"0 12 y.o. Ida's dad'll die without a DKK1,500,00... action \n",
"1 A group of four teenage friends become trapped... action \n",
"2 A guy left his home for 12 years till he came ... action \n",
"3 A man long believed dead returns to the family... action \n",
"4 After a deadly accident, Paolo comes back on E... action "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 16863 \n",
" A Death Sentence \n",
" 12 y.o. Ida's dad'll die without a DKK1,500,00... \n",
" action \n",
" \n",
" \n",
" 1 \n",
" 48456 \n",
" Intermedio \n",
" A group of four teenage friends become trapped... \n",
" action \n",
" \n",
" \n",
" 2 \n",
" 41383 \n",
" 30 Chua Phai Tet \n",
" A guy left his home for 12 years till he came ... \n",
" action \n",
" \n",
" \n",
" 3 \n",
" 84007 \n",
" Paranoiac \n",
" A man long believed dead returns to the family... \n",
" action \n",
" \n",
" \n",
" 4 \n",
" 40269 \n",
" Ordinary Happiness \n",
" After a deadly accident, Paolo comes back on E... \n",
" action \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 22
}
]
},
{
"cell_type": "code",
"source": [
"movieNames = test_data['movie_name']\n",
"test_data['movie_name'] = preprocessMovieName(movieNames)\n",
"\n",
"synopsis = test_data['synopsis']\n",
"test_data['synopsis'] = preprocessSynopsis(synopsis)\n",
"\n",
"test_data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "0_bFugnGUsux",
"outputId": "3014bb16-d73b-4bc8-9216-e7b812966f50"
},
"id": "0_bFugnGUsux",
"execution_count": 23,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 16863 a death sentence \n",
"1 48456 intermedio \n",
"2 41383 30 chua phai tet \n",
"3 84007 paranoiac \n",
"4 40269 ordinary happiness \n",
"\n",
" synopsis genre \n",
"0 ida dad die without dkk operation ida plans st... action \n",
"1 group four teenage friends become trapped mexi... action \n",
"2 guy left home years till came back claim fathe... action \n",
"3 man long believed dead returns family estate c... action \n",
"4 deadly accident paolo comes back earth minutes... action "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" genre \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 16863 \n",
" a death sentence \n",
" ida dad die without dkk operation ida plans st... \n",
" action \n",
" \n",
" \n",
" 1 \n",
" 48456 \n",
" intermedio \n",
" group four teenage friends become trapped mexi... \n",
" action \n",
" \n",
" \n",
" 2 \n",
" 41383 \n",
" 30 chua phai tet \n",
" guy left home years till came back claim fathe... \n",
" action \n",
" \n",
" \n",
" 3 \n",
" 84007 \n",
" paranoiac \n",
" man long believed dead returns family estate c... \n",
" action \n",
" \n",
" \n",
" 4 \n",
" 40269 \n",
" ordinary happiness \n",
" deadly accident paolo comes back earth minutes... \n",
" action \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 23
}
]
},
{
"cell_type": "code",
"source": [
"test_data['movie_synopsis'] = mergeText(test_data)\n",
"test_data.drop(['genre'], axis=1, inplace=True)\n",
"\n",
"test_data.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "Jq_alTF3USTX",
"outputId": "30568322-7d54-4a0e-9934-289f892f0b4f"
},
"id": "Jq_alTF3USTX",
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id movie_name \\\n",
"0 16863 a death sentence \n",
"1 48456 intermedio \n",
"2 41383 30 chua phai tet \n",
"3 84007 paranoiac \n",
"4 40269 ordinary happiness \n",
"\n",
" synopsis \\\n",
"0 ida dad die without dkk operation ida plans st... \n",
"1 group four teenage friends become trapped mexi... \n",
"2 guy left home years till came back claim fathe... \n",
"3 man long believed dead returns family estate c... \n",
"4 deadly accident paolo comes back earth minutes... \n",
"\n",
" movie_synopsis \n",
"0 a death sentence ida dad die without dkk opera... \n",
"1 intermedio group four teenage friends become t... \n",
"2 30 chua phai tet guy left home years till came... \n",
"3 paranoiac man long believed dead returns famil... \n",
"4 ordinary happiness deadly accident paolo comes... "
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" movie_name \n",
" synopsis \n",
" movie_synopsis \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 16863 \n",
" a death sentence \n",
" ida dad die without dkk operation ida plans st... \n",
" a death sentence ida dad die without dkk opera... \n",
" \n",
" \n",
" 1 \n",
" 48456 \n",
" intermedio \n",
" group four teenage friends become trapped mexi... \n",
" intermedio group four teenage friends become t... \n",
" \n",
" \n",
" 2 \n",
" 41383 \n",
" 30 chua phai tet \n",
" guy left home years till came back claim fathe... \n",
" 30 chua phai tet guy left home years till came... \n",
" \n",
" \n",
" 3 \n",
" 84007 \n",
" paranoiac \n",
" man long believed dead returns family estate c... \n",
" paranoiac man long believed dead returns famil... \n",
" \n",
" \n",
" 4 \n",
" 40269 \n",
" ordinary happiness \n",
" deadly accident paolo comes back earth minutes... \n",
" ordinary happiness deadly accident paolo comes... \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 24
}
]
},
{
"cell_type": "code",
"source": [
"vectorized_synopsis = cv.transform(test_data['movie_synopsis'])\n",
"\n",
"predictions = mnb.predict(vectorized_synopsis)\n",
"\n",
"genre_predictions = le_genre.inverse_transform(predictions)"
],
"metadata": {
"id": "8yeCDG6kUifi"
},
"id": "8yeCDG6kUifi",
"execution_count": 25,
"outputs": []
},
{
"cell_type": "code",
"source": [
"submission = pd.DataFrame(pd.DataFrame({'id': test_data['id'], 'genre': genre_predictions}))\n",
"submission.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "Azmt1BZsVVgC",
"outputId": "a731ee69-6365-4537-e7ea-541a4962ab3e"
},
"id": "Azmt1BZsVVgC",
"execution_count": 26,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id genre\n",
"0 16863 crime\n",
"1 48456 horror\n",
"2 41383 scifi\n",
"3 84007 mystery\n",
"4 40269 fantasy"
],
"text/html": [
"\n",
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" genre \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 16863 \n",
" crime \n",
" \n",
" \n",
" 1 \n",
" 48456 \n",
" horror \n",
" \n",
" \n",
" 2 \n",
" 41383 \n",
" scifi \n",
" \n",
" \n",
" 3 \n",
" 84007 \n",
" mystery \n",
" \n",
" \n",
" 4 \n",
" 40269 \n",
" fantasy \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"
\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n"
]
},
"metadata": {},
"execution_count": 26
}
]
},
{
"cell_type": "code",
"source": [
"submission.to_csv('submission_ShalakaThorat.csv', index=False)"
],
"metadata": {
"id": "u1kZLHbBWO0d"
},
"id": "u1kZLHbBWO0d",
"execution_count": 27,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"colab": {
"provenance": [],
"gpuType": "T4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}