{ "cells": [ { "cell_type": "markdown", "source": [ "## Importing Libraries" ], "metadata": { "id": "jiNlo56ax2Us" }, "id": "jiNlo56ax2Us" }, { "cell_type": "code", "execution_count": 1, "id": "e0c77096", "metadata": { "id": "e0c77096" }, "outputs": [], "source": [ "# Importing required Libraries\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 2, "id": "b5fbe912", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/" }, "id": "b5fbe912", "outputId": "a8b49040-2f4b-429b-d174-aead00979ed5" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ "# Downloading NLTK Packages\n", "\n", "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "import re" ] }, { "cell_type": "code", "execution_count": 3, "id": "fcb105bd", "metadata": { "id": "fcb105bd" }, "outputs": [], "source": [ "# Creating set of stop words\n", "\n", "stop_words = set(stopwords.words('english'))" ] }, { "cell_type": "markdown", "source": [ "## Importing and Pre-processing Training Dataset" ], "metadata": { "id": "8mICKWhquvro" }, "id": "8mICKWhquvro" }, { "cell_type": "code", "execution_count": 4, "id": "f5965adb", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "f5965adb", "outputId": "42ca0d77-46cb-4032-9071-37bad3d4ce61" }, "outputs": 
[ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 44978 Super Me \n", "1 50185 Entity Project \n", "2 34131 Behavioral Family Therapy for Serious Psychiat... \n", "3 78522 Blood Glacier \n", "4 2206 Apat na anino \n", "\n", " synopsis genre \n", "0 A young scriptwriter starts bringing valuable ... fantasy \n", "1 A director and her friends renting a haunted h... horror \n", "2 This is an educational video for families and ... family \n", "3 Scientists working in the Austrian Alps discov... scifi \n", "4 Buy Day - Four Men Widely - Apart in Life - By... action " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenre
044978Super MeA young scriptwriter starts bringing valuable ...fantasy
150185Entity ProjectA director and her friends renting a haunted h...horror
234131Behavioral Family Therapy for Serious Psychiat...This is an educational video for families and ...family
378522Blood GlacierScientists working in the Austrian Alps discov...scifi
42206Apat na aninoBuy Day - Four Men Widely - Apart in Life - By...action
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 4 } ], "source": [ "# Importing Training Dataset\n", "\n", "train_data = pd.read_csv(\"train.csv\")\n", "train_data.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "20d4b346", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "20d4b346", "outputId": "7e787488-2023-4854-e1f3-e5e1d5e237c2" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 54000 entries, 0 to 53999\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 54000 non-null int64 \n", " 1 movie_name 54000 non-null object\n", " 2 synopsis 54000 non-null object\n", " 3 genre 54000 non-null object\n", "dtypes: int64(1), object(3)\n", "memory usage: 1.6+ MB\n" ] } ], "source": [ "# Getting Info about Train Data\n", "\n", "train_data.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "fc1a181f", "metadata": { "scrolled": false, "colab": { "base_uri": "https://localhost:8080/" }, "id": "fc1a181f", "outputId": "594fa4a9-9456-489c-eea1-4e5b160f850f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "id 0\n", "movie_name 0\n", "synopsis 0\n", "genre 0\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 6 } ], "source": [ "# Checking for Null Values\n", "\n", "train_data.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 7, "id": "ccbc40da", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ccbc40da", "outputId": "27766f2e-df95-49b1-c6bf-dbf281b3bf21" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "fantasy 5400\n", "horror 5400\n", "family 5400\n", "scifi 5400\n", "action 5400\n", "crime 5400\n", "adventure 5400\n", "mystery 5400\n", "romance 5400\n", "thriller 5400\n", "Name: genre, dtype: int64" ] }, "metadata": {}, "execution_count": 7 } ], "source": [ "# Getting Number of Classes and their Distribution in Train 
Data\n", "\n", "train_data['genre'].value_counts()" ] }, { "cell_type": "code", "execution_count": 8, "id": "b630de43", "metadata": { "id": "b630de43" }, "outputs": [], "source": [ "# Method to pre-process text from column: movie_name\n", "\n", "def preprocessMovieName(movieNames):\n", " \"\"\"\n", " Converting text to lowercase and Removing extra spaces from movie_name column values\n", " \"\"\"\n", " cleanedMovieNames = []\n", "\n", " for movie in movieNames:\n", " text = movie.lower()\n", " text = text.strip(' ')\n", " cleanedMovieNames.append(text)\n", "\n", " return cleanedMovieNames" ] }, { "cell_type": "code", "execution_count": 9, "id": "6bc44949", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "6bc44949", "outputId": "8765cf23-b184-4c66-fe6b-23709fcaa5c8" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 44978 super me \n", "1 50185 entity project \n", "2 34131 behavioral family therapy for serious psychiat... \n", "3 78522 blood glacier \n", "4 2206 apat na anino \n", "\n", " synopsis genre \n", "0 A young scriptwriter starts bringing valuable ... fantasy \n", "1 A director and her friends renting a haunted h... horror \n", "2 This is an educational video for families and ... family \n", "3 Scientists working in the Austrian Alps discov... scifi \n", "4 Buy Day - Four Men Widely - Apart in Life - By... action " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenre
044978super meA young scriptwriter starts bringing valuable ...fantasy
150185entity projectA director and her friends renting a haunted h...horror
234131behavioral family therapy for serious psychiat...This is an educational video for families and ...family
378522blood glacierScientists working in the Austrian Alps discov...scifi
42206apat na aninoBuy Day - Four Men Widely - Apart in Life - By...action
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 9 } ], "source": [ "# Transforming movie_name column using preprocessMovieName method\n", "\n", "movieNames = train_data['movie_name']\n", "train_data['movie_name'] = preprocessMovieName(movieNames)\n", "train_data.head()" ] }, { "cell_type": "code", "execution_count": 10, "id": "bebcffc9", "metadata": { "id": "bebcffc9" }, "outputs": [], "source": [ "# Method to pre-process text from column: synopsis\n", "\n", "def preprocessSynopsis(synopsis):\n", " \"\"\"\n", " Converting text to lowercase, Removing extra spaces, digits, symbols, stop words from synopsis column values\n", " \"\"\"\n", " cleanedSynopses = []\n", "\n", " for synop in synopsis:\n", " text = re.sub(r'[^a-zA-Z]', ' ', synop.lower())\n", " text = text.strip(' ')\n", " word_tokens = word_tokenize(text)\n", " cleanedText = [w for w in word_tokens if w not in stop_words]\n", " cleanedSynop = ' '.join(cleanedText)\n", " cleanedSynopses.append(cleanedSynop)\n", "\n", " return cleanedSynopses" ] }, { "cell_type": "code", "execution_count": 11, "id": "5994c4ee", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "5994c4ee", "outputId": "84e4f903-491b-4313-c7b0-287b0748e2bb" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 44978 super me \n", "1 50185 entity project \n", "2 34131 behavioral family therapy for serious psychiat... \n", "3 78522 blood glacier \n", "4 2206 apat na anino \n", "\n", " synopsis genre \n", "0 young scriptwriter starts bringing valuable ob... fantasy \n", "1 director friends renting haunted house capture... horror \n", "2 educational video families family therapists d... family \n", "3 scientists working austrian alps discover glac... scifi \n", "4 buy day four men widely apart life night shado... action " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenre
044978super meyoung scriptwriter starts bringing valuable ob...fantasy
150185entity projectdirector friends renting haunted house capture...horror
234131behavioral family therapy for serious psychiat...educational video families family therapists d...family
378522blood glacierscientists working austrian alps discover glac...scifi
42206apat na aninobuy day four men widely apart life night shado...action
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 11 } ], "source": [ "# Transforming synopsis column using preprocessSynopsis method\n", "\n", "synopsis = train_data['synopsis']\n", "train_data['synopsis'] = preprocessSynopsis(synopsis)\n", "train_data.head()" ] }, { "cell_type": "code", "source": [ "# Method to combine text values from movie_name and synopsis columns\n", "\n", "def mergeText(df):\n", " \"\"\"\n", " Combining text from movie_name and synopsis i.e. resulting values will be of the form: movie_name+' '+synopsis\n", " \"\"\"\n", " movieSynposis=[]\n", "\n", " for ind in df.index:\n", " ms_text = str(df['movie_name'][ind]) + ' ' + str(df['synopsis'][ind])\n", " movieSynposis.append(ms_text)\n", "\n", " return movieSynposis" ], "metadata": { "id": "SuSa8M9yvemY" }, "id": "SuSa8M9yvemY", "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "# Applying mergeText method and storing values in new column: movie_synopsis\n", "\n", "train_data['movie_synopsis'] = mergeText(train_data)\n", "train_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "6Ag-7TllvLm4", "outputId": "2d851bd2-c64a-4a4a-f97b-a98f24df8ac9" }, "id": "6Ag-7TllvLm4", "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 44978 super me \n", "1 50185 entity project \n", "2 34131 behavioral family therapy for serious psychiat... \n", "3 78522 blood glacier \n", "4 2206 apat na anino \n", "\n", " synopsis genre \\\n", "0 young scriptwriter starts bringing valuable ob... fantasy \n", "1 director friends renting haunted house capture... horror \n", "2 educational video families family therapists d... family \n", "3 scientists working austrian alps discover glac... scifi \n", "4 buy day four men widely apart life night shado... action \n", "\n", " movie_synopsis \n", "0 super me young scriptwriter starts bringing va... 
\n", "1 entity project director friends renting haunte... \n", "2 behavioral family therapy for serious psychiat... \n", "3 blood glacier scientists working austrian alps... \n", "4 apat na anino buy day four men widely apart li... " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenremovie_synopsis
044978super meyoung scriptwriter starts bringing valuable ob...fantasysuper me young scriptwriter starts bringing va...
150185entity projectdirector friends renting haunted house capture...horrorentity project director friends renting haunte...
234131behavioral family therapy for serious psychiat...educational video families family therapists d...familybehavioral family therapy for serious psychiat...
378522blood glacierscientists working austrian alps discover glac...scifiblood glacier scientists working austrian alps...
42206apat na aninobuy day four men widely apart life night shado...actionapat na anino buy day four men widely apart li...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "markdown", "source": [ "## Label Encoding Target Classes" ], "metadata": { "id": "YpVgEjifxrCB" }, "id": "YpVgEjifxrCB" }, { "cell_type": "code", "source": [ "# Using Label Encoder to encode classes from genre\n", "\n", "le_genre = LabelEncoder()\n", "train_data['genre'] = le_genre.fit_transform(train_data['genre'])\n", "train_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "iNAZim2poDBz", "outputId": "65e6a4fb-8570-42c1-83cd-ac713e003657" }, "id": "iNAZim2poDBz", "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 44978 super me \n", "1 50185 entity project \n", "2 34131 behavioral family therapy for serious psychiat... \n", "3 78522 blood glacier \n", "4 2206 apat na anino \n", "\n", " synopsis genre \\\n", "0 young scriptwriter starts bringing valuable ob... 4 \n", "1 director friends renting haunted house capture... 5 \n", "2 educational video families family therapists d... 3 \n", "3 scientists working austrian alps discover glac... 8 \n", "4 buy day four men widely apart life night shado... 0 \n", "\n", " movie_synopsis \n", "0 super me young scriptwriter starts bringing va... \n", "1 entity project director friends renting haunte... \n", "2 behavioral family therapy for serious psychiat... \n", "3 blood glacier scientists working austrian alps... \n", "4 apat na anino buy day four men widely apart li... " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenremovie_synopsis
044978super meyoung scriptwriter starts bringing valuable ob...4super me young scriptwriter starts bringing va...
150185entity projectdirector friends renting haunted house capture...5entity project director friends renting haunte...
234131behavioral family therapy for serious psychiat...educational video families family therapists d...3behavioral family therapy for serious psychiat...
378522blood glacierscientists working austrian alps discover glac...8blood glacier scientists working austrian alps...
42206apat na aninobuy day four men widely apart life night shado...0apat na anino buy day four men widely apart li...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "# Retrieving list of classes from Label Encoder\n", "\n", "le_genre.classes_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jU8Xu7PmoFc1", "outputId": "3e739ff8-6733-4985-c15e-bfb081c32802" }, "id": "jU8Xu7PmoFc1", "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['action', 'adventure', 'crime', 'family', 'fantasy', 'horror',\n", " 'mystery', 'romance', 'scifi', 'thriller'], dtype=object)" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "markdown", "source": [ "## Vectorizing Textual Data" ], "metadata": { "id": "9ha441zmxtih" }, "id": "9ha441zmxtih" }, { "cell_type": "code", "source": [ "# Vectorizing textual data i.e. converting each text into a sparse vector of TF-IDF weights using TF-IDF Vectorizer\n", "\n", "cv = TfidfVectorizer()\n", "vectorized_synopsis = cv.fit_transform(train_data['movie_synopsis'])\n", "vectorized_synopsis[0]" ], "metadata": { "id": "pcC4sDFUbuDv", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "d37267d4-441d-4bed-8aa3-1973e200b5d2" }, "id": "pcC4sDFUbuDv", "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<1x60085 sparse matrix of type ''\n", "\twith 16 stored elements in Compressed Sparse Row format>" ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "# Separating X: Features and Y: Target columns\n", "\n", "X = vectorized_synopsis\n", "Y = train_data['genre'].values\n", "\n", "print(\"Features Shape: \",X.shape)\n", "print(\"Target Shape: \",Y.shape)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5n0JM8UHmq5s", "outputId": "77361117-be79-4e7a-928d-ed78c4cda003" }, "id": "5n0JM8UHmq5s", "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Features Shape: (54000, 60085)\n", "Target Shape: (54000,)\n" ] 
} ] }, { "cell_type": "markdown", "source": [ "## Splitting data into Train and Validation Sets" ], "metadata": { "id": "gs1QN3aUx-Jj" }, "id": "gs1QN3aUx-Jj" }, { "cell_type": "code", "execution_count": 18, "id": "7d5005b3", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7d5005b3", "outputId": "ede369e4-af55-467f-8525-20a1ed145615" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<1x60085 sparse matrix of type ''\n", "\twith 25 stored elements in Compressed Sparse Row format>" ] }, "metadata": {}, "execution_count": 18 } ], "source": [ "# Splitting into Training and Validation Sets with 25% validation split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)\n", "X_train[0]" ] }, { "cell_type": "markdown", "source": [ "## Model Building: Training, Prediction and Metric Evaluation" ], "metadata": { "id": "XXWXGR_iyH-G" }, "id": "XXWXGR_iyH-G" }, { "cell_type": "code", "source": [ "# Training model using Multinomial Naive Bayes, Getting predictions on Validation set, Calculating Metric: Accuracy\n", "\n", "from sklearn.naive_bayes import MultinomialNB\n", "\n", "mnb = MultinomialNB()\n", "\n", "mnb.fit(X_train, y_train)\n", "\n", "y_pred = mnb.predict(X_test)\n", "\n", "print(\"Val Acc using MultinomialNB: \", accuracy_score(y_test, y_pred))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bJXTw8qxOEIV", "outputId": "0ee8688b-60a0-42a8-d657-0327475f439c" }, "id": "bJXTw8qxOEIV", "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Val Acc using MultinomialNB: 0.3622222222222222\n" ] } ] }, { "cell_type": "code", "source": [ "# Training model using Decision Tree Classifier, Getting predictions on Validation set, Calculating Metric: Accuracy\n", "\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", "dt_clf = DecisionTreeClassifier()\n", "\n", "dt_clf.fit(X_train, y_train)\n", "\n", "y_pred = 
dt_clf.predict(X_test)\n", "\n", "print(\"Val Acc using Decision Tree: \", accuracy_score(y_test, y_pred))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1ytk2S35RooF", "outputId": "6b9f3f83-c832-4f10-edc5-a56ee6f62d7c" }, "id": "1ytk2S35RooF", "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Val Acc using Decision Tree: 0.18748148148148147\n" ] } ] }, { "cell_type": "code", "source": [ "# Training model using KNN (K-Nearest Neighbours Classifier), Getting predictions on Validation set, Calculating Metric: Accuracy\n", "\n", "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "knn = KNeighborsClassifier(n_neighbors=7)\n", "\n", "knn.fit(X_train, y_train)\n", "\n", "y_pred = knn.predict(X_test)\n", "\n", "print(\"Val Acc using KNN: \", accuracy_score(y_test, y_pred))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6MOxLHJVe9um", "outputId": "c3b8154c-2923-4a34-d474-0d8796d3c957" }, "id": "6MOxLHJVe9um", "execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Val Acc using KNN: 0.23837037037037037\n" ] } ] }, { "cell_type": "markdown", "source": [ "As Accuracy is our principal metric, we finalize Multinomial Naive Bayes as our Final Model.
\n", "Multinomial Naive Bayes outperforms all the other models considered, hence we use it for Test Data Prediction." ], "metadata": { "id": "-5VNI6OVyVXW" }, "id": "-5VNI6OVyVXW" }, { "cell_type": "markdown", "source": [ "### Test Data Prediction" ], "metadata": { "id": "6hZGPKFgT5X_" }, "id": "6hZGPKFgT5X_" }, { "cell_type": "code", "source": [ "test_data = pd.read_csv(\"test.csv\")\n", "test_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "kKZySdPbT04P", "outputId": "b525b13a-4f69-4eba-faaf-f1390788bfc8" }, "id": "kKZySdPbT04P", "execution_count": 22, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 16863 A Death Sentence \n", "1 48456 Intermedio \n", "2 41383 30 Chua Phai Tet \n", "3 84007 Paranoiac \n", "4 40269 Ordinary Happiness \n", "\n", " synopsis genre \n", "0 12 y.o. Ida's dad'll die without a DKK1,500,00... action \n", "1 A group of four teenage friends become trapped... action \n", "2 A guy left his home for 12 years till he came ... action \n", "3 A man long believed dead returns to the family... action \n", "4 After a deadly accident, Paolo comes back on E... action " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenre
016863A Death Sentence12 y.o. Ida's dad'll die without a DKK1,500,00...action
148456IntermedioA group of four teenage friends become trapped...action
24138330 Chua Phai TetA guy left his home for 12 years till he came ...action
384007ParanoiacA man long believed dead returns to the family...action
440269Ordinary HappinessAfter a deadly accident, Paolo comes back on E...action
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 22 } ] }, { "cell_type": "code", "source": [ "movieNames = test_data['movie_name']\n", "test_data['movie_name'] = preprocessMovieName(movieNames)\n", "\n", "synopsis = test_data['synopsis']\n", "test_data['synopsis'] = preprocessSynopsis(synopsis)\n", "\n", "test_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "0_bFugnGUsux", "outputId": "3014bb16-d73b-4bc8-9216-e7b812966f50" }, "id": "0_bFugnGUsux", "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 16863 a death sentence \n", "1 48456 intermedio \n", "2 41383 30 chua phai tet \n", "3 84007 paranoiac \n", "4 40269 ordinary happiness \n", "\n", " synopsis genre \n", "0 ida dad die without dkk operation ida plans st... action \n", "1 group four teenage friends become trapped mexi... action \n", "2 guy left home years till came back claim fathe... action \n", "3 man long believed dead returns family estate c... action \n", "4 deadly accident paolo comes back earth minutes... action " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsisgenre
016863a death sentenceida dad die without dkk operation ida plans st...action
148456intermediogroup four teenage friends become trapped mexi...action
24138330 chua phai tetguy left home years till came back claim fathe...action
384007paranoiacman long believed dead returns family estate c...action
440269ordinary happinessdeadly accident paolo comes back earth minutes...action
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 23 } ] }, { "cell_type": "code", "source": [ "test_data['movie_synopsis'] = mergeText(test_data)\n", "test_data.drop(['genre'], axis=1, inplace=True)\n", "\n", "test_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "Jq_alTF3USTX", "outputId": "30568322-7d54-4a0e-9934-289f892f0b4f" }, "id": "Jq_alTF3USTX", "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id movie_name \\\n", "0 16863 a death sentence \n", "1 48456 intermedio \n", "2 41383 30 chua phai tet \n", "3 84007 paranoiac \n", "4 40269 ordinary happiness \n", "\n", " synopsis \\\n", "0 ida dad die without dkk operation ida plans st... \n", "1 group four teenage friends become trapped mexi... \n", "2 guy left home years till came back claim fathe... \n", "3 man long believed dead returns family estate c... \n", "4 deadly accident paolo comes back earth minutes... \n", "\n", " movie_synopsis \n", "0 a death sentence ida dad die without dkk opera... \n", "1 intermedio group four teenage friends become t... \n", "2 30 chua phai tet guy left home years till came... \n", "3 paranoiac man long believed dead returns famil... \n", "4 ordinary happiness deadly accident paolo comes... " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmovie_namesynopsismovie_synopsis
016863a death sentenceida dad die without dkk operation ida plans st...a death sentence ida dad die without dkk opera...
148456intermediogroup four teenage friends become trapped mexi...intermedio group four teenage friends become t...
24138330 chua phai tetguy left home years till came back claim fathe...30 chua phai tet guy left home years till came...
384007paranoiacman long believed dead returns family estate c...paranoiac man long believed dead returns famil...
440269ordinary happinessdeadly accident paolo comes back earth minutes...ordinary happiness deadly accident paolo comes...
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "vectorized_synopsis = cv.transform(test_data['movie_synopsis'])\n", "\n", "predictions = mnb.predict(vectorized_synopsis)\n", "\n", "genre_predictions = le_genre.inverse_transform(predictions)" ], "metadata": { "id": "8yeCDG6kUifi" }, "id": "8yeCDG6kUifi", "execution_count": 25, "outputs": [] }, { "cell_type": "code", "source": [ "submission = pd.DataFrame(pd.DataFrame({'id': test_data['id'], 'genre': genre_predictions}))\n", "submission.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "Azmt1BZsVVgC", "outputId": "a731ee69-6365-4537-e7ea-541a4962ab3e" }, "id": "Azmt1BZsVVgC", "execution_count": 26, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id genre\n", "0 16863 crime\n", "1 48456 horror\n", "2 41383 scifi\n", "3 84007 mystery\n", "4 40269 fantasy" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenre
016863crime
148456horror
241383scifi
384007mystery
440269fantasy
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 26 } ] }, { "cell_type": "code", "source": [ "submission.to_csv('submission_ShalakaThorat.csv', index=False)" ], "metadata": { "id": "u1kZLHbBWO0d" }, "id": "u1kZLHbBWO0d", "execution_count": 27, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "colab": { "provenance": [], "gpuType": "T4" } }, "nbformat": 4, "nbformat_minor": 5 }