{ "cells": [ { "cell_type": "markdown", "id": "56cccab6", "metadata": {}, "source": [ "# Emotions Detection in Text" ] }, { "cell_type": "code", "execution_count": 1, "id": "f0814628-3d83-4fd6-a511-2eccf79f9f1e", "metadata": {}, "outputs": [], "source": [ "# Data handling / EDA\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# Load Data Viz Pkgs\n", "import seaborn as sns\n", "\n", "# Load Text Cleaning Pkgs\n", "import neattext.functions as nfx\n", "\n", "# Load ML Pkgs\n", "# Estimators\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.naive_bayes import MultinomialNB\n", "\n", "# Feature extraction, train/test splitting and evaluation metrics\n", "# NOTE(review): np, MultinomialNB, accuracy_score, classification_report and\n", "# confusion_matrix are not referenced by any later cell - confirm they are still needed\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score,classification_report,confusion_matrix" ] }, { "cell_type": "code", "execution_count": 2, "id": "b209e004-ab77-4407-8689-b4318944d47f", "metadata": {}, "outputs": [], "source": [ "# Load the raw emotion dataset (relative path, resolved against the kernel's working directory)\n", "df = pd.read_csv(\"../data/emotion_dataset_raw.csv\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "fea2d4c0-3bdd-405e-ab69-507ceaac36cb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EmotionText
0neutralWhy ?
1joySage Act upgrade on my to do list for tommorow.
2sadnessON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...
3joySuch an eye ! The true hazel eye-and so brill...
4joy@Iluvmiasantos ugh babe.. hugggzzz for u .! b...
\n", "
" ], "text/plain": [ " Emotion Text\n", "0 neutral Why ? \n", "1 joy Sage Act upgrade on my to do list for tommorow.\n", "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...\n", "3 joy Such an eye ! The true hazel eye-and so brill...\n", "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b..." ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first rows (columns: Emotion, Text)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "430565a3-cf3b-4c6f-afa5-bafd084f5676", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "joy 11045\n", "sadness 6722\n", "fear 5410\n", "anger 4297\n", "surprise 4062\n", "neutral 2254\n", "disgust 856\n", "shame 146\n", "Name: Emotion, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Class distribution: number of rows per emotion label\n", "df['Emotion'].value_counts()" ] }, { "cell_type": "code", "execution_count": 5, "id": "531d3449-a959-4a19-bff0-3ffed551e619", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Plot the class distribution; the trailing ';' keeps the Axes repr out of the cell output\n", "sns.countplot(x='Emotion',data=df);" ] }, { "cell_type": "code", "execution_count": 6, "id": "40f991d0-952f-40c1-bf00-f3476ce0436d", "metadata": { "jupyter": { "outputs_hidden": true }, "scrolled": false, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['BTC_ADDRESS_REGEX',\n", " 'CURRENCY_REGEX',\n", " 'CURRENCY_SYMB_REGEX',\n", " 'Counter',\n", " 'DATE_REGEX',\n", " 'EMAIL_REGEX',\n", " 'EMOJI_REGEX',\n", " 'HASTAG_REGEX',\n", " 'MASTERCard_REGEX',\n", " 'MD5_SHA_REGEX',\n", " 'MOST_COMMON_PUNCT_REGEX',\n", " 'NUMBERS_REGEX',\n", " 'PHONE_REGEX',\n", " 'PoBOX_REGEX',\n", " 'SPECIAL_CHARACTERS_REGEX',\n", " 'STOPWORDS',\n", " 'STOPWORDS_de',\n", " 'STOPWORDS_en',\n", " 'STOPWORDS_es',\n", " 'STOPWORDS_fr',\n", " 'STOPWORDS_ru',\n", " 'STOPWORDS_yo',\n", " 'STREET_ADDRESS_REGEX',\n", " 'TextFrame',\n", " 'URL_PATTERN',\n", " 'USER_HANDLES_REGEX',\n", " 'VISACard_REGEX',\n", " '__builtins__',\n", " '__cached__',\n", " '__doc__',\n", " '__file__',\n", " '__generate_text',\n", " '__loader__',\n", " '__name__',\n", " '__numbers_dict',\n", " '__package__',\n", " '__spec__',\n", " '_lex_richness_herdan',\n", " '_lex_richness_maas_ttr',\n", " 'clean_text',\n", " 'defaultdict',\n", " 'digit2words',\n", " 'extract_btc_address',\n", " 'extract_currencies',\n", " 'extract_currency_symbols',\n", " 'extract_dates',\n", " 'extract_emails',\n", " 'extract_emojis',\n", " 'extract_hashtags',\n", " 'extract_html_tags',\n", " 'extract_mastercard_addr',\n", " 'extract_md5sha',\n", " 'extract_numbers',\n", " 'extract_pattern',\n", " 'extract_phone_numbers',\n", " 'extract_postoffice_box',\n", " 'extract_shortwords',\n", " 'extract_special_characters',\n", " 'extract_stopwords',\n", " 'extract_street_address',\n", " 'extract_terms_in_bracket',\n", " 'extract_urls',\n", " 'extract_userhandles',\n", " 'extract_visacard_addr',\n", " 'fix_contractions',\n", " 'generate_sentence',\n", " 
'hamming_distance',\n", " 'inverse_df',\n", " 'lexical_richness',\n", " 'markov_chain',\n", " 'math',\n", " 'nlargest',\n", " 'normalize',\n", " 'num2words',\n", " 'random',\n", " 're',\n", " 'read_txt',\n", " 'remove_accents',\n", " 'remove_bad_quotes',\n", " 'remove_btc_address',\n", " 'remove_currencies',\n", " 'remove_currency_symbols',\n", " 'remove_custom_pattern',\n", " 'remove_custom_words',\n", " 'remove_dates',\n", " 'remove_emails',\n", " 'remove_emojis',\n", " 'remove_hashtags',\n", " 'remove_html_tags',\n", " 'remove_mastercard_addr',\n", " 'remove_md5sha',\n", " 'remove_multiple_spaces',\n", " 'remove_non_ascii',\n", " 'remove_numbers',\n", " 'remove_phone_numbers',\n", " 'remove_postoffice_box',\n", " 'remove_puncts',\n", " 'remove_punctuations',\n", " 'remove_shortwords',\n", " 'remove_special_characters',\n", " 'remove_stopwords',\n", " 'remove_street_address',\n", " 'remove_terms_in_bracket',\n", " 'remove_urls',\n", " 'remove_userhandles',\n", " 'remove_visacard_addr',\n", " 'replace_bad_quotes',\n", " 'replace_currencies',\n", " 'replace_currency_symbols',\n", " 'replace_dates',\n", " 'replace_emails',\n", " 'replace_emojis',\n", " 'replace_numbers',\n", " 'replace_phone_numbers',\n", " 'replace_special_characters',\n", " 'replace_term',\n", " 'replace_urls',\n", " 'string',\n", " 'term_freq',\n", " 'to_txt',\n", " 'unicodedata',\n", " 'word_freq',\n", " 'word_length_freq']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Data Cleaning: list the names neattext.functions exposes (exploration only)\n", "dir(nfx)" ] }, { "cell_type": "code", "execution_count": 7, "id": "b1f87847-a91c-4bd6-a307-d746eb5aa9a0", "metadata": {}, "outputs": [], "source": [ "# Strip @user handles from the raw Text into a new Clean_Text column\n", "df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)" ] }, { "cell_type": "code", "execution_count": 8, "id": "03886bc3-1ac4-4f1b-842b-e5d2d770ff81", "metadata": {}, "outputs": [], "source": [ "# Drop stopwords from the handle-free text (overwrites Clean_Text)\n", "df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)" ] }, {
"cell_type": "markdown", "id": "0ffcf4c7", "metadata": {}, "source": [ "## Special characters are deliberately not removed: some rows contain only special characters, so stripping them would leave those rows empty." ] }, { "cell_type": "code", "execution_count": 9, "id": "0a0fcc0c-4adf-4f0b-b226-164659ad70ba", "metadata": { "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EmotionTextClean_Text
0neutralWhy ??
1joySage Act upgrade on my to do list for tommorow.Sage Act upgrade list tommorow.
2sadnessON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS...
3joySuch an eye ! The true hazel eye-and so brill...eye ! true hazel eye-and brilliant ! Regular f...
4joy@Iluvmiasantos ugh babe.. hugggzzz for u .! b...ugh babe.. hugggzzz u .! babe naamazed nga ako...
............
34787surprise@MichelGW have you gift! Hope you like it! It'...gift! Hope like it! hand wear ! It'll warm! Lol
34788joyThe world didnt give it to me..so the world MO...world didnt me..so world DEFINITELY cnt away!!!
34789angerA man robbed me today .man robbed today .
34790fearYouu call it JEALOUSY, I call it of #Losing YO...Youu JEALOUSY, #Losing YOU...
34791sadnessI think about you baby, and I dream about you ...think baby, dream time
\n", "

34792 rows × 3 columns

\n", "
" ], "text/plain": [ " Emotion Text \\\n", "0 neutral Why ? \n", "1 joy Sage Act upgrade on my to do list for tommorow. \n", "2 sadness ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ... \n", "3 joy Such an eye ! The true hazel eye-and so brill... \n", "4 joy @Iluvmiasantos ugh babe.. hugggzzz for u .! b... \n", "... ... ... \n", "34787 surprise @MichelGW have you gift! Hope you like it! It'... \n", "34788 joy The world didnt give it to me..so the world MO... \n", "34789 anger A man robbed me today . \n", "34790 fear Youu call it JEALOUSY, I call it of #Losing YO... \n", "34791 sadness I think about you baby, and I dream about you ... \n", "\n", " Clean_Text \n", "0 ? \n", "1 Sage Act upgrade list tommorow. \n", "2 WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS... \n", "3 eye ! true hazel eye-and brilliant ! Regular f... \n", "4 ugh babe.. hugggzzz u .! babe naamazed nga ako... \n", "... ... \n", "34787 gift! Hope like it! hand wear ! It'll warm! Lol \n", "34788 world didnt me..so world DEFINITELY cnt away!!! \n", "34789 man robbed today . \n", "34790 Youu JEALOUSY, #Losing YOU... \n", "34791 think baby, dream time \n", "\n", "[34792 rows x 3 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Inspect the frame after cleaning (Text next to Clean_Text)\n", "df" ] }, { "cell_type": "code", "execution_count": 10, "id": "450c39c0-79dd-4eaf-85fe-57e344eb81bd", "metadata": {}, "outputs": [], "source": [ "# Features (cleaned text) and labels (emotion class) used for training\n", "Xfeatures = df['Clean_Text']\n", "ylabels = df['Emotion']" ] }, { "cell_type": "markdown", "id": "edde3d4b", "metadata": {}, "source": [ "# Split before fitting the pipeline: the vectorizer and classifier are then fitted on the training rows only, which prevents data leakage from the test set."
] }, { "cell_type": "code", "execution_count": 11, "id": "27d7f976-c28f-449e-ae1a-53a42bbda4e8", "metadata": {}, "outputs": [], "source": [ "# Split Data: hold out 30% for testing; random_state fixes the shuffle so the split is reproducible\n", "x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=42)" ] }, { "cell_type": "code", "execution_count": 12, "id": "2f086f29-dba9-40d2-a9dd-f06a6cca3a4c", "metadata": {}, "outputs": [], "source": [ "# Build Pipeline\n", "from sklearn.pipeline import Pipeline" ] }, { "cell_type": "code", "execution_count": 13, "id": "6b81cc86-2bef-40c2-b9a3-668caaadaff0", "metadata": {}, "outputs": [], "source": [ "# LogisticRegression Pipeline: bag-of-words counts fed into a logistic-regression classifier\n", "# max_iter is raised above scikit-learn's default of 100 because the recorded fit stopped at the\n", "# lbfgs iteration limit (ConvergenceWarning); raise it further if the warning reappears\n", "pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression(max_iter=1000))])" ] }, { "cell_type": "code", "execution_count": 14, "id": "dc64b9a7-efe2-4bc4-a0e7-46dff1d52b31", "metadata": { "jupyter": { "outputs_hidden": true }, "scrolled": false, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Sanket\\anaconda3\\envs\\nlp\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", "\n", "Increase the number of iterations (max_iter) or scale the data as shown in:\n", " https://scikit-learn.org/stable/modules/preprocessing.html\n", "Please also refer to the documentation for alternative solver options:\n", " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", " n_iter_i = _check_optimize_result(\n" ] }, { "data": { "text/html": [ "
Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Train and Fit Data (vectorizer and classifier are fitted on the training split only)\n", "pipe_lr.fit(x_train,y_train)" ] }, { "cell_type": "code", "execution_count": 15, "id": "135ed6f8-56ff-4d53-85e3-541e3a7ae2d7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe_lr" ] }, { "cell_type": "code", "execution_count": 16, "id": "28396371-5f5c-4a3b-b974-164e047764f3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.619946349875455" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check Accuracy on the held-out test split\n", "pipe_lr.score(x_test,y_test)" ] }, { "cell_type": "code", "execution_count": 17, "id": "eb3a26b6-d09e-422f-991b-b08c48f55b75", "metadata": {}, "outputs": [], "source": [ "# Make A Prediction\n", "ex1 = \"This book was so interesting it made me happy\"" ] }, { "cell_type": "code", "execution_count": 18, "id": "b08597d9-6f59-45cb-a648-95b0da1ce313", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['joy'], dtype=object)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NOTE(review): ex1 is passed as raw text, while training used Clean_Text (user handles and\n", "# stopwords removed) - confirm whether inference inputs should get the same cleaning\n", "pipe_lr.predict([ex1])" ] }, { "cell_type": "code", "execution_count": 19, "id": "5b3822ac-17fc-43dd-9bb7-8dad07a4d32c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.60505334e-03, 7.06448086e-03, 6.95652453e-03, 9.43810868e-01,\n", " 1.00440585e-04, 2.63232385e-02, 6.63277122e-05, 1.40730665e-02]])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Prediction Prob (columns follow the order of pipe_lr.classes_)\n", "pipe_lr.predict_proba([ex1])" ] }, { "cell_type": "code", "execution_count": 20, "id": "5b7c4596-d643-48e5-a777-79a6f55c49da", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'shame',\n", " 'surprise'], dtype=object)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To Know the classes\n", "pipe_lr.classes_" ] }, { "cell_type": "code", "execution_count": 21, "id": "c0d40f62-b1fd-4748-a279-c8f50c748f26", 
"metadata": {}, "outputs": [], "source": [ "# Save Model & Pipeline\n", "import joblib\n", "# The with-block closes the file even if joblib.dump raises\n", "with open(\"../models/emotion_classifier_pipe_lr.pkl\",\"wb\") as pipeline_file:\n", "    joblib.dump(pipe_lr,pipeline_file)" ] }, { "cell_type": "code", "execution_count": null, "id": "377c4e98-67f0-45e5-8dd5-0417585754f0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }