{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c415192a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Environment setup. `%pip` installs into the running kernel's environment; `apt-get` is a\n",
    "# shell command rather than Python, so it needs the `!` escape.\n",
    "%pip install -q datasets transformers huggingface_hub\n",
    "!apt-get install -y -qq git-lfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd861c86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel can run the notebook top to bottom.\n",
    "import io\n",
    "import os\n",
    "from collections import Counter  # used by the word-frequency cells\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mtick  # used by the class-distribution plot\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import wordcloud\n",
    "from datasets import load_dataset\n",
    "from huggingface_hub import notebook_login\n",
    "from sklearn.model_selection import train_test_split\n",
    "from transformers import (\n",
    "    AutoModelForSequenceClassification,\n",
    "    AutoTokenizer,\n",
    "    DataCollatorWithPadding,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    ")\n",
    "from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator\n",
    "\n",
    "# `load_metric` is deprecated in favour of the separate `evaluate` package and is missing from\n",
    "# recent `datasets` releases; fall back to None so the remaining imports still load.\n",
    "try:\n",
    "    from datasets import load_metric\n",
    "except ImportError:\n",
    "    load_metric = None\n",
    "\n",
    "# `google.colab` is only importable on Colab (this cell previously failed elsewhere with\n",
    "# ModuleNotFoundError). Keep both names defined so later cells can test for them.\n",
    "try:\n",
    "    from google.colab import drive, files\n",
    "except ModuleNotFoundError:\n",
    "    drive = files = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b2e212f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disable W&B logging before any Trainer is created\n",
    "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
    "\n",
    "# Tokenizer shared by the tokenization step and the Trainer\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "\n",
    "# Collator that pads each batch and converts it to PyTorch tensors\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5d8f134",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mount Google Drive so the CSV files under /content/drive are reachable (Colab only).\n",
    "if drive is not None:\n",
    "    drive.mount('/content/drive')\n",
    "else:\n",
    "    print(\"google.colab is not available; skipping the Drive mount.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "705a32e0",
   "metadata": {},
   "source": [
    "## Data set importation"
   ]
  },
{
   "cell_type": "code",
   "execution_count": null,
   "id": "d7e95a22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training rows with any missing field are dropped at load time; the test set keeps every\n",
    "# row and gets empty strings in place of missing values.\n",
    "train = pd.read_csv(\"/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv\").dropna(axis=0)\n",
    "test = pd.read_csv(\"/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv\").fillna(\"\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb162c55",
   "metadata": {},
   "source": [
    "## Data cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c676772",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37512519",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f922334",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ec92e68",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2179d7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eebc84d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove missing values. The load cell already drops them; this is kept as an explicit\n",
    "# safeguard and rebinds the name instead of mutating the frame in place.\n",
    "train = train.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2fe9dc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b98c837e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy of the data without its first column, used for the exploratory analysis below.\n",
    "# Assumes the first column holds the tweet/customer ID - TODO confirm against the CSV.\n",
    "# `train` itself is left untouched: the modeling section lists 'tweet_id' in `remove_columns`,\n",
    "# and slicing into a new name keeps this cell safe to re-run.\n",
    "train_data = train.iloc[:, 1:].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bb1a46b",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47daf33f",
   "metadata": {},
   "source": [
    "## EXPLORATORY DATA ANALYSIS"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "57bc229c",
   "metadata": {},
   "source": [
    "\n",
    "Class Distribution Analysis:\n",
    "\n",
    "Examine the distribution of the \"label\" column to understand the balance 
between different classes (e.g., positive, negative, neutral, etc.). Visualize this distribution using bar plots or pie charts.\n",
    "\n",
    "Agreement Analysis:\n",
    "\n",
    "Investigate the \"agreement\" column to understand the level of agreement between annotators or classifiers. Analyze the distribution of agreement scores and determine if there are discrepancies or inconsistencies. Visualize this distribution using histograms or box plots.\n",
    "\n",
    "Text Length Analysis:\n",
    "\n",
    "Explore the length of the \"safe_text\" column (number of characters or words). Calculate summary statistics such as mean, median, minimum, maximum, and standard deviation. Plot histograms or box plots to visualize the distribution of text lengths.\n",
    "\n",
    "Word Frequency Analysis:\n",
    "\n",
    "Identify the most frequent words or terms in the \"safe_text\" column. Generate a word cloud or bar chart to visualize the top N words. This analysis can provide insights into the most commonly used language in the dataset.\n",
    "\n",
    "Class Distribution Analysis\n",
    "\n",
    "Examine the distribution of the \"label\" column to understand the balance between different classes (e.g., positive, negative, neutral)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3877ad2",
   "metadata": {},
   "source": [
    "Class Distribution Analysis\n",
    "\n",
    "Examine the distribution of the \"label\" column to understand the balance between different classes (e.g., positive, negative, neutral)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f4ca78f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of tweets per sentiment class, in percent.\n",
    "label_names = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}\n",
    "label_share = train_data['label'].value_counts(normalize=True) * 100.0\n",
    "\n",
    "# Name every slice from its own label value; `value_counts` orders by frequency, so a fixed\n",
    "# list of names would silently mislabel slices whenever that order differs.\n",
    "slice_labels = [label_names.get(value, str(value)) for value in label_share.index]\n",
    "\n",
    "ax = label_share.plot.pie(autopct='%.1f%%', labels=slice_labels, figsize=(5, 5), fontsize=12)\n",
    "ax.set_ylabel('', fontsize=12)\n",
    "ax.set_title('Distribution of tweet', fontsize=12)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3cbe8f79",
   "metadata": {},
   "source": [
    "Agreement Analysis:\n",
    "\n",
    "Investigate the \"agreement\" column to understand the level of agreement between annotators or classifiers. Analyze the distribution of agreement scores and determine if there are discrepancies or inconsistencies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c76a978",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of tweets at each agreement level: levels on the x axis, counts on the y axis.\n",
    "ax = train_data['agreement'].value_counts().plot(kind='bar', rot=45, width=0.3)\n",
    "ax.set_xlabel('Agreement')\n",
    "ax.set_ylabel('Count')\n",
    "ax.set_title('Agreement Distribution')\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49714fb9",
   "metadata": {},
   "source": [
    "Text Length Analysis\n",
    "\n",
    "Explore the length of the \"safe_text\" column (number of characters or words)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9020da9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Length of each text in characters (vectorised string accessor)\n",
    "train_data['text_length'] = train_data['safe_text'].str.len()\n",
    "\n",
    "# Plot the text length distribution\n",
    "plt.hist(train_data['text_length'], bins=20)\n",
    "plt.xlabel('Text Length')\n",
    "plt.ylabel('Count')\n",
    "plt.title('Text Length Distribution')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "126e79c7",
   "metadata": {},
   "source": [
    "Word Frequency Analysis\n",
    "\n",
    "Identify the most frequent words or terms in the \"safe_text\" column. This analysis can provide insights into the most commonly used language in the dataset."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "933690da",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from collections import Counter\n",
    "\n",
    "# Count word occurrences tweet by tweet. Tokens are lower-cased runs of word characters;\n",
    "# this standard-library tokenizer stands in for `word_tokenize`, which the notebook never\n",
    "# imports or installs.\n",
    "word_freq = Counter(\n",
    "    token\n",
    "    for text in train_data['safe_text']\n",
    "    for token in re.findall(r\"\\w+\", text.lower())\n",
    ")\n",
    "\n",
    "# Get the top 10 most used words\n",
    "top_10_words = word_freq.most_common(10)\n",
    "\n",
    "# Print the top 10 words and their frequencies\n",
    "for word, freq in top_10_words:\n",
    "    print(f\"Word: {word}\\tFrequency: {freq}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f64e638e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar chart of the ten most frequent words computed in the previous cell\n",
    "words = [word for word, _ in top_10_words]\n",
    "frequencies = [freq for _, freq in top_10_words]\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.bar(words, frequencies)\n",
    "plt.xlabel('Words')\n",
    "plt.ylabel('Frequency')\n",
    "plt.title('Top 10 Most Used Words')\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72d98f6b",
   "metadata": {},
   "source": [
    "## Modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c955af2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stratified 80/20 split. The halves get their own names so `train` is not overwritten\n",
    "# (the cell stays re-runnable) and the builtin `eval` is not shadowed.\n",
    "train_split, eval_split = train_test_split(\n",
    "    train, test_size=0.2, random_state=42, stratify=train['label']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df8ba692",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"new dataframe shapes: train is {train_split.shape}, eval is {eval_split.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b7e04ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the split subsets; the same paths are read back by `load_dataset` below\n",
    "SUBSET_DIR = \"/content/drive/MyDrive/PostBAP_ASSESSMENT\"\n",
    "TRAIN_SUBSET_CSV = f\"{SUBSET_DIR}/train_subset.csv\"\n",
    "EVAL_SUBSET_CSV = f\"{SUBSET_DIR}/eval_subset.csv\"\n",
    "\n",
    "train_split.to_csv(TRAIN_SUBSET_CSV, index=False)\n",
    "eval_split.to_csv(EVAL_SUBSET_CSV, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9590e1f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pandas wrote the subsets with its default UTF-8 encoding, so they are read back as UTF-8;\n",
    "# decoding them as ISO-8859-1 would garble every non-ASCII character in the tweets.\n",
    "raw_dataset = load_dataset(\n",
    "    'csv',\n",
    "    data_files={'train': TRAIN_SUBSET_CSV, 'eval': EVAL_SUBSET_CSV},\n",
    "    encoding=\"utf-8\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c27314c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw sentiment label -> class id used by the 3-way classifier\n",
    "LABEL_TO_ID = {-1: 0, 0: 1, 1: 2}  # Negative, Neutral, Positive\n",
    "\n",
    "\n",
    "def transform_labels(label):\n",
    "    \"\"\"Return {'labels': class_id} for one dataset row.\n",
    "\n",
    "    `label` is the row dict; its 'label' field holds -1, 0 or 1. Any other value\n",
    "    falls back to class 0, as it did before.\n",
    "    \"\"\"\n",
    "    return {'labels': LABEL_TO_ID.get(label['label'], 0)}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a39bf13",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_data(example):\n",
    "    \"\"\"Tokenize the 'safe_text' field of a batch of rows.\n",
    "\n",
    "    Rows are padded to the tokenizer's maximum length, and longer texts are truncated to it,\n",
    "    so every encoded row has the same length.\n",
    "    \"\"\"\n",
    "    return tokenizer(example['safe_text'], padding='max_length', truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6e85205",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change the tweets to tokens that the models can exploit\n",
    "tokenized_dataset = raw_dataset.map(tokenize_data, batched=True)\n",
    "\n",
    "# Transform labels and remove the raw columns. Only columns actually present are listed,\n",
    "# because `map` rejects names that are not in the dataset.\n",
    "raw_columns = ['tweet_id', 'label', 'safe_text', 'agreement']\n",
    "remove_columns = [name for name in raw_columns if name in tokenized_dataset['train'].column_names]\n",
    "dataset = tokenized_dataset.map(transform_labels, remove_columns=remove_columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "229f9b59",
   "metadata": {},
   "outputs": [],
   "source": [
    "training_args = TrainingArguments(\"test_trainer\", num_train_epochs= 10, 
load_best_model_at_end=True,evaluation_strategy= \"steps\")\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels=3)\n",
    "\n",
    "# Append .select(range(n)) to either line to work on a subset\n",
    "train_dataset = dataset['train'].shuffle(seed=20)\n",
    "eval_dataset = dataset['eval'].shuffle(seed=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6824dd84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(eval_pred):\n",
    "    \"\"\"Return {'accuracy': value} for a (logits, labels) pair.\n",
    "\n",
    "    Accuracy is computed directly with NumPy, so the deprecated `load_metric` helper is\n",
    "    not needed.\n",
    "    \"\"\"\n",
    "    logits, labels = eval_pred\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return {\"accuracy\": float(np.mean(predictions == labels))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03b3c80f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One Trainer, built once with everything it needs, so accuracy is reported during the\n",
    "# periodic evaluations as well as in the final one.\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    eval_dataset=eval_dataset,\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a1b2a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a03352f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Launch the final evaluation\n",
    "trainer.evaluate()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }