Spaces:

aim9061
/

sentiment-analysis

Runtime error

App Files Files Community

Alex committed on May 4, 2023

Commit

d311f5c

•

1 Parent(s): 5a9eb5a

added comments

Browse files

Files changed (1) hide show

milestone3.ipynb +87 -101

milestone3.ipynb CHANGED Viewed

@@ -1,18 +1,4 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
   "cells": [
     {
       "cell_type": "code",
@@ -26,8 +12,8 @@
       },
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
           "text": [
             "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
             "Collecting datasets\n",
@@ -110,12 +96,13 @@
         }
       ],
       "source": [
         "!pip install datasets\n",
         "!pip install transformers\n",
         "import pandas as pd\n",
         "from sklearn.model_selection import train_test_split\n",
         "import numpy as np\n",
-        "import transformers\n",
         "import torch\n",
         "import csv\n",
         "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
@@ -124,17 +111,7 @@
     },
     {
       "cell_type": "code",
-      "source": [
-        "filename = \"/content/sample_data/train.csv\"\n",
-        "df = pd.read_csv(filename)\n",
-        "df.head()\n",
-        "df.drop(['id'], inplace=True, axis=1)\n",
-        "newdf = pd.DataFrame()\n",
-        "newdf['text'] = df['comment_text']\n",
-        "newdf['labels'] = df.iloc[:, 1:].values.tolist()\n",
-        "\n",
-        "newdf.head()"
-      ],
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -143,12 +120,11 @@
         "id": "XQEDvn-7ksXU",
         "outputId": "960bd74f-2533-4eab-9800-643823e14f2f"
       },
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "error",
           "ename": "FileNotFoundError",
           "evalue": "ignored",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
@@ -163,32 +139,22 @@
             "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/sample_data/train.csv'"
           ]
         }
       ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "epoch = 1\n",
-        "max_len = 128\n",
-        "batch_size = 5\n",
-        "\n",
-        "train_df, val_df = train_test_split(newdf, test_size=0.2, random_state=42)\n",
-        "\"\"\"\n",
-        "DistilBertTokenizer\n",
-        "torch.utils.data.Dataset\n",
-        "inputs = self.tokenizer.encode_plus\n",
-        "DataLoader\n",
-        "PreTrainedModel\n",
-        "DistilBertForSequenceClassification\n",
-        "DistilBertConfig\n",
-        "model = DistilBertClassifier2(config)\n",
-        "model.to(device)\n",
-        "torch.optim.Adam\n",
-        "tokenizer.encode_plus\n",
-        "tokenizer.save_pretrained(\"model\")\n",
-        "model.save_pretrained(\"model\")\n",
-        "\"\"\""
-      ],
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -197,12 +163,11 @@
         "id": "DO8fKxgnwIPz",
         "outputId": "d0f73814-62a0-4d74-9353-3d4ce90b6d1b"
       },
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "error",
           "ename": "NameError",
           "evalue": "ignored",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
@@ -210,17 +175,29 @@
             "\u001b[0;31mNameError\u001b[0m: name 'newdf' is not defined"
           ]
         }
       ]
     },
     {
       "cell_type": "code",
       "source": [
-        "class DS(Dataset):\n",
         "    def __init__(self, dataframe, max_len):\n",
-        "        self.data = dataframe\n",
         "        self.max_len = max_len\n",
-        "        self.text = dataframe.text\n",
-        "        self.targets = self.data.labels\n",
         "        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
         "    \n",
         "    def __len__(self):\n",
@@ -230,7 +207,7 @@
         "        text = str(self.text.iloc[index])\n",
         "        text = \" \".join(text.split())\n",
         "\n",
-        "        inputs = self.tokenizer.encode_plus(\n",
         "            text, None,\n",
         "            add_special_tokens=True,\n",
         "            max_length=self.max_len,\n",
@@ -239,41 +216,30 @@
         "        ids = inputs['input_ids']\n",
         "        mask = inputs['attention_mask']\n",
         "        token_type_ids = inputs[\"token_type_ids\"]\n",
-        "        return {\n",
         "            'ids': torch.tensor(ids, dtype=torch.long),\n",
         "            'attention_mask': torch.tensor(mask, dtype=torch.long),\n",
         "            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n",
         "            'labels': torch.tensor(self.targets.iloc[index], dtype=torch.float)\n",
         "        }\n"
-      ],
-      "metadata": {
-        "id": "i80qLafpzWDh"
-      },
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
-      "source": [
-        "traindata = DS(train_df, max_len)\n",
-        "validdata = DS(val_df, max_len)\n",
-        "train_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True)\n",
-        "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n"
-      ],
       "metadata": {
-        "id": "EMScGH58Poaw",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 201
         },
         "outputId": "de081257-fb6c-4c73-c54d-e88dd1e2603f"
       },
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "error",
           "ename": "NameError",
           "evalue": "ignored",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
@@ -281,33 +247,30 @@
             "\u001b[0;31mNameError\u001b[0m: name 'train_df' is not defined"
           ]
         }
       ]
     },
     {
       "cell_type": "code",
       "metadata": {
-        "id": "fb9-Yr9YDZqo",
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 235
         },
         "outputId": "0664e5d0-55cb-4b58-e75a-b9acdab82e73"
       },
-      "source": [
-        "device = torch.device('cuda')\n",
-        "\n",
-        "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type=\"multi_label_classification\")\n",
-        "model.to(device)\n",
-        "model.train()\n",
-        "\n",
-        "optimizer = AdamW(model.parameters(), lr=5e-5)\n"
-      ],
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "error",
           "ename": "NameError",
           "evalue": "ignored",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
@@ -315,10 +278,24 @@
             "\u001b[0;31mNameError\u001b[0m: name 'torch' is not defined"
           ]
         }
       ]
     },
     {
       "cell_type": "code",
       "source": [
         "for i in range(epoch):\n",
         "  for batch in train_loader:\n",
@@ -333,15 +310,15 @@
         "    loss.backward()\n",
         "    optimizer.step()\n",
         "model.eval()\n"
-      ],
-      "metadata": {
-        "id": "mtMhE5_z8kw8"
-      },
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
       "source": [
         "xtrain = [\"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!\"]\n",
         "batch = tokenizer(xtrain, truncation=True, padding='max_length', return_tensors=\"pt\").to(device)\n",
@@ -351,14 +328,23 @@
         "  results = torch.sigmoid(outputs.logits)*100\n",
         "  print(results)\n",
         "\n",
-        "model.save_pretrained(\"pretrained_model\")\n",
         "tokenizer.save_pretrained(\"model_tokenizer\")"
-      ],
-      "metadata": {
-        "id": "8T4UG8K8BvUn"
-      },
-      "execution_count": null,
-      "outputs": []
     }
-  ]
-}

 {
   "cells": [
     {
       "cell_type": "code",
       },
       "outputs": [
         {
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
             "Collecting datasets\n",
         }
       ],
       "source": [
+        "#list of import statements\n",
         "!pip install datasets\n",
         "!pip install transformers\n",
         "import pandas as pd\n",
         "from sklearn.model_selection import train_test_split\n",
         "import numpy as np\n",
+        "import transformers \n",
         "import torch\n",
         "import csv\n",
         "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
         "id": "XQEDvn-7ksXU",
         "outputId": "960bd74f-2533-4eab-9800-643823e14f2f"
       },
       "outputs": [
         {
           "ename": "FileNotFoundError",
           "evalue": "ignored",
+          "output_type": "error",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
             "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/sample_data/train.csv'"
           ]
         }
+      ],
+      "source": [
+        "filename = \"/content/sample_data/train.csv\" #path to the training CSV, which is read into a pandas DataFrame on the next line\n",
+        "df = pd.read_csv(filename)\n",
+        "df.head()\n",
+        "df.drop(['id'], inplace=True, axis=1)\n",
+        "newdf = pd.DataFrame()\n",
+        "newdf['text'] = df['comment_text']\n",
+        "newdf['labels'] = df.iloc[:, 1:].values.tolist()\n",
+        "\n",
+        "newdf.head()"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
         "id": "DO8fKxgnwIPz",
         "outputId": "d0f73814-62a0-4d74-9353-3d4ce90b6d1b"
       },
       "outputs": [
         {
           "ename": "NameError",
           "evalue": "ignored",
+          "output_type": "error",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
             "\u001b[0;31mNameError\u001b[0m: name 'newdf' is not defined"
           ]
         }
+      ],
+      "source": [
+        "epoch = 1\n",
+        "max_len = 128\n",
+        "batch_size = 5\n",
+        "\n",
+        "train_df, val_df = train_test_split(newdf, test_size=0.2, random_state=42) #splits the dataframe into training data (80%) and validation data (20%)\n"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "i80qLafpzWDh"
+      },
+      "outputs": [],
       "source": [
+        "class DS(Dataset): #this creates the dataset class\n",
         "    def __init__(self, dataframe, max_len):\n",
+        "        self.data = dataframe #takes in the dataframe from earlier\n",
         "        self.max_len = max_len\n",
+        "        self.text = dataframe.text #the text column of the dataframe\n",
+        "        self.targets = self.data.labels \n",
         "        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
         "    \n",
         "    def __len__(self):\n",
         "        text = str(self.text.iloc[index])\n",
         "        text = \" \".join(text.split())\n",
         "\n",
+        "        inputs = self.tokenizer.encode_plus( #tokenizes the whitespace-normalized text\n",
         "            text, None,\n",
         "            add_special_tokens=True,\n",
         "            max_length=self.max_len,\n",
         "        ids = inputs['input_ids']\n",
         "        mask = inputs['attention_mask']\n",
         "        token_type_ids = inputs[\"token_type_ids\"]\n",
+        "        return { #returns the encoded sample and its labels as a dict of torch tensors\n",
         "            'ids': torch.tensor(ids, dtype=torch.long),\n",
         "            'attention_mask': torch.tensor(mask, dtype=torch.long),\n",
         "            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n",
         "            'labels': torch.tensor(self.targets.iloc[index], dtype=torch.float)\n",
         "        }\n"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 201
         },
+        "id": "EMScGH58Poaw",
         "outputId": "de081257-fb6c-4c73-c54d-e88dd1e2603f"
       },
       "outputs": [
         {
           "ename": "NameError",
           "evalue": "ignored",
+          "output_type": "error",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
             "\u001b[0;31mNameError\u001b[0m: name 'train_df' is not defined"
           ]
         }
+      ],
+      "source": [
+        "traindata = DS(train_df, max_len) #creates training dataset\n",
+        "validdata = DS(val_df, max_len) #creates validation dataset\n",
+        "train_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True) #loads the dataset into dataloader\n",
+        "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
           "height": 235
         },
+        "id": "fb9-Yr9YDZqo",
         "outputId": "0664e5d0-55cb-4b58-e75a-b9acdab82e73"
       },
       "outputs": [
         {
           "ename": "NameError",
           "evalue": "ignored",
+          "output_type": "error",
           "traceback": [
             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
             "\u001b[0;31mNameError\u001b[0m: name 'torch' is not defined"
           ]
         }
+      ],
+      "source": [
+        "device = torch.device('cuda')\n",
+        "\n",
+        "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type=\"multi_label_classification\")\n",
+        "model.to(device)\n",
+        "model.train() #puts the model in training mode; the actual training happens in the loop in a later cell\n",
+        "\n",
+        "optimizer = AdamW(model.parameters(), lr=5e-5)\n"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "mtMhE5_z8kw8"
+      },
+      "outputs": [],
       "source": [
         "for i in range(epoch):\n",
         "  for batch in train_loader:\n",
         "    loss.backward()\n",
         "    optimizer.step()\n",
         "model.eval()\n"
+      ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8T4UG8K8BvUn"
+      },
+      "outputs": [],
       "source": [
         "xtrain = [\"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!\"]\n",
         "batch = tokenizer(xtrain, truncation=True, padding='max_length', return_tensors=\"pt\").to(device)\n",
         "  results = torch.sigmoid(outputs.logits)*100\n",
         "  print(results)\n",
         "\n",
+        "model.save_pretrained(\"pretrained_model\") #saves the trained model\n",
         "tokenizer.save_pretrained(\"model_tokenizer\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
     }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}