Alex committed on
Commit
d311f5c
1 Parent(s): 5a9eb5a

added comments

Browse files
Files changed (1) hide show
  1. milestone3.ipynb +87 -101
milestone3.ipynb CHANGED
@@ -1,18 +1,4 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": []
7
- },
8
- "kernelspec": {
9
- "name": "python3",
10
- "display_name": "Python 3"
11
- },
12
- "language_info": {
13
- "name": "python"
14
- }
15
- },
16
  "cells": [
17
  {
18
  "cell_type": "code",
@@ -26,8 +12,8 @@
26
  },
27
  "outputs": [
28
  {
29
- "output_type": "stream",
30
  "name": "stdout",
 
31
  "text": [
32
  "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
33
  "Collecting datasets\n",
@@ -110,12 +96,13 @@
110
  }
111
  ],
112
  "source": [
 
113
  "!pip install datasets\n",
114
  "!pip install transformers\n",
115
  "import pandas as pd\n",
116
  "from sklearn.model_selection import train_test_split\n",
117
  "import numpy as np\n",
118
- "import transformers\n",
119
  "import torch\n",
120
  "import csv\n",
121
  "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
@@ -124,17 +111,7 @@
124
  },
125
  {
126
  "cell_type": "code",
127
- "source": [
128
- "filename = \"/content/sample_data/train.csv\"\n",
129
- "df = pd.read_csv(filename)\n",
130
- "df.head()\n",
131
- "df.drop(['id'], inplace=True, axis=1)\n",
132
- "newdf = pd.DataFrame()\n",
133
- "newdf['text'] = df['comment_text']\n",
134
- "newdf['labels'] = df.iloc[:, 1:].values.tolist()\n",
135
- "\n",
136
- "newdf.head()"
137
- ],
138
  "metadata": {
139
  "colab": {
140
  "base_uri": "https://localhost:8080/",
@@ -143,12 +120,11 @@
143
  "id": "XQEDvn-7ksXU",
144
  "outputId": "960bd74f-2533-4eab-9800-643823e14f2f"
145
  },
146
- "execution_count": null,
147
  "outputs": [
148
  {
149
- "output_type": "error",
150
  "ename": "FileNotFoundError",
151
  "evalue": "ignored",
 
152
  "traceback": [
153
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
154
  "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
@@ -163,32 +139,22 @@
163
  "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/sample_data/train.csv'"
164
  ]
165
  }
 
 
 
 
 
 
 
 
 
 
 
166
  ]
167
  },
168
  {
169
  "cell_type": "code",
170
- "source": [
171
- "epoch = 1\n",
172
- "max_len = 128\n",
173
- "batch_size = 5\n",
174
- "\n",
175
- "train_df, val_df = train_test_split(newdf, test_size=0.2, random_state=42)\n",
176
- "\"\"\"\n",
177
- "DistilBertTokenizer\n",
178
- "torch.utils.data.Dataset\n",
179
- "inputs = self.tokenizer.encode_plus\n",
180
- "DataLoader\n",
181
- "PreTrainedModel\n",
182
- "DistilBertForSequenceClassification\n",
183
- "DistilBertConfig\n",
184
- "model = DistilBertClassifier2(config)\n",
185
- "model.to(device)\n",
186
- "torch.optim.Adam\n",
187
- "tokenizer.encode_plus\n",
188
- "tokenizer.save_pretrained(\"model\")\n",
189
- "model.save_pretrained(\"model\")\n",
190
- "\"\"\""
191
- ],
192
  "metadata": {
193
  "colab": {
194
  "base_uri": "https://localhost:8080/",
@@ -197,12 +163,11 @@
197
  "id": "DO8fKxgnwIPz",
198
  "outputId": "d0f73814-62a0-4d74-9353-3d4ce90b6d1b"
199
  },
200
- "execution_count": null,
201
  "outputs": [
202
  {
203
- "output_type": "error",
204
  "ename": "NameError",
205
  "evalue": "ignored",
 
206
  "traceback": [
207
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
208
  "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
@@ -210,17 +175,29 @@
210
  "\u001b[0;31mNameError\u001b[0m: name 'newdf' is not defined"
211
  ]
212
  }
 
 
 
 
 
 
 
213
  ]
214
  },
215
  {
216
  "cell_type": "code",
 
 
 
 
 
217
  "source": [
218
- "class DS(Dataset):\n",
219
  " def __init__(self, dataframe, max_len):\n",
220
- " self.data = dataframe\n",
221
  " self.max_len = max_len\n",
222
- " self.text = dataframe.text\n",
223
- " self.targets = self.data.labels\n",
224
  " self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
225
  " \n",
226
  " def __len__(self):\n",
@@ -230,7 +207,7 @@
230
  " text = str(self.text.iloc[index])\n",
231
  " text = \" \".join(text.split())\n",
232
  "\n",
233
- " inputs = self.tokenizer.encode_plus(\n",
234
  " text, None,\n",
235
  " add_special_tokens=True,\n",
236
  " max_length=self.max_len,\n",
@@ -239,41 +216,30 @@
239
  " ids = inputs['input_ids']\n",
240
  " mask = inputs['attention_mask']\n",
241
  " token_type_ids = inputs[\"token_type_ids\"]\n",
242
- " return {\n",
243
  " 'ids': torch.tensor(ids, dtype=torch.long),\n",
244
  " 'attention_mask': torch.tensor(mask, dtype=torch.long),\n",
245
  " 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n",
246
  " 'labels': torch.tensor(self.targets.iloc[index], dtype=torch.float)\n",
247
  " }\n"
248
- ],
249
- "metadata": {
250
- "id": "i80qLafpzWDh"
251
- },
252
- "execution_count": null,
253
- "outputs": []
254
  },
255
  {
256
  "cell_type": "code",
257
- "source": [
258
- "traindata = DS(train_df, max_len)\n",
259
- "validdata = DS(val_df, max_len)\n",
260
- "train_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True)\n",
261
- "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n"
262
- ],
263
  "metadata": {
264
- "id": "EMScGH58Poaw",
265
  "colab": {
266
  "base_uri": "https://localhost:8080/",
267
  "height": 201
268
  },
 
269
  "outputId": "de081257-fb6c-4c73-c54d-e88dd1e2603f"
270
  },
271
- "execution_count": null,
272
  "outputs": [
273
  {
274
- "output_type": "error",
275
  "ename": "NameError",
276
  "evalue": "ignored",
 
277
  "traceback": [
278
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
279
  "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
@@ -281,33 +247,30 @@
281
  "\u001b[0;31mNameError\u001b[0m: name 'train_df' is not defined"
282
  ]
283
  }
 
 
 
 
 
 
284
  ]
285
  },
286
  {
287
  "cell_type": "code",
 
288
  "metadata": {
289
- "id": "fb9-Yr9YDZqo",
290
  "colab": {
291
  "base_uri": "https://localhost:8080/",
292
  "height": 235
293
  },
 
294
  "outputId": "0664e5d0-55cb-4b58-e75a-b9acdab82e73"
295
  },
296
- "source": [
297
- "device = torch.device('cuda')\n",
298
- "\n",
299
- "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type=\"multi_label_classification\")\n",
300
- "model.to(device)\n",
301
- "model.train()\n",
302
- "\n",
303
- "optimizer = AdamW(model.parameters(), lr=5e-5)\n"
304
- ],
305
- "execution_count": null,
306
  "outputs": [
307
  {
308
- "output_type": "error",
309
  "ename": "NameError",
310
  "evalue": "ignored",
 
311
  "traceback": [
312
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
313
  "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
@@ -315,10 +278,24 @@
315
  "\u001b[0;31mNameError\u001b[0m: name 'torch' is not defined"
316
  ]
317
  }
 
 
 
 
 
 
 
 
 
318
  ]
319
  },
320
  {
321
  "cell_type": "code",
 
 
 
 
 
322
  "source": [
323
  "for i in range(epoch):\n",
324
  " for batch in train_loader:\n",
@@ -333,15 +310,15 @@
333
  " loss.backward()\n",
334
  " optimizer.step()\n",
335
  "model.eval()\n"
336
- ],
337
- "metadata": {
338
- "id": "mtMhE5_z8kw8"
339
- },
340
- "execution_count": null,
341
- "outputs": []
342
  },
343
  {
344
  "cell_type": "code",
 
 
 
 
 
345
  "source": [
346
  "xtrain = [\"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!\"]\n",
347
  "batch = tokenizer(xtrain, truncation=True, padding='max_length', return_tensors=\"pt\").to(device)\n",
@@ -351,14 +328,23 @@
351
  " results = torch.sigmoid(outputs.logits)*100\n",
352
  " print(results)\n",
353
  "\n",
354
- "model.save_pretrained(\"pretrained_model\")\n",
355
  "tokenizer.save_pretrained(\"model_tokenizer\")"
356
- ],
357
- "metadata": {
358
- "id": "8T4UG8K8BvUn"
359
- },
360
- "execution_count": null,
361
- "outputs": []
 
 
 
 
 
 
 
362
  }
363
- ]
364
- }
 
 
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
 
12
  },
13
  "outputs": [
14
  {
 
15
  "name": "stdout",
16
+ "output_type": "stream",
17
  "text": [
18
  "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
19
  "Collecting datasets\n",
 
96
  }
97
  ],
98
  "source": [
99
+ "#installs the required packages and imports the libraries used below\n",
100
  "!pip install datasets\n",
101
  "!pip install transformers\n",
102
  "import pandas as pd\n",
103
  "from sklearn.model_selection import train_test_split\n",
104
  "import numpy as np\n",
105
+ "import transformers \n",
106
  "import torch\n",
107
  "import csv\n",
108
  "from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler\n",
 
111
  },
112
  {
113
  "cell_type": "code",
114
+ "execution_count": null,
 
 
 
 
 
 
 
 
 
 
115
  "metadata": {
116
  "colab": {
117
  "base_uri": "https://localhost:8080/",
 
120
  "id": "XQEDvn-7ksXU",
121
  "outputId": "960bd74f-2533-4eab-9800-643823e14f2f"
122
  },
 
123
  "outputs": [
124
  {
 
125
  "ename": "FileNotFoundError",
126
  "evalue": "ignored",
127
+ "output_type": "error",
128
  "traceback": [
129
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
130
  "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
 
139
  "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/content/sample_data/train.csv'"
140
  ]
141
  }
142
+ ],
143
+ "source": [
144
+ "filename = \"/content/sample_data/train.csv\" #path to the training CSV, which is read into a pandas DataFrame on the next line\n",
145
+ "df = pd.read_csv(filename)\n",
146
+ "df.head()\n",
147
+ "df.drop(['id'], inplace=True, axis=1)\n",
148
+ "newdf = pd.DataFrame()\n",
149
+ "newdf['text'] = df['comment_text']\n",
150
+ "newdf['labels'] = df.iloc[:, 1:].values.tolist()\n",
151
+ "\n",
152
+ "newdf.head()"
153
  ]
154
  },
155
  {
156
  "cell_type": "code",
157
+ "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  "metadata": {
159
  "colab": {
160
  "base_uri": "https://localhost:8080/",
 
163
  "id": "DO8fKxgnwIPz",
164
  "outputId": "d0f73814-62a0-4d74-9353-3d4ce90b6d1b"
165
  },
 
166
  "outputs": [
167
  {
 
168
  "ename": "NameError",
169
  "evalue": "ignored",
170
+ "output_type": "error",
171
  "traceback": [
172
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
173
  "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
 
175
  "\u001b[0;31mNameError\u001b[0m: name 'newdf' is not defined"
176
  ]
177
  }
178
+ ],
179
+ "source": [
180
+ "epoch = 1\n",
181
+ "max_len = 128\n",
182
+ "batch_size = 5\n",
183
+ "\n",
184
+ "train_df, val_df = train_test_split(newdf, test_size=0.2, random_state=42) #splits the dataframe into 80% training data and 20% validation data\n"
185
  ]
186
  },
187
  {
188
  "cell_type": "code",
189
+ "execution_count": null,
190
+ "metadata": {
191
+ "id": "i80qLafpzWDh"
192
+ },
193
+ "outputs": [],
194
  "source": [
195
+ "class DS(Dataset): #this creates the dataset class\n",
196
  " def __init__(self, dataframe, max_len):\n",
197
+ " self.data = dataframe #takes in the dataframe from earlier\n",
198
  " self.max_len = max_len\n",
199
+ " self.text = dataframe.text #the text column of the dataframe\n",
200
+ " self.targets = self.data.labels \n",
201
  " self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
202
  " \n",
203
  " def __len__(self):\n",
 
207
  " text = str(self.text.iloc[index])\n",
208
  " text = \" \".join(text.split())\n",
209
  "\n",
210
+ " inputs = self.tokenizer.encode_plus( #tokenizes the cleaned-up text; ids, attention mask and token type ids are read from the result below\n",
211
  " text, None,\n",
212
  " add_special_tokens=True,\n",
213
  " max_length=self.max_len,\n",
 
216
  " ids = inputs['input_ids']\n",
217
  " mask = inputs['attention_mask']\n",
218
  " token_type_ids = inputs[\"token_type_ids\"]\n",
219
+ " return { #returns the sample at this index as a dict of tensors\n",
220
  " 'ids': torch.tensor(ids, dtype=torch.long),\n",
221
  " 'attention_mask': torch.tensor(mask, dtype=torch.long),\n",
222
  " 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n",
223
  " 'labels': torch.tensor(self.targets.iloc[index], dtype=torch.float)\n",
224
  " }\n"
225
+ ]
 
 
 
 
 
226
  },
227
  {
228
  "cell_type": "code",
229
+ "execution_count": null,
 
 
 
 
 
230
  "metadata": {
 
231
  "colab": {
232
  "base_uri": "https://localhost:8080/",
233
  "height": 201
234
  },
235
+ "id": "EMScGH58Poaw",
236
  "outputId": "de081257-fb6c-4c73-c54d-e88dd1e2603f"
237
  },
 
238
  "outputs": [
239
  {
 
240
  "ename": "NameError",
241
  "evalue": "ignored",
242
+ "output_type": "error",
243
  "traceback": [
244
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
245
  "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
 
247
  "\u001b[0;31mNameError\u001b[0m: name 'train_df' is not defined"
248
  ]
249
  }
250
+ ],
251
+ "source": [
252
+ "traindata = DS(train_df, max_len) #creates training dataset\n",
253
+ "validdata = DS(val_df, max_len) #creates validation dataset\n",
254
+ "train_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True) #loads the dataset into dataloader\n",
255
+ "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n"
256
  ]
257
  },
258
  {
259
  "cell_type": "code",
260
+ "execution_count": null,
261
  "metadata": {
 
262
  "colab": {
263
  "base_uri": "https://localhost:8080/",
264
  "height": 235
265
  },
266
+ "id": "fb9-Yr9YDZqo",
267
  "outputId": "0664e5d0-55cb-4b58-e75a-b9acdab82e73"
268
  },
 
 
 
 
 
 
 
 
 
 
269
  "outputs": [
270
  {
 
271
  "ename": "NameError",
272
  "evalue": "ignored",
273
+ "output_type": "error",
274
  "traceback": [
275
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
276
  "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
 
278
  "\u001b[0;31mNameError\u001b[0m: name 'torch' is not defined"
279
  ]
280
  }
281
+ ],
282
+ "source": [
283
+ "device = torch.device('cuda')\n",
284
+ "\n",
285
+ "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type=\"multi_label_classification\")\n",
286
+ "model.to(device)\n",
287
+ "model.train() #puts the model in training mode; the actual training happens in the loop in the next cell\n",
288
+ "\n",
289
+ "optimizer = AdamW(model.parameters(), lr=5e-5)\n"
290
  ]
291
  },
292
  {
293
  "cell_type": "code",
294
+ "execution_count": null,
295
+ "metadata": {
296
+ "id": "mtMhE5_z8kw8"
297
+ },
298
+ "outputs": [],
299
  "source": [
300
  "for i in range(epoch):\n",
301
  " for batch in train_loader:\n",
 
310
  " loss.backward()\n",
311
  " optimizer.step()\n",
312
  "model.eval()\n"
313
+ ]
 
 
 
 
 
314
  },
315
  {
316
  "cell_type": "code",
317
+ "execution_count": null,
318
+ "metadata": {
319
+ "id": "8T4UG8K8BvUn"
320
+ },
321
+ "outputs": [],
322
  "source": [
323
  "xtrain = [\"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!\"]\n",
324
  "batch = tokenizer(xtrain, truncation=True, padding='max_length', return_tensors=\"pt\").to(device)\n",
 
328
  " results = torch.sigmoid(outputs.logits)*100\n",
329
  " print(results)\n",
330
  "\n",
331
+ "model.save_pretrained(\"pretrained_model\") #saves the trained model\n",
332
  "tokenizer.save_pretrained(\"model_tokenizer\")"
333
+ ]
334
+ }
335
+ ],
336
+ "metadata": {
337
+ "colab": {
338
+ "provenance": []
339
+ },
340
+ "kernelspec": {
341
+ "display_name": "Python 3",
342
+ "name": "python3"
343
+ },
344
+ "language_info": {
345
+ "name": "python"
346
  }
347
+ },
348
+ "nbformat": 4,
349
+ "nbformat_minor": 0
350
+ }