Bryan-Az committed on
Commit
84c3487
1 Parent(s): 01b18c5

evaluated the model

Files changed (1)
  1. src/model_evaluation_v2.ipynb +372 -131
src/model_evaluation_v2.ipynb CHANGED
@@ -1,136 +1,377 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# **Evaluating the Recommendation Model**"
8
- ]
9
- },
10
- {
11
- "cell_type": "code",
12
- "execution_count": 1,
13
- "metadata": {},
14
- "outputs": [
15
  {
16
- "name": "stderr",
17
- "output_type": "stream",
18
- "text": [
19
- "/Users/mocha/miniconda3/envs/mamba/envs/neurobytes_music_recommender/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
20
- " from .autonotebook import tqdm as notebook_tqdm\n"
21
- ]
22
- }
23
- ],
24
- "source": [
25
- "import gradio as gr\n",
26
- "import torch\n",
27
- "import torch.nn as nn\n",
28
- "from joblib import load\n",
29
- "import sklearn"
30
- ]
31
- },
32
- {
33
- "cell_type": "code",
34
- "execution_count": 2,
35
- "metadata": {},
36
- "outputs": [],
37
- "source": [
38
- "# Define the same neural network model\n",
39
- "class ImprovedSongRecommender(nn.Module):\n",
40
- " def __init__(self, input_size, num_titles):\n",
41
- " super(ImprovedSongRecommender, self).__init__()\n",
42
- " self.fc1 = nn.Linear(input_size, 128)\n",
43
- " self.bn1 = nn.BatchNorm1d(128)\n",
44
- " self.fc2 = nn.Linear(128, 256)\n",
45
- " self.bn2 = nn.BatchNorm1d(256)\n",
46
- " self.fc3 = nn.Linear(256, 128)\n",
47
- " self.bn3 = nn.BatchNorm1d(128)\n",
48
- " self.output = nn.Linear(128, num_titles)\n",
49
- " self.dropout = nn.Dropout(0.5)\n",
50
- "\n",
51
- " def forward(self, x):\n",
52
- " x = torch.relu(self.bn1(self.fc1(x)))\n",
53
- " x = self.dropout(x)\n",
54
- " x = torch.relu(self.bn2(self.fc2(x)))\n",
55
- " x = self.dropout(x)\n",
56
- " x = torch.relu(self.bn3(self.fc3(x)))\n",
57
- " x = self.dropout(x)\n",
58
- " x = self.output(x)\n",
59
- " return x\n",
60
- "\n",
61
- "# Load the trained model\n",
62
- "model_path = \"../models/improved_model.pth\"\n",
63
- "num_unique_titles = 4855 "
64
- ]
65
- },
66
- {
67
- "cell_type": "code",
68
- "execution_count": null,
69
- "metadata": {},
70
- "outputs": [],
71
- "source": [
72
- "model = ImprovedSongRecommender(input_size=2, num_titles=num_unique_titles) \n",
73
- "model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))\n",
74
- "model.eval()"
75
- ]
76
- },
77
- {
78
- "cell_type": "code",
79
- "execution_count": 3,
80
- "metadata": {},
81
- "outputs": [
82
  {
83
- "name": "stderr",
84
- "output_type": "stream",
85
- "text": [
86
- "/Users/mocha/miniconda3/envs/mamba/envs/neurobytes_music_recommender/lib/python3.8/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator LabelEncoder from version 1.2.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
87
- "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
88
- " warnings.warn(\n",
89
- "/Users/mocha/miniconda3/envs/mamba/envs/neurobytes_music_recommender/lib/python3.8/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator MinMaxScaler from version 1.2.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
90
- "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
91
- " warnings.warn(\n"
92
- ]
93
  }
94
- ],
95
- "source": [
96
- "# Load the label encoders and scaler\n",
97
- "label_encoders_path = \"data/new_label_encoders.joblib\"\n",
98
- "scaler_path = \"data/new_scaler.joblib\"\n",
99
- "\n",
100
- "label_encoders = load(label_encoders_path)\n",
101
- "scaler = load(scaler_path)\n",
102
- "\n",
103
- "# Create a mapping from encoded indices to actual song titles\n",
104
- "index_to_song_title = {index: title for index, title in enumerate(label_encoders['title'].classes_)}\n"
105
- ]
106
- },
107
- {
108
- "cell_type": "code",
109
- "execution_count": null,
110
- "metadata": {},
111
- "outputs": [],
112
- "source": []
113
- }
114
- ],
115
- "metadata": {
116
- "kernelspec": {
117
- "display_name": "base",
118
- "language": "python",
119
- "name": "python3"
120
  },
121
- "language_info": {
122
- "codemirror_mode": {
123
- "name": "ipython",
124
- "version": 3
125
- },
126
- "file_extension": ".py",
127
- "mimetype": "text/x-python",
128
- "name": "python",
129
- "nbconvert_exporter": "python",
130
- "pygments_lexer": "ipython3",
131
- "version": "3.8.1"
132
- }
133
- },
134
- "nbformat": 4,
135
- "nbformat_minor": 2
136
- }
 
1
  {
2
+ "cells": [
3
  {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "XeyJCRFOLOvg"
7
+ },
8
+ "source": [
9
+ "# **Evaluating the Recommendation Model**"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 305,
15
+ "metadata": {
16
+ "id": "EWiqFUizLOvh"
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "import gradio as gr\n",
21
+ "import torch\n",
22
+ "import torch.nn as nn\n",
23
+ "from joblib import load\n",
24
+ "import sklearn"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 306,
30
+ "metadata": {
31
+ "id": "egV9aaWzLOvk"
32
+ },
33
+ "outputs": [],
34
+ "source": [
35
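+ "# Load the user preference data used for evaluation (pandas reads the zipped CSV directly)\n",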
+ "user_preferences = pd.read_csv('user_preferences.zip')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 307,
41
+ "metadata": {
42
+ "id": "-7EqGsy7LOvj"
43
+ },
44
+ "outputs": [],
45
+ "source": [
46
+ "# Define the same neural network model\n",
47
+ "class ImprovedSongRecommender(nn.Module):\n",
48
+ " def __init__(self, input_size, num_titles):\n",
49
+ " super(ImprovedSongRecommender, self).__init__()\n",
50
+ " self.fc1 = nn.Linear(input_size, 128)\n",
51
+ " self.bn1 = nn.BatchNorm1d(128)\n",
52
+ " self.fc2 = nn.Linear(128, 256)\n",
53
+ " self.bn2 = nn.BatchNorm1d(256)\n",
54
+ " self.fc3 = nn.Linear(256, 128)\n",
55
+ " self.bn3 = nn.BatchNorm1d(128)\n",
56
+ " self.output = nn.Linear(128, num_titles)\n",
57
+ " self.dropout = nn.Dropout(0.5)\n",
58
+ "\n",
59
+ " def forward(self, x):\n",
60
+ " x = torch.relu(self.bn1(self.fc1(x)))\n",
61
+ " x = self.dropout(x)\n",
62
+ " x = torch.relu(self.bn2(self.fc2(x)))\n",
63
+ " x = self.dropout(x)\n",
64
+ " x = torch.relu(self.bn3(self.fc3(x)))\n",
65
+ " x = self.dropout(x)\n",
66
+ " x = self.output(x)\n",
67
+ " return x\n",
68
+ "\n",
69
+ "# Load the trained model\n",
70
+ "model_path = \"improved_model.pth\"\n",
71
+ "num_unique_titles = 4855"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 308,
77
+ "metadata": {
78
+ "colab": {
79
+ "base_uri": "https://localhost:8080/"
80
+ },
81
+ "id": "WnWXqoEeLOvk",
82
+ "outputId": "bc9d2c9a-6e8c-40b8-8cff-303d23b38cbd"
83
+ },
84
+ "outputs": [
85
+ {
86
+ "output_type": "execute_result",
87
+ "data": {
88
+ "text/plain": [
89
+ "ImprovedSongRecommender(\n",
90
+ " (fc1): Linear(in_features=2, out_features=128, bias=True)\n",
91
+ " (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
92
+ " (fc2): Linear(in_features=128, out_features=256, bias=True)\n",
93
+ " (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
94
+ " (fc3): Linear(in_features=256, out_features=128, bias=True)\n",
95
+ " (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
96
+ " (output): Linear(in_features=128, out_features=4855, bias=True)\n",
97
+ " (dropout): Dropout(p=0.5, inplace=False)\n",
98
+ ")"
99
+ ]
100
+ },
101
+ "metadata": {},
102
+ "execution_count": 308
103
+ }
104
+ ],
105
+ "source": [
106
+ "model = ImprovedSongRecommender(input_size=2, num_titles=num_unique_titles)\n",
107
+ "model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))\n",
108
+ "model.eval()"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 309,
114
+ "metadata": {
115
+ "id": "s5acd8QeLOvk"
116
+ },
117
+ "outputs": [],
118
+ "source": [
119
+ "# Load the label encoders and scaler\n",
120
+ "label_encoders_path = \"new_label_encoders.joblib\"\n",
121
+ "scaler_path = \"new_scaler.joblib\"\n",
122
+ "\n",
123
+ "label_encoders = load(label_encoders_path)\n",
124
+ "scaler = load(scaler_path)\n",
125
+ "\n",
126
+ "# Create a mapping from encoded indices to actual song titles\n",
127
+ "index_to_song_title = {index: title for index, title in enumerate(label_encoders['title'].classes_)}\n"
128
+ ]
129
+ },
130
  {
131
+ "cell_type": "code",
132
+ "source": [
133
+ "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
134
+ "import joblib\n",
135
+ "import re\n",
136
+ "\n",
137
+ "# Function to clean tags and artist names\n",
138
+ "def clean_text(text):\n",
139
+ " # Convert to lowercase\n",
140
+ " text = text.lower()\n",
141
+ " # Remove special characters and digits\n",
142
+ " text = re.sub(r'[^a-zA-Z\\s]', '', text)\n",
143
+ " # Remove extra white spaces\n",
144
+ " text = re.sub(r'\\s+', ' ', text).strip()\n",
145
+ " return text\n",
146
+ "\n",
147
+ "columns_to_check = ['tags', 'artist', 'tags', 'song', 'listeners', 'playcount'] # Specify the columns you want to check for NaN values\n",
148
+ "user_preferences = user_preferences.dropna(subset=columns_to_check)\n",
149
+ "\n",
150
+ "\n",
151
+ "# Clean 'tags' and 'artist_name' columns\n",
152
+ "user_preferences['tags'] = user_preferences['tags'].apply(clean_text)\n",
153
+ "user_preferences['artist'] = user_preferences['artist'].apply(clean_text)\n",
154
+ "\n",
155
+ "def label_encode_data(df):\n",
156
+ " df = df.copy(deep=True)\n",
157
+ " label_encoders = {}\n",
158
+ " unknown_label = 'unknown' # Define an unknown label\n",
159
+ "\n",
160
+ " for column in ['tags', 'song', 'artist']:\n",
161
+ " le = LabelEncoder()\n",
162
+ " unique_categories = df[column].unique().tolist()\n",
163
+ " unique_categories.append(unknown_label)\n",
164
+ " le.fit(unique_categories)\n",
165
+ " df[column] = le.transform(df[column].astype(str))\n",
166
+ " label_encoders[column] = le\n",
167
+ "\n",
168
+ " return df, label_encoders\n",
169
+ "\n",
170
+ "# Normalize numerical features\n",
171
+ "scaler = MinMaxScaler()\n",
172
+ "user_preferences[['listeners', 'playcount']] = scaler.fit_transform(user_preferences[['listeners', 'playcount']])\n",
173
+ "\n",
174
+ "# Label encode categorical features\n",
175
+ "df_scaled, label_encoders = label_encode_data(user_preferences.loc[:, ['tags', 'artist', 'listeners', 'playcount', 'song']])"
176
+ ],
177
+ "metadata": {
178
+ "colab": {
179
+ "base_uri": "https://localhost:8080/"
180
+ },
181
+ "id": "qeuVdOrZMX2H",
182
+ "outputId": "3e38f50d-a6fe-4ec4-eafe-c119ef4228fe"
183
+ },
184
+ "execution_count": 310,
185
+ "outputs": [
186
+ {
187
+ "output_type": "stream",
188
+ "name": "stderr",
189
+ "text": [
190
+ "<ipython-input-310-b2dbd9207146>:20: SettingWithCopyWarning: \n",
191
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
192
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
193
+ "\n",
194
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
195
+ " user_preferences['tags'] = user_preferences['tags'].apply(clean_text)\n",
196
+ "<ipython-input-310-b2dbd9207146>:21: SettingWithCopyWarning: \n",
197
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
198
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
199
+ "\n",
200
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
201
+ " user_preferences['artist'] = user_preferences['artist'].apply(clean_text)\n",
202
+ "<ipython-input-310-b2dbd9207146>:40: SettingWithCopyWarning: \n",
203
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
204
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
205
+ "\n",
206
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
207
+ " user_preferences[['listeners', 'playcount']] = scaler.fit_transform(user_preferences[['listeners', 'playcount']])\n"
208
+ ]
209
+ }
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "source": [
215
+ "from sklearn.model_selection import train_test_split"
216
+ ],
217
+ "metadata": {
218
+ "id": "f8Z0xtfCOWkC"
219
+ },
220
+ "execution_count": 311,
221
+ "outputs": []
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "source": [
226
+ "# Split data into features and target\n",
227
+ "X = df_scaled[['tags', 'artist']]\n",
228
+ "y = df_scaled['song']\n",
229
+ "\n",
230
+ "# Split the dataset into training and testing sets\n",
231
+ "X_valid, X_test, y_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
232
+ "print(\"Data split into validation and testing sets.\")"
233
+ ],
234
+ "metadata": {
235
+ "colab": {
236
+ "base_uri": "https://localhost:8080/"
237
+ },
238
+ "id": "tuyHessoL9AS",
239
+ "outputId": "9af89ed4-5ce3-423a-a60e-e6c012b35421"
240
+ },
241
+ "execution_count": 312,
242
+ "outputs": [
243
+ {
244
+ "output_type": "stream",
245
+ "name": "stdout",
246
+ "text": [
247
+ "Data split into validation and testing sets.\n"
248
+ ]
249
+ }
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "source": [
255
+ "import torch\n",
256
+ "import torch.nn as nn\n",
257
+ "import torch.optim as optim\n",
258
+ "from torch.utils.data import DataLoader\n",
259
+ "import numpy as np\n",
260
+ "from sklearn.metrics import accuracy_score"
261
+ ],
262
+ "metadata": {
263
+ "id": "YO3SpUROPRIL"
264
+ },
265
+ "execution_count": 313,
266
+ "outputs": []
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "source": [
271
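+ "# Wrap the validation and test splits in per-sample DataLoaders (batch_size=1)\n",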
+ "valid_loader = DataLoader(list(zip(X_valid.values.astype(float), y_valid)), batch_size=1, shuffle=True)\n",
272
+ "test_loader = DataLoader(list(zip(X_test.values.astype(float), y_test)), batch_size=1, shuffle=False)\n"
273
+ ],
274
+ "metadata": {
275
+ "id": "ddLMncl-Paj5"
276
+ },
277
+ "execution_count": 314,
278
+ "outputs": []
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "source": [
283
+ "valid_accuracy = 0\n",
284
+ "test_accuracy = 0\n",
285
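+ "# Accumulate per-sample accuracy (0 or 1 per example, since batch_size=1) over the validation split\n",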
+ "for features, labels in valid_loader:\n",
286
+ " preds = model(features.float().detach())\n",
287
+ "\n",
288
+ " # Get the predicted class (the one with the highest score)\n",
289
+ " _, predicted_class = torch.max(preds, 1)\n",
290
+ "\n",
291
+ " # Convert to numpy arrays\n",
292
+ " predicted_class_np = predicted_class.numpy()\n",
293
+ " labels_np = labels.numpy()\n",
294
+ "\n",
295
+ " # Calculate accuracy\n",
296
+ " accuracy = accuracy_score(labels_np, predicted_class_np)\n",
297
+ " valid_accuracy += accuracy\n",
298
+ "\n",
299
+ "for features, labels in test_loader:\n",
300
+ " preds = model(features.float())\n",
301
+ " # Get the predicted class (the one with the highest score)\n",
302
+ " _, predicted_class = torch.max(preds, 1)\n",
303
+ "\n",
304
+ " # Convert to numpy arrays\n",
305
+ " predicted_class_np = predicted_class.numpy()\n",
306
+ " labels_np = labels.numpy()\n",
307
+ "\n",
308
+ " # Calculate accuracy\n",
309
+ " accuracy = accuracy_score(labels_np, predicted_class_np)\n",
310
+ " test_accuracy += accuracy"
311
+ ],
312
+ "metadata": {
313
+ "id": "CIH4yNETOR6r"
314
+ },
315
+ "execution_count": 315,
316
+ "outputs": []
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "source": [
321
+ "print('The loss of the model on the unseen validation dataset is: ', valid_accuracy)\n",
322
+ "print('The loss of the model on the unseen test dataset is: ', test_accuracy)"
323
+ ],
324
+ "metadata": {
325
+ "colab": {
326
+ "base_uri": "https://localhost:8080/"
327
+ },
328
+ "id": "Tf5kf1dMOpdw",
329
+ "outputId": "5377af95-5412-4593-e4b7-c74ec03425a0"
330
+ },
331
+ "execution_count": 316,
332
+ "outputs": [
333
+ {
334
+ "output_type": "stream",
335
+ "name": "stdout",
336
+ "text": [
337
+ "The loss of the model on the unseen validation dataset is: 2.0\n",
338
+ "The loss of the model on the unseen test dataset is: 0.0\n"
339
+ ]
340
+ }
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "source": [],
346
+ "metadata": {
347
+ "id": "TYbj1lHYQZtg"
348
+ },
349
+ "execution_count": 316,
350
+ "outputs": []
351
+ }
352
+ ],
353
+ "metadata": {
354
+ "kernelspec": {
355
+ "display_name": "base",
356
+ "language": "python",
357
+ "name": "python3"
358
+ },
359
+ "language_info": {
360
+ "codemirror_mode": {
361
+ "name": "ipython",
362
+ "version": 3
363
+ },
364
+ "file_extension": ".py",
365
+ "mimetype": "text/x-python",
366
+ "name": "python",
367
+ "nbconvert_exporter": "python",
368
+ "pygments_lexer": "ipython3",
369
+ "version": "3.8.1"
370
+ },
371
+ "colab": {
372
+ "provenance": []
373
  }
374
  },
375
+ "nbformat": 4,
376
+ "nbformat_minor": 0
377
+ }