Kamand committed on
Commit
d170d9a
1 Parent(s): 6631f1c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +482 -0
app.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[2]:
5
+
6
+
7
# `get_ipython` is only defined inside an IPython/Jupyter session.  When this
# file is executed as a plain script the name does not exist, so the
# inline-plot magic is skipped instead of aborting with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
8
+ import pandas as pd
9
+ import numpy as np
10
+ import matplotlib.pyplot as plt
11
+ import gradio as gr
12
+ from scipy import stats
13
+ from ast import literal_eval
14
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
15
+ from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
16
+ from nltk.stem.snowball import SnowballStemmer
17
+ from nltk.stem.wordnet import WordNetLemmatizer
18
+ from nltk.corpus import wordnet
19
+ from surprise import Reader, Dataset, SVD
20
+ import warnings; warnings.simplefilter('ignore')
21
+ import surprise
22
+
23
+
24
+ # In[3]:
25
+
26
+
27
# Load movies_metadata.csv and derive the globals used by the simple
# (chart-based) recommender: C, m, the `year` column and `qualified`.
import os

# Directory that holds the CSV inputs.  The original location stays the
# default; set the MOVIE_DATA_DIR environment variable to point elsewhere.
path = os.environ.get('MOVIE_DATA_DIR', 'C:/HW/Spring 2022/Deep learning/Project/all csvs')


# In[4]:


md = pd.read_csv(path+'/movies_metadata.csv')


# <b> Simple rec system </b>
#

# In[5]:


# Missing genres become '[]'; every entry is then parsed from its string
# form and reduced to the list of genre names.
md['genres'] = (
    md['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)


# Weighted Rating (WR) = (v / (v + m)) * R + (m / (v + m)) * C
#
# where,
#
# [1] v is the number of votes for the movie <br>
# [2] m is the minimum votes required to be listed in the chart <br>
# [3] R is the average rating of the movie <br>
# [4] C is the mean vote across the whole report <br>

# In[6]:


# NOTE(review): astype(int) truncates every rating to a whole number before
# the mean is taken; kept as-is because the rest of the file does the same.
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype(int)
vote_average = md[md['vote_average'].notnull()]['vote_average'].astype(int)

C = np.mean(vote_average)
m = vote_counts.quantile(0.95)

print('The average rating for these movies is: ',C)
print('The minimum votes required to be listed in the chart: ',m)


# In[7]:


# Keep only the year of the release date.  Dates that are missing or cannot
# be parsed are coerced to NaT and stored as NaN; comparing with `!= np.nan`
# is always True, so it cannot be used to detect them.
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)


# In[9]:


# Movies with at least m votes and a known rating.  The copy makes the
# column assignments below act on an independent frame rather than on a
# slice of md.
qualified = md[
    (md['vote_count'] >= m)
    & (md['vote_count'].notnull())
    & (md['vote_average'].notnull())
][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']].copy()


# In[10]:


qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)
95
+
96
+
97
+ # In[11]:
98
+
99
+
100
def weighted_rating(x, min_votes=None, mean_vote=None):
    """Return the weighted rating of one movie row.

    WR = (v / (v + m)) * R + (m / (v + m)) * C, where v is the row's
    'vote_count' and R its 'vote_average'.

    Args:
        x: Mapping-like row providing 'vote_count' and 'vote_average'.
        min_votes: Vote threshold m.  Defaults to the module-level ``m``.
        mean_vote: Baseline rating C.  Defaults to the module-level ``C``.

    Returns:
        The weighted rating as a number.
    """
    # Fall back to the globals only when no explicit value is supplied, so a
    # caller that computed its own threshold/baseline can actually use them.
    threshold = m if min_votes is None else min_votes
    baseline = C if mean_vote is None else mean_vote
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + threshold) * R) + (threshold / (threshold + v) * baseline)
104
+
105
+
106
+ # In[12]:
107
+
108
+
109
# Score every qualified movie and keep the 250 best as the overall chart.
qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr', ascending=False).head(250)


# In[13]:


# One row per (movie, genre) pair.  explode() repeats the movie's index label
# for each genre in its list; movies with an empty list yield NaN, which is
# dropped here and reappears as a missing 'genre' after the join.
s = md['genres'].explode().dropna()
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)
119
+
120
+
121
+ # In[14]:
122
+
123
+
124
def build_chart(genre, percentile=0.85, data=None):
    """Return the top movies of one genre ranked by weighted rating.

    Args:
        genre: Value matched against the 'genre' column.
        percentile: Quantile of the genre's vote counts used as the minimum
            number of votes (m) a movie needs to be listed.
        data: Frame with 'genre', 'title', 'year', 'vote_count',
            'vote_average' and 'popularity' columns.  Defaults to the
            module-level ``gen_md``.

    Returns:
        DataFrame with at most 250 rows, sorted by the added 'wr' column in
        descending order.  It is empty when no row matches ``genre``.
    """
    source = gen_md if data is None else data
    df = source[source['genre'] == genre]  # rows of the requested genre only

    vote_counts = df.loc[df['vote_count'].notnull(), 'vote_count'].astype('int')
    vote_averages = df.loc[df['vote_average'].notnull(), 'vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    listed = (
        (df['vote_count'] >= m)
        & df['vote_count'].notnull()
        & df['vote_average'].notnull()
    )
    # Copy so the assignments below never write into a slice of `source`.
    qualified = df.loc[listed, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Column arithmetic instead of a row-wise apply: same formula, and it
    # also works when `qualified` has no rows.
    v = qualified['vote_count']
    qualified['wr'] = v / (v + m) * qualified['vote_average'] + m / (m + v) * C

    return qualified.sort_values('wr', ascending=False).head(250)
141
+
142
+
143
+ # In[15]:
144
+
145
+
146
# Notebook demo call; the returned chart is discarded when run as a script.
build_chart('Romance')


# <b> Content Based Recommender / Filtering </b>
#
# In this section we personalize the movie recommendations. The content-based
# recommenders are built on:
#
# Movie Overviews and Taglines <br>
# Movie Cast, Crew, Keywords and Genre
#

# In[16]:


# After this step `links` is the Series of integer tmdbId values.
links = pd.read_csv(path+'/links_small.csv')
links = links[links['tmdbId'].notnull()]['tmdbId'].astype(int)


# In[17]:


# NOTE(review): hard-coded row labels -- presumably the rows whose `id`
# cannot be cast to int below; confirm against the CSV before reusing this
# with another copy of the data.
md = md.drop([19730, 29503, 35587])


# In[18]:


md['id'] = md['id'].astype('int')


# In[19]:


# Getting the movies whose IDs exist in "links".  The copy makes the column
# assignments below act on an independent frame rather than on a slice of md.
smd = md[md['id'].isin(links)].copy()


# In[20]:


# Fill both parts before concatenating: NaN + str is NaN, which would throw
# away the tagline of every movie without an overview.  The separating space
# keeps the last overview word and the first tagline word from fusing.
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = (smd['overview'].fillna('') + ' ' + smd['tagline']).str.strip()


# <b><font size="3">This is where things get exciting!</font></b>
#
# [1] Convert a collection of raw documents to a matrix of TF-IDF features -- TF-IDF: term frequency–inverse document frequency <br>
# <b>Term frequency is how many times a word appears in a document; inverse document frequency down-weights words that appear in many documents.</b> <br>
#
# [2] ngram_range: all values of n such that min_n <= n <= max_n will be used. For example, an ngram_range of (1, 1) means only unigrams and (1, 2) means unigrams and bigrams, so we're using both unigrams and bigrams <br>
#
# [3] A 1-gram (or unigram) is a one-word sequence. A 2-gram (or bigram) is a two-word sequence, like “I love”, “love reading”, or “Analytics Vidhya”. A 3-gram (or trigram) is a three-word sequence, like “I love reading”

# In[21]:


# min_df=1 keeps every term, exactly as min_df=0 was meant to; recent
# scikit-learn releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])


# In[22]:


# Pairwise dot products of the TF-IDF rows.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


# In[23]:


# After the reset, row labels equal row positions in cosine_sim.
smd = smd.reset_index()


# In[24]:


titles = smd['title']
# Title -> row position lookup; a title shared by several movies maps to
# several positions.
indices = pd.Series(smd.index, index=smd['title'])
231
+
232
+
233
+ # In[34]:
234
+
235
+
236
def get_recommendations(title, sim=None):
    """Return the titles of the 4 movies most similar to ``title``.

    Args:
        title: Movie title; must be a label of the module-level ``indices``.
        sim: Square similarity matrix indexed by row position.  Defaults to
            the module-level ``cosine_sim``; accepting it keeps calls that
            pass the matrix as a second argument working.

    Returns:
        List of up to 4 titles, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    if sim is None:
        sim = cosine_sim

    # A title shared by several movies yields a Series of positions; use the
    # first one.  `.iloc` is positional -- plain `[0]` would be looked up as
    # a label on the title-labelled Series.
    matches = indices[title]
    idx = matches.iloc[0] if isinstance(matches, pd.Series) else matches

    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position instead of assuming it sorts first.
    nearest = [pos for pos, _ in ranked if pos != idx][:4]
    return [titles[pos] for pos in nearest]
248
+
249
+
250
+ # In[28]:
251
+
252
+
253
def greet(name):
    """Return the greeting 'Hello <name>!!' for the given name string."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
255
+
256
+
257
+ # In[27]:
258
+
259
+
260
# Notebook demo call against the description-based matrix (cosine_sim),
# which get_recommendations above uses when it is given only a title.  The
# returned list is discarded when run as a script.
get_recommendations('The Dark Knight')


# <b> <font size="3"> Adding the metadata to the rec system </font> </b>

# In[42]:


credits = pd.read_csv(path+'/credits.csv')
keywords = pd.read_csv(path+'/keywords.csv')


# In[43]:


# The merge keys must share one dtype across all three frames.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')


# In[44]:


md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
# Copy so the column assignments below act on an independent frame rather
# than on a slice of md.
smd = md[md['id'].isin(links)].copy()


# In[46]:


# cast, crew and keywords are parsed from their string form with
# literal_eval before their lengths are taken.
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(len)
smd['crew_size'] = smd['crew'].apply(len)
302
+
303
+
304
+ # In[47]:
305
+
306
+
307
def get_director(x):
    """Return the 'name' of the first entry in ``x`` whose 'job' is 'Director'.

    ``x`` is an iterable of dicts with 'job' and 'name' keys.  When no entry
    has the job 'Director', ``np.nan`` is returned.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, np.nan)
312
+
313
+
314
+ # In[48]:
315
+
316
+
317
# Build one "soup" string per movie from keywords, cast, director and genres,
# then compare the movies by the cosine similarity of their token counts.
smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# Only the first three cast members are kept.
smd['cast'] = smd['cast'].apply(lambda x: x[:3])


# In[49]:


smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


# In[50]:


# Lower-case and remove spaces so each full name becomes a single token.
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])


# In[51]:


# The director is mentioned 3 times to give it more weight.  Movies without
# a director (NaN from get_director) contribute no token at all; casting NaN
# to str would add the token 'nan' three times and make every such movie
# look related to the others.
smd['director'] = smd['director'].apply(
    lambda x: [str.lower(x.replace(" ", ""))] * 3 if isinstance(x, str) else []
)


# In[52]:


# Number of occurrences of each keyword; only keywords used more than once
# are kept.
s = smd['keywords'].explode().dropna()
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]


# In[53]:


stemmer = SnowballStemmer('english')


# In[55]:


# The keywords are the index labels of `s`; a set makes that membership test
# explicit and constant-time.
frequent_keywords = set(s.index)
smd['keywords'] = smd['keywords'].apply(lambda x: [i for i in x if i in frequent_keywords])
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])


# In[56]:


smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))


# In[57]:


# min_df=1 keeps every term, exactly as min_df=0 was meant to; recent
# scikit-learn releases reject an integer min_df below 1.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])


# In[58]:


# Raw counts are not length-normalised, so a plain dot product
# (linear_kernel) would favour movies with long soups; cosine_similarity
# normalises each row first.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


# In[59]:


# After the reset, row labels equal row positions in cosine_sim2.
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
403
+
404
+
405
+ # In[60]:
406
+
407
+
408
def get_recommendations(title, sim):
    """Return the titles of the 30 movies most similar to ``title``.

    Args:
        title: Movie title; must be a label of the module-level ``indices``.
        sim: Square similarity matrix indexed by row position.

    Returns:
        List of up to 30 titles, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # A title shared by several movies yields a Series of positions; use the
    # first one.  `.iloc` is positional -- plain `[0]` would be looked up as
    # a label on the title-labelled Series.
    matches = indices[title]
    idx = matches.iloc[0] if isinstance(matches, pd.Series) else matches

    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position instead of assuming it sorts first.
    nearest = [pos for pos, _ in ranked if pos != idx][:30]
    return [titles[pos] for pos in nearest]
419
+
420
+
421
+ # In[62]:
422
+
423
+
424
# Notebook demo call using the metadata-based matrix cosine_sim2; the returned list of titles is discarded when run as a script.
+ get_recommendations('The Avengers',cosine_sim2)
425
+
426
+
427
+ # <font size="3"> This recommendation system works a lot better than the first, but it doesn't take popularity into account. </font>
428
+
429
+ # In[75]:
430
+
431
+
432
def improved_recommendations(title):
    """Return up to 10 similar movies re-ranked by weighted rating.

    The 25 movies most similar to ``title`` according to ``cosine_sim2`` are
    taken as candidates.  Candidates with at least m votes (m = 60th
    percentile of the candidates' vote counts) are scored with

        WR = (v / (v + m)) * R + (m / (v + m)) * C

    where C is the mean rating of the candidates, and the best 10 are
    returned.

    Args:
        title: Movie title; must be a label of the module-level ``indices``.

    Returns:
        List of at most 10 titles, highest weighted rating first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # A title shared by several movies yields a Series of positions; use the
    # first one so that cosine_sim2[idx] is a single row.
    matches = indices[title]
    idx = matches.iloc[0] if isinstance(matches, pd.Series) else matches

    ranked = sorted(enumerate(cosine_sim2[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie by position instead of assuming it sorts first.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:25]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies.loc[movies['vote_count'].notnull(), 'vote_count'].astype('int')
    vote_averages = movies.loc[movies['vote_average'].notnull(), 'vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)

    listed = (
        (movies['vote_count'] >= m)
        & movies['vote_count'].notnull()
        & movies['vote_average'].notnull()
    )
    qualified = movies[listed].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Score with the C and m computed above.  The module-level
    # weighted_rating(x) reads the global C and m, which would leave these
    # local values unused.
    v = qualified['vote_count']
    qualified['wr'] = v / (v + m) * qualified['vote_average'] + m / (m + v) * C

    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return list(qualified['title'])
450
+
451
+
452
+ # In[76]:
453
+
454
+
455
# Notebook demo call; the result is discarded when run as a script.
list(improved_recommendations('Mean Girls'))


# In[81]:


def _recommendations_as_text(title):
    """Return the recommendations for ``title`` as one title per line."""
    return '\n'.join(improved_recommendations(title))


# improved_recommendations returns a list of varying length (at most 10
# titles), which cannot be mapped onto a fixed row of 8 text outputs.  A
# single text output fed by the joined titles works for any length.
iface = gr.Interface(
    fn=_recommendations_as_text,
    title="Enter movie title for recommendations",
    inputs="text",
    outputs="text",
    examples=['The Dark Knight', 'Mean Girls', 'Avatar', 'The Godfather', 'Top Gun', 'Toy Story'],
)
iface.launch(share=True)


# In[83]:


# The deployment commands below use IPython's shell escape.  `get_ipython`
# does not exist when the file runs as a plain script, so they are executed
# only inside an IPython/Jupyter session.
try:
    _shell = get_ipython()
except NameError:
    _shell = None

if _shell is not None:
    _shell.system('git clone https://huggingface.co/spaces/Kamand/Movie_Recommendation')

    # In[ ]:

    _shell.system('git add app.py')
    _shell.system('git commit -m "Add application file"')
    _shell.system('git push')
477
+
478
+
479
+
480
+
481
+
482
+