pea-movie2moviev2

Runtime error

App Files Files Community

Kamand committed on Apr 23, 2022

Commit

d170d9a

•

1 Parent(s): 6631f1c

Create app.py

Browse files

Files changed (1) hide show

app.py +482 -0

app.py ADDED Viewed

	@@ -0,0 +1,482 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[2]:
+get_ipython().run_line_magic('matplotlib', 'inline')
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import gradio as gr
+from scipy import stats
+from ast import literal_eval
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
+from nltk.stem.snowball import SnowballStemmer
+from nltk.stem.wordnet import WordNetLemmatizer
+from nltk.corpus import wordnet
+from surprise import Reader, Dataset, SVD
+import warnings; warnings.simplefilter('ignore')
+import surprise
+# In[3]:
+# Directory holding the dataset CSV files. This is an absolute path on a
+# Windows drive, so it only resolves on a machine that has this folder.
+path = 'C:/HW/Spring 2022/Deep learning/Project/all csvs'
+# In[4]:
+# Load the movie metadata table into `md`.
+md = pd.read_csv(path+'/movies_metadata.csv')
+# Notebook preview; the result is discarded when run as a script.
+md.head(2)
+# --- Simple recommender system ---
+#
+# In[5]:
+# Replace missing genres with the string '[]', parse each cell with
+# literal_eval, then keep only the 'name' field of every entry. Parsed values
+# that are not lists become empty lists.
+md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
+# Weighted Rating (WR) = (v / (v + m)) * R + (m / (v + m)) * C
+#
+# where
+#
+# [1] v is the number of votes for the movie
+# [2] m is the minimum number of votes required to be listed in the chart
+# [3] R is the average rating of the movie
+# [4] C is the mean vote across the whole report
+# In[6]:
+# Vote counts and vote averages of the rows where the value is present, cast to int.
+vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype(int)
+# NOTE(review): astype(int) drops the fractional part of every average before
+# the mean is taken - confirm that this truncation is intended.
+vote_average = md[md['vote_average'].notnull()]['vote_average'].astype(int)
+# C: mean of the integer-cast vote averages.
+C = np.mean(vote_average)
+# m: 95th percentile of the vote counts, used as the minimum-votes threshold.
+m = vote_counts.quantile(0.95)
+print('The average rating for these movies is: ',C)
+print('The minimum votes required to be listed in the chart: ',m)
+# In[7]:
+# Keeping the year from the date
+md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
+# In[8]:
+# Notebook inspection of the popularity column; no effect when run as a script.
+md['popularity']
+# In[9]:
+# Movies that have both a vote count and a vote average and whose vote count
+# reaches the threshold m, restricted to the columns used for the chart.
+qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
+# In[10]:
+# Cast both vote columns to int before scoring.
+qualified['vote_count'] = qualified['vote_count'].astype(int)
+qualified['vote_average'] = qualified['vote_average'].astype(int)
+# Notebook inspection of the resulting table size.
+qualified.shape
+# In[11]:
+def weighted_rating(x):
+    v = x['vote_count']
+    R = x['vote_average']
+    return (v/(v+m) * R) + (m/(m+v) * C)
+# In[12]:
+# Score every qualified movie and keep the 250 highest weighted ratings.
+qualified['wr'] = qualified.apply(weighted_rating, axis=1)
+qualified = qualified.sort_values('wr',ascending = False).head(250)
+# In[13]:
+# Build gen_md with one row per (movie, genre) pair: expand each genres list
+# into a Series, stack the values into a single 'genre' column keyed by the
+# movie's row index, and join that back onto md without the list column.
+s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
+s.name = 'genre'
+gen_md = md.drop('genres', axis=1).join(s)
+# In[14]:
+def build_chart(genre, percentile=0.85):
+    df = gen_md[gen_md['genre'] == genre] # Getting gen_md for specific genres
+    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
+    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
+    C = vote_averages.mean()
+    m = vote_counts.quantile(percentile)
+    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
+    qualified['vote_count'] = qualified['vote_count'].astype('int')
+    qualified['vote_average'] = qualified['vote_average'].astype('int')
+    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
+    qualified = qualified.sort_values('wr', ascending=False).head(250)
+    return qualified
+# In[15]:
+# Notebook preview of the Romance chart; the result is discarded in a script.
+build_chart('Romance')
+# --- Content-based recommender / filtering ---
+#
+# This section personalizes the movie recommendations with content-based recommenders built on:
+#
+# - movie overviews and taglines
+# - movie cast, crew, keywords and genre
+#
+# In[16]:
+# Load the link table and keep only its non-null tmdbId values as integers.
+links = pd.read_csv(path+'/links_small.csv')
+links = links[links['tmdbId'].notnull()]['tmdbId'].astype(int)
+# In[17]:
+# NOTE(review): presumably these three rows carry malformed 'id' values that
+# would break the integer cast below - confirm against the dataset.
+md = md.drop([19730, 29503, 35587])
+# In[18]:
+md['id'] = md['id'].astype('int')
+# In[19]:
+# Keep only the movies whose id appears in `links`.
+smd = md[md['id'].isin(links)]
+smd.shape
+# In[20]:
+# Description = overview followed by tagline. Missing taglines are blanked
+# first; any description that is still NaN afterwards is replaced by ''.
+smd['tagline'] = smd['tagline'].fillna('')
+smd['description'] = smd['overview'] + smd['tagline']
+smd['description'] = smd['description'].fillna('')
+# This is where things get exciting!
+#
+# [1] TfidfVectorizer converts a collection of raw documents to a matrix of
+#     TF-IDF (term frequency-inverse document frequency) features: how many
+#     times a word appears in a document, weighted by the inverse document
+#     frequency of the word across the set of documents.
+#
+# [2] ngram_range: all values of n such that min_n <= n <= max_n are used. For
+#     example, (1, 1) means only unigrams and (1, 2) means unigrams and
+#     bigrams, so both unigrams and bigrams are used here.
+#
+# [3] A 1-gram (unigram) is a one-word sequence. A 2-gram (bigram) is a
+#     two-word sequence such as "I love", "love reading" or "Analytics Vidhya".
+#     A 3-gram (trigram) is a three-word sequence such as "I love reading".
+# In[21]:
+# NOTE(review): min_df=0 - recent scikit-learn releases validate constructor
+# parameters; confirm that an integer 0 is still accepted.
+tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
+tfidf_matrix = tf.fit_transform(smd['description'])
+# In[22]:
+# Pairwise dot products of the TF-IDF rows, used below as the similarity matrix.
+cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
+# In[23]:
+# Renumber the rows 0..n-1 so they line up with the similarity matrix.
+smd = smd.reset_index()
+# In[24]:
+# titles: row position -> title; indices: title -> row position.
+titles = smd['title']
+indices = pd.Series(smd.index, index=smd['title'])
+# In[25]:
+tfidf_matrix.shape
+# In[34]:
+def get_recommendations(title):
+    if indices[title].shape ==():
+        idx = indices[title]
+    else:
+        idx = indices[title][0]
+    sim = cosine_sim
+    sim_scores = list(enumerate(sim[idx]))
+    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+    sim_scores = sim_scores[1:5]
+    title_idx= [l[0] for l in sim_scores]
+    title_rec = [titles[i] for i in title_idx]
+    return title_rec
+# In[28]:
+def greet(name):
+    return "Hello " + name + "!!"
+# In[27]:
+get_recommendations('The Dark Knight',cosine_sim)
+# --- Adding the metadata to the recommender system ---
+# In[42]:
+credits = pd.read_csv(path+'/credits.csv')
+keywords = pd.read_csv(path+'/keywords.csv')
+# In[43]:
+# Make the join key an integer in all three tables before merging.
+keywords['id'] = keywords['id'].astype('int')
+credits['id'] = credits['id'].astype('int')
+md['id'] = md['id'].astype('int')
+# In[44]:
+# Merge the credits and keywords tables into the metadata on 'id', then
+# rebuild the subset of movies whose id appears in `links`.
+md = md.merge(credits, on = 'id')
+md = md.merge(keywords, on = 'id')
+smd = md[md['id'].isin(links)]
+# In[45]:
+smd.shape
+# In[46]:
+# Parse the string-encoded cast, crew and keywords columns into Python
+# objects and record how many entries each cast and crew value has.
+smd['cast'] = smd['cast'].apply(literal_eval)
+smd['crew'] = smd['crew'].apply(literal_eval)
+smd['keywords'] = smd['keywords'].apply(literal_eval)
+smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
+smd['crew_size'] = smd['crew'].apply(lambda x: len(x))
+# In[47]:
+def get_director(x):
+    for i in x:
+        if i['job'] == 'Director':
+            return i['name']
+    return np.nan
+# In[48]:
+# Director name per movie, and the names of at most the first three cast entries.
+smd['director'] = smd['crew'].apply(get_director)
+smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
+smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x)>=3 else x)
+# In[49]:
+# Reduce each keyword entry to its 'name'.
+smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
+# In[50]:
+# Lower-case the cast names and remove spaces so each name becomes one token.
+smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
+# In[51]:
+# NOTE(review): astype('str') turns a missing director (NaN) into the text
+# 'nan', which then enters the soup like a real name - confirm this is intended.
+smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
+smd['director'] = smd['director'].apply(lambda x: [x,x, x])
+# The director is listed three times to give it more weight.
+# In[52]:
+# Count how often each keyword occurs across all movies and keep only the
+# keywords that occur more than once.
+s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
+s.name = 'keyword'
+s=s.value_counts()
+s = s[s>1]
+# In[53]:
+stemmer = SnowballStemmer('english')
+# In[54]:
+# Notebook check of the stemmer; the result is discarded in a script.
+stemmer.stem('')
+# In[55]:
+# Keep only the keywords present in the index of `s` (the retained keywords),
+# stem them, then lower-case and remove spaces.
+smd['keywords'] = smd['keywords'].apply(lambda x: [i for i in x if i in s])
+smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
+smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
+# In[56]:
+# "Soup": keywords, cast, director and genres concatenated per movie and
+# joined into one space-separated string.
+smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
+smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))
+# In[57]:
+# Token counts (unigrams and bigrams) of the soup strings.
+# NOTE(review): min_df = 0 - recent scikit-learn releases validate constructor
+# parameters; confirm that an integer 0 is still accepted.
+count = CountVectorizer(analyzer = 'word', ngram_range = (1,2), min_df = 0, stop_words = 'english')
+count_matrix = count.fit_transform(smd['soup'])
+# In[58]:
+cosine_sim2 = linear_kernel(count_matrix, count_matrix)
+# In[59]:
+# Renumber the rows of the merged table and rebuild the lookups:
+# titles: row position -> title; indices: title -> row position.
+smd = smd.reset_index()
+titles = smd['title']
+indices = pd.Series(smd.index, index=smd['title'])
+# In[45]:
+cosine_sim2.shape
+# In[60]:
+def get_recommendations(title,sim):
+    if indices[title].shape ==():
+        idx = indices[title]
+    else:
+        idx = indices[title][0]
+    sim_scores = list(enumerate(sim[idx]))
+    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+    sim_scores = sim_scores[1:31]
+    title_idx= [l[0] for l in sim_scores]
+    title_rec = [titles[i] for i in title_idx]
+    return title_rec
+# In[62]:
+get_recommendations('The Avengers',cosine_sim2)
+# This recommendation system works a lot better than the first, but it doesn't take popularity into account.
+# In[75]:
+def improved_recommendations(title):
+    idx = indices[title]
+    sim_scores = list(enumerate(cosine_sim2[idx]))
+    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
+    sim_scores = sim_scores[1:26]
+    movie_indices = [i[0] for i in sim_scores]
+    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
+    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
+    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
+    C = vote_averages.mean()
+    m = vote_counts.quantile(0.60)
+    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
+    qualified['vote_count'] = qualified['vote_count'].astype('int')
+    qualified['vote_average'] = qualified['vote_average'].astype('int')
+    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
+    qualified = qualified.sort_values('wr', ascending=False).head(10)
+    return list(qualified['title'])
+# In[76]:
+list(improved_recommendations('Mean Girls'))
+# In[81]:
+# Gradio UI: one text input wired to improved_recommendations, eight text outputs.
+# NOTE(review): improved_recommendations returns a list of up to ten titles
+# while eight output components are declared here - confirm the counts agree
+# at runtime.
+iface = gr.Interface(fn=improved_recommendations, title= "Enter movie title for recommendations",inputs="text", outputs=["text",'text','text','text',"text",'text','text','text'], examples = ['The Dark Knight', 'Mean Girls', 'Avatar','The Godfather', 'Top Gun', 'Toy Story'])
+# share=True requests a public share link in addition to the local server.
+iface.launch(share=True)
+# In[83]:
+get_ipython().system('git clone https://huggingface.co/spaces/Kamand/Movie_Recommendation')
+# In[ ]:
+get_ipython().system('git add app.py')
+get_ipython().system('git commit -m "Add application file"')
+get_ipython().system('git push')