#!/usr/bin/env python
# coding: utf-8

# In[2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gradio as gr
#from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVD
import warnings; warnings.simplefilter('ignore')
#import surprise

# In[3]:

path = '.'

# In[4]:

md = pd.read_csv(path + '/movies_metadata.csv')
md.head(2)

# Simple recommender system

# In[5]:

# 'genres' is stored as a stringified list of dicts; fillna replaces NaN with '[]',
# literal_eval parses the string, and the lambda keeps just the genre names.
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
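# In[ ]:

# Illustrative aside (not in the original notebook): literal_eval turns the
# stringified list stored in the CSV into real Python objects. The example
# string below is made up to mirror the dataset's format.
raw = "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}]"
parsed = literal_eval(raw)           # -> list of dicts
print([g['name'] for g in parsed])   # -> ['Animation', 'Comedy']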
# Weighted Rating (WR) = (v/(v+m))*R + (m/(v+m))*C
#
# where,
#
# [1] v is the number of votes for the movie
# [2] m is the minimum number of votes required to be listed in the chart
# [3] R is the average rating of the movie
# [4] C is the mean vote across the whole report
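# In[ ]:

# Quick sanity check (illustrative, with made-up numbers): the weighted rating
# shrinks a movie's average R toward the global mean C; the fewer votes v it
# has relative to the cutoff m, the stronger the shrinkage.
v, R = 1000, 8.0             # hypothetical vote count and average rating
C_demo, m_demo = 5.6, 434.0  # hypothetical global mean vote and vote cutoff
wr = (v / (v + m_demo)) * R + (m_demo / (m_demo + v)) * C_demo
print(round(wr, 2))  # 7.27 -- pulled down from 8.0 toward 5.6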
# In[6]:

vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype(int)
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype(int)
C = np.mean(vote_averages)
m = vote_counts.quantile(0.95)
print('The mean vote across the whole report is:', C)
print('The minimum number of votes required to be listed in the chart:', m)

# In[7]:

# Keep only the year from the release date.
# (The original `x != np.nan` test is always True, since NaN never compares
# equal to anything; pd.notnull is the correct check.)
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan)

# In[8]:

md['popularity']

# In[9]:

qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]

# In[10]:

qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)
qualified.shape

# In[11]:

def weighted_rating(x):
    # Uses the globals m (vote cutoff) and C (mean vote) computed above.
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

# In[12]:

qualified['wr'] = qualified.apply(weighted_rating, axis=1)
qualified = qualified.sort_values('wr', ascending=False).head(250)

# In[13]:

# Explode the genre lists so each (movie, genre) pair gets its own row.
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

# In[14]:

def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]  # restrict gen_md to one genre
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    qualified['wr'] = qualified.apply(lambda x: (x['vote_count'] / (x['vote_count'] + m) * x['vote_average']) + (m / (m + x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)

    return qualified

# In[15]:

build_chart('Romance')
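# In[ ]:

# Illustrative usage (not in the original notebook): a stricter cutoff for a
# different genre. percentile=0.90 keeps only movies in the top 10% by vote
# count within the genre before ranking them by weighted rating.
build_chart('Comedy', percentile=0.90).head(5)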
# Content-Based Recommender / Filtering
#
# In this section we personalize the movie recommendations. The content-based recommenders are built on:
#
# Movie Overviews and Taglines
# Movie Cast, Crew, Keywords and Genres

# In[16]:

links = pd.read_csv(path + '/links_small.csv')
links = links[links['tmdbId'].notnull()]['tmdbId'].astype(int)

# In[17]:

# Drop three rows whose 'id' field is malformed in the raw CSV (they contain
# dates instead of integer IDs), so the astype('int') below succeeds.
md = md.drop([19730, 29503, 35587])
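# In[ ]:

# Robustness sketch (illustrative, not the original approach): find malformed
# IDs programmatically instead of hard-coding row labels. pd.to_numeric with
# errors='coerce' turns non-numeric IDs into NaN; after the drop above, this
# should print an empty list.
bad_id_rows = md[pd.to_numeric(md['id'], errors='coerce').isnull()]
print(bad_id_rows.index.tolist())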
# In[18]:

md['id'] = md['id'].astype('int')

# In[19]:

# Keep only the movies whose IDs appear in "links" (the small dataset).
smd = md[md['id'].isin(links)].copy()  # .copy() avoids SettingWithCopyWarning below
smd.shape

# In[20]:

smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

# This is where things get exciting!
#
# [1] TfidfVectorizer converts a collection of raw documents to a matrix of TF-IDF features. TF-IDF (term frequency-inverse document frequency) weighs how many times a word appears in a document against how rare that word is across the whole set of documents.
#
# [2] ngram_range: all values of n such that min_n <= n <= max_n will be used. For example, an ngram_range of (1, 1) means only unigrams, while (1, 2) means unigrams and bigrams. Here we use both unigrams and bigrams.
#
# [3] A 1-gram (or unigram) is a one-word sequence. A 2-gram (or bigram) is a two-word sequence, like "I love" or "love reading". A 3-gram (or trigram) is a three-word sequence, like "I love reading".
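# In[ ]:

# Toy demonstration (illustrative, with a made-up two-document corpus):
# ngram_range=(1, 2) produces both single words and adjacent word pairs as
# features. sorted(vocabulary_) lists the learned feature names.
demo_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
demo_matrix = demo_tf.fit_transform(['the cat sat', 'the cat ran fast'])
print(sorted(demo_tf.vocabulary_))
# -> ['cat', 'cat ran', 'cat sat', 'fast', 'ran', 'ran fast', 'sat']
print(demo_matrix.shape)  # (2 documents, 7 features)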
# In[21]:

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

# In[22]:

# TF-IDF rows are L2-normalized, so the plain dot product (linear_kernel) is
# exactly the cosine similarity, just cheaper to compute.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
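# In[ ]:

# Sanity check (illustrative, not in the original notebook): on L2-normalized
# TF-IDF vectors, linear_kernel and cosine_similarity agree to numerical
# precision. Checked on the first 5 rows to keep it cheap.
a = tfidf_matrix[:5]
print(np.allclose(linear_kernel(a, a), cosine_similarity(a, a)))  # True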
# In[23]:

smd = smd.reset_index()

# In[24]:

titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

# In[25]:

tfidf_matrix.shape

# In[34]:

def get_recommendations(title):
    # Some titles appear more than once; take the first match in that case.
    if indices[title].shape == ():
        idx = indices[title]
    else:
        idx = indices[title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:5]  # skip position 0: the movie itself
    title_idx = [i[0] for i in sim_scores]
    title_rec = [titles[i] for i in title_idx]
    return title_rec

# In[28]:

# Leftover Gradio hello-world test; not used below.
def greet(name):
    return "Hello " + name + "!!"

# In[27]:

#get_recommendations('The Dark Knight')

# Adding the metadata to the recommender system

# In[42]:

credits = pd.read_csv(path + '/credits.csv')
keywords = pd.read_csv(path + '/keywords.csv')

# In[43]:

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

# In[44]:

md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
smd = md[md['id'].isin(links)].copy()

# In[45]:

smd.shape

# In[46]:

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

# In[47]:

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# In[48]:

smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)  # keep the top 3 billed actors

# In[49]:

smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# In[50]:

# Strip spaces and lowercase so first names don't collide across people.
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# In[51]:

smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])  # we mention the director 3 times to give it more weight

# In[52]:

# Count keyword frequencies and keep only keywords that occur more than once.
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

# In[53]:

stemmer = SnowballStemmer('english')

# In[54]:

stemmer.stem('')

# In[55]:

smd['keywords'] = smd['keywords'].apply(lambda x: [i for i in x if i in s])
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# In[56]:

smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

# In[57]:

count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

# In[58]:

# Unlike the TF-IDF rows, raw counts are not L2-normalized, so use
# cosine_similarity here rather than the plain dot product (linear_kernel).
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# In[59]:

smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

# In[45]:

#cosine_sim2.shape

# In[60]:

def get_recommendations(title, sim):
    if indices[title].shape == ():
        idx = indices[title]
    else:
        idx = indices[title][0]
    sim_scores = list(enumerate(sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    title_idx = [i[0] for i in sim_scores]
    title_rec = [titles[i] for i in title_idx]
    return title_rec

# In[62]:

#get_recommendations('The Avengers', cosine_sim2)

# This recommendation system works a lot better than the first, but it doesn't take popularity into account.

# In[75]:

def improved_recommendations(title):
    # Take the 25 most similar movies, then re-rank them by weighted rating so
    # popularity and vote quality are taken into account.
    if indices[title].shape == ():
        idx = indices[title]
    else:
        idx = indices[title][0]
    sim_scores = list(enumerate(cosine_sim2[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)

    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    # Use the local m and C computed from this candidate set; the global
    # weighted_rating helper would silently use the chart-wide m and C.
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count'] / (x['vote_count'] + m) * x['vote_average']) + (m / (m + x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return list(qualified['title'])

# In[76]:

#list(improved_recommendations('Mean Girls'))

# In[81]:

# improved_recommendations returns a single list of titles, so the interface
# needs a single text output; joining the titles onto separate lines keeps the
# display readable (the original eight separate text outputs would not match
# the function's single return value).
iface = gr.Interface(fn=lambda title: '\n'.join(improved_recommendations(title)),
                     title="Enter movie title for recommendations",
                     inputs="text",
                     outputs="text",
                     examples=['The Dark Knight', 'Mean Girls', 'Avatar', 'The Godfather', 'Top Gun', 'Toy Story'])
iface.launch()
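# In[ ]:

# Optional spot check (illustrative): the recommenders can also be queried
# directly, without the UI. Uncomment and run before launching the interface.
#print(get_recommendations('The Dark Knight', cosine_sim2))
#print(improved_recommendations('The Dark Knight'))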