# Hugging Face Spaces status banner captured together with the source:
# "Runtime error".
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[2]: | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import gradio as gr | |
#from scipy import stats | |
from ast import literal_eval | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity | |
from nltk.stem.snowball import SnowballStemmer | |
from nltk.stem.wordnet import WordNetLemmatizer | |
from nltk.corpus import wordnet | |
#from surprise import Reader, Dataset, SVD | |
import warnings; warnings.simplefilter('ignore') | |
#import surprise | |
# In[3]: | |
# Directory that holds the CSV files read by this script.
path = '.'

md = pd.read_csv(path + '/movies_metadata.csv')
md.head(2)  # notebook preview left over from the export; no effect in a script


# Simple recommender system


def _genre_names(parsed):
    """Return the 'name' of every entry when *parsed* is a list, else []."""
    if not isinstance(parsed, list):
        return []
    return [entry['name'] for entry in parsed]


# Missing genre cells are replaced by the literal '[]' so that every cell can
# be parsed with literal_eval; each movie then keeps only its genre names.
md['genres'] = (
    md['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(_genre_names)
)
# Weighted Rating (WR) = (v / (v + m)) * R + (m / (v + m)) * C
#
# where
#
# [1] v is the number of votes for the movie <br>
# [2] m is the minimum number of votes required to be listed in the chart <br>
# [3] R is the average rating of the movie <br>
# [4] C is the mean vote across the whole report <br>
# In[6]: | |
# Chart-wide inputs of the weighted-rating formula, each computed over the
# rows where the respective column is present.  Both columns are cast to int
# before aggregating.
vote_counts = md.loc[md['vote_count'].notnull(), 'vote_count'].astype(int)
vote_average = md.loc[md['vote_average'].notnull(), 'vote_average'].astype(int)

C = np.mean(vote_average)        # mean of the (integer-cast) vote averages
m = vote_counts.quantile(0.95)   # vote count at the 95th percentile

print('The average rating for these movies is: ', C)
print('The minimum votes required to be listed in the chart: ', m)
# In[7]: | |
# Keep only the release year (as a string) for every movie.
# With errors='coerce', missing or unparseable dates become NaT.  NaN/NaT
# never compare equal to anything, so a `!= np.nan` test cannot detect them
# and would label those rows with the string 'NaT'; pd.notna() does detect
# them, so such rows get a real missing value instead.
release_dates = pd.to_datetime(md['release_date'], errors='coerce')
md['year'] = release_dates.apply(
    lambda stamp: str(stamp).split('-')[0] if pd.notna(stamp) else np.nan
)
# In[8]: | |
md['popularity']  # notebook preview left over from the export; no effect

# Movies eligible for the overall chart: at least `m` votes and both rating
# columns present.  Only the columns needed for the chart are kept.
chart_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
eligible = (
    (md['vote_count'] >= m)
    & md['vote_count'].notnull()
    & md['vote_average'].notnull()
)
qualified = md[eligible][chart_columns]

# Work with whole numbers for both rating columns.
for rating_column in ('vote_count', 'vote_average'):
    qualified[rating_column] = qualified[rating_column].astype(int)
# In[11]: | |
def weighted_rating(x):
    """Return the weighted rating of one movie row.

    Reads ``x['vote_count']`` and ``x['vote_average']`` and blends the movie's
    own average with the module-level mean ``C``; the blend is driven by the
    movie's vote count relative to the module-level threshold ``m``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    return (votes / total * rating) + (m / total * C)
# In[12]: | |
# Score every eligible movie with weighted_rating, then keep the 250 rows
# with the highest score, best first.
qualified = (
    qualified
    .assign(wr=qualified.apply(weighted_rating, axis=1))
    .sort_values('wr', ascending=False)
    .head(250)
)
# In[13]: | |
# Long-form genre table: one row per (movie, genre) pair.
# Series.explode() produces the same column as building one pd.Series per row
# and stacking the result, without constructing a temporary Series for every
# movie.  Movies with an empty genre list are dropped from `s` here, so after
# the index join they appear once in gen_md with a missing 'genre'.
s = md['genres'].explode().dropna()
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)
# In[14]: | |
def build_chart(genre, percentile=0.85):
    """Return the best-rated movies of one genre, ranked by weighted rating.

    Args:
        genre: Value compared against the 'genre' column of ``gen_md``.
        percentile: Quantile of the genre's vote counts used as the minimum
            number of votes a movie needs to be listed.

    Returns:
        A DataFrame with at most 250 rows holding title, year, vote_count,
        vote_average, popularity and the computed 'wr' column, sorted by
        'wr' in descending order.
    """
    genre_movies = gen_md[gen_md['genre'] == genre]

    has_count = genre_movies['vote_count'].notnull()
    has_average = genre_movies['vote_average'].notnull()

    # Genre-specific mean rating and vote threshold.
    C = genre_movies[has_average]['vote_average'].astype('int').mean()
    m = genre_movies[has_count]['vote_count'].astype('int').quantile(percentile)

    columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
    keep = (genre_movies['vote_count'] >= m) & has_count & has_average
    chart = genre_movies[keep][columns]
    chart['vote_count'] = chart['vote_count'].astype('int')
    chart['vote_average'] = chart['vote_average'].astype('int')

    def score(row):
        # Same weighted-rating formula, using this genre's m and C.
        votes = row['vote_count']
        return (votes / (votes + m) * row['vote_average']) + (m / (m + votes) * C)

    chart['wr'] = chart.apply(score, axis=1)
    return chart.sort_values('wr', ascending=False).head(250)
# In[15]: | |
# <b> Content Based Recommender/ Filtering <b> | |
# | |
# In this section we personalize the movie recommendations, Content Based Recommenders based on: | |
# | |
# Movie Overviews and Taglines <br> | |
# Movie Cast, Crew, Keywords and Genre | |
# | |
# In[16]: | |
# Restrict the catalogue to the movies listed in links_small.csv.
links = pd.read_csv(path + '/links_small.csv')
links = links[links['tmdbId'].notnull()]['tmdbId'].astype(int)

# NOTE(review): rows 19730, 29503 and 35587 are presumably the ones whose
# 'id' is not numeric (the int cast below would fail on them) — confirm
# against the CSV in use; drop() raises KeyError if these labels are absent.
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')

# Movies whose id appears in `links`.  An explicit copy makes the column
# assignments below act on an independent frame rather than a slice of md.
smd = md[md['id'].isin(links)].copy()
smd.shape  # notebook preview left over from the export; no effect

# Text for the description-based recommender: overview followed by tagline.
# Both parts are filled before concatenating, so a movie that has a tagline
# but no overview keeps its tagline (NaN + str would yield NaN), and the
# separating space stops the last overview word from fusing with the first
# tagline word into a bogus token.
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = (smd['overview'].fillna('') + ' ' + smd['tagline']).str.strip()
# <b><font size="3">This is where things get exciting!</font></b>
#
# [1] Convert a collection of raw documents to a matrix of TF-IDF features. TF-IDF stands for term frequency–inverse document frequency: <br>
# <b>how many times a word appears in a document, weighted by the inverse of how frequently the word appears across the whole set of documents.</b> <br>
#
# [2] ngram_range: all values of n such that min_n <= n <= max_n are used. For example, an ngram_range of (1, 1) means only unigrams and (1, 2) means unigrams and bigrams, so we are using both unigrams and bigrams. <br>
#
# [3] A 1-gram (or unigram) is a one-word sequence. A 2-gram (or bigram) is a two-word sequence, like “I love”, “love reading”, or “Analytics Vidhya”. A 3-gram (or trigram) is a three-word sequence, like “I love reading”.
# In[21]: | |
# TF-IDF over unigrams and bigrams of the descriptions, English stop words
# removed.  min_df=1 keeps every term, which is the vocabulary an integer 0
# would select as well, and is accepted by scikit-learn releases that
# validate an integer min_df to be >= 1 (integer 0 is rejected there).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity here.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Positional index plus lower-cased titles, so that a title can be mapped to
# its row position in the similarity matrix.
smd = smd.reset_index()
smd['title'] = smd['title'].apply(lambda x: str.lower(x))

titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
# In[25]: | |
# In[28]: | |
# In[27]: | |
# <b> <font size="3"> Adding the metadata to the rec system <font> <b> | |
# In[42]: | |
credits = pd.read_csv(path + '/credits.csv')
keywords = pd.read_csv(path + '/keywords.csv')

# Give the join key the same integer dtype in all three frames.
for frame in (keywords, credits, md):
    frame['id'] = frame['id'].astype('int')

# Attach credits and keywords to the metadata, then narrow the result down to
# the movies whose id appears in `links`.
md = md.merge(credits, on='id').merge(keywords, on='id')
smd = md[md['id'].isin(links)]

# cast, crew and keywords are stored as Python-literal strings; parse them
# into Python objects.
for column in ('cast', 'crew', 'keywords'):
    smd[column] = smd[column].apply(literal_eval)

smd['cast_size'] = smd['cast'].apply(len)
smd['crew_size'] = smd['crew'].apply(len)
# In[47]: | |
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: Iterable of mappings that each provide 'job' and 'name'.

    Returns:
        The 'name' of the first matching entry, or ``np.nan`` when no entry
        has the job 'Director'.
    """
    return next(
        (member['name'] for member in x if member['job'] == 'Director'),
        np.nan,
    )
# In[48]: | |
def _director_tokens(name):
    """Return the director as three identical lower-case, space-free tokens.

    The token is repeated three times to give the director more weight in the
    combined text.  When no director was found (get_director returned NaN) an
    empty list is returned: casting NaN to str would produce the token 'nan',
    which every director-less movie would then share as if it were a person.
    """
    if not isinstance(name, str):
        return []
    token = name.replace(" ", "").lower()
    return [token] * 3


smd['director'] = smd['crew'].apply(get_director).apply(_director_tokens)

# Names of the first three cast entries, lower-cased with spaces removed so
# that a full name forms a single token.
smd['cast'] = smd['cast'].apply(
    lambda x: [i['name'].replace(" ", "").lower() for i in x[:3]] if isinstance(x, list) else []
)

# Keep only the keyword names.
smd['keywords'] = smd['keywords'].apply(
    lambda x: [i['name'] for i in x] if isinstance(x, list) else []
)
# we mentioned director 3 times to give it more weight | |
# In[52]: | |
# Count how often each keyword occurs across the catalogue and keep only the
# keywords used more than once.  Series.explode() flattens the per-movie
# lists directly instead of building one temporary pd.Series per row, and
# value_counts() ignores the missing values produced by empty lists.
s = smd['keywords'].explode()
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]
# In[53]: | |
stemmer = SnowballStemmer('english')
stemmer.stem('')  # notebook residue; the result is discarded


def _normalise_keywords(words):
    """Drop keywords missing from `s`, then stem, de-space and lower-case."""
    kept = [word for word in words if word in s]
    return [str.lower(stemmer.stem(word).replace(" ", "")) for word in kept]


smd['keywords'] = smd['keywords'].apply(_normalise_keywords)

# "Soup": all tokens of a movie (keywords, cast, director, genres) joined
# into a single space-separated string for the vectorizer.
soup_tokens = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = soup_tokens.apply(' '.join)
# In[57]: | |
# Bag-of-words counts over unigrams and bigrams of the soup.  min_df=1 keeps
# every term, which is the vocabulary an integer 0 would select as well, and
# is accepted by scikit-learn releases that validate an integer min_df to be
# >= 1 (integer 0 is rejected there).
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

# Raw count rows are not length-normalised, so a plain dot product would
# favour movies with long soups; cosine_similarity normalises each row first
# and yields an actual cosine score.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Positional index plus lower-cased titles, so that a title can be mapped to
# its row position in cosine_sim2.
smd = smd.reset_index()
smd['title'] = smd['title'].apply(lambda x: str.lower(x))
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
# In[45]: | |
#cosine_sim2.shape | |
# In[60]: | |
#get_recommendations('The Avengers',cosine_sim2) | |
# <font size="3"> This recommendation system works a lot better than the first, but it doesn't take popularity into account. <font> | |
# In[75]: | |
def improved_recommendations(title):
    """Return up to ten recommended movie titles for *title*.

    The 25 movies most similar to the query according to ``cosine_sim2`` are
    collected, those with too few votes are discarded, and the remainder is
    ranked by a weighted rating computed from the candidates' own statistics.

    Args:
        title: Movie title; matched case-insensitively, surrounding
            whitespace ignored.

    Returns:
        A list of at most ten title-cased movie titles, best first.

    Raises:
        KeyError: If the title is not present in ``indices``.
    """
    idx = indices[str.lower(title).strip()]
    # Several movies can share a title; the lookup then yields a Series of
    # positions.  Use the first one so cosine_sim2[idx] stays a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = cosine_sim2[idx]
    # Leave the query movie out explicitly instead of assuming it sorts first
    # (another movie with an identical score could take that slot).
    movie_indices = sorted(
        (i for i in range(len(scores)) if i != idx),
        key=lambda i: scores[i],
        reverse=True,
    )[:25]

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)

    keep = (
        (movies['vote_count'] >= m)
        & movies['vote_count'].notnull()
        & movies['vote_average'].notnull()
    )
    qualified = movies[keep].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Weighted rating with the candidate set's own m and C.  Calling the
    # module-level weighted_rating helper here would silently use the global
    # chart's m and C instead of the two values computed above.  The
    # column-wise form also copes with an empty candidate frame.
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return [name.title() for name in qualified['title']]
# In[76]: | |
#list(improved_recommendations('Mean Girls')) | |
# In[81]: | |
# Number of text boxes shown in the interface.
_OUTPUT_BOXES = 8


def _recommendations_for_ui(title):
    """Return exactly ``_OUTPUT_BOXES`` strings for the output text boxes.

    improved_recommendations returns a list of variable length (up to ten
    titles), while the interface declares a fixed number of output
    components and expects one value for each of them.  The list is
    therefore truncated or padded with empty strings.  An unknown title is
    reported in the first box instead of surfacing as a raw KeyError.
    """
    try:
        found = improved_recommendations(title)
    except KeyError:
        found = ['No movie with that title was found.']
    shown = list(found[:_OUTPUT_BOXES])
    return shown + [''] * (_OUTPUT_BOXES - len(shown))


iface = gr.Interface(
    fn=_recommendations_for_ui,
    title="Enter movie title for recommendations",
    inputs="text",
    outputs=["text"] * _OUTPUT_BOXES,
    examples=['The Dark Knight', 'Mean Girls', 'Avatar', 'The Godfather', 'Top Gun', 'Toy Story'],
)
iface.launch()
# In[83]: | |