pea-movie2moviev2

Runtime error

App Files Files Community

pea-movie2moviev2 / app.py

Zenovak

Attempt to fix build error which resulted from improper scikit-learn requirements.txt usage

56ed207 12 months ago

raw

history blame contribute delete

No virus

9.76 kB

	#!/usr/bin/env python
	# coding: utf-8

	# In[2]:


	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import gradio as gr
	#from scipy import stats
	from ast import literal_eval
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
	from nltk.stem.snowball import SnowballStemmer
	from nltk.stem.wordnet import WordNetLemmatizer
	from nltk.corpus import wordnet
	#from surprise import Reader, Dataset, SVD
	import warnings; warnings.simplefilter('ignore')
	#import surprise


	# In[3]:


	# Directory that holds the dataset CSV files read by this script.
	path = '.'


	# In[4]:


	# Load the movie metadata table; every later step filters or joins it.
	metadata_csv = path + '/movies_metadata.csv'
	md = pd.read_csv(metadata_csv)
	md.head(2)  # notebook preview kept from the export; the result is discarded


	# <b> Simple rec system <b>
	#

	# In[5]:


	# Missing genre cells are replaced by '[]' so they parse to an empty list;
	# each parsed record list is then reduced to just its genre names.
	parsed_genres = md['genres'].fillna('[]').apply(literal_eval)
	md['genres'] = parsed_genres.apply(
	    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
	)


	# Weighted Rating (WR) = (v/(v+m) * R) + (m/(v+m) * C)
	#
	# where,
	#
	# [1] v is the number of votes for the movie <br>
	# [2] m is the minimum votes required to be listed in the chart <br>
	# [3] R is the average rating of the movie <br>
	# [4] C is the mean vote across the whole report <br>

	# In[6]:


	# Integer-cast vote columns with missing entries removed.
	vote_counts = md['vote_count'].dropna().astype(int)
	vote_average = md['vote_average'].dropna().astype(int)

	# C: mean rating over all rated movies; m: 95th percentile of the vote counts.
	C = vote_average.mean()
	m = vote_counts.quantile(0.95)

	print('The average rating for these movies is: ',C)
	print('The minimum votes required to be listed in the chart: ',m)


	# In[7]:


	# Keep only the release year, as a string. Dates that cannot be parsed are
	# coerced to NaT and stored as NaN. The null test must be ``pd.notnull``:
	# a comparison such as ``x != np.nan`` is always True, which would let
	# missing dates through as the literal string 'NaT'.
	md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
	    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
	)


	# In[8]:


	md['popularity']  # notebook display kept from the export; the result is discarded


	# In[9]:


	# Movies eligible for the overall chart: at least ``m`` votes and both vote
	# columns present.
	chart_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
	has_enough_votes = md['vote_count'] >= m
	has_vote_data = md['vote_count'].notnull() & md['vote_average'].notnull()
	qualified = md[has_enough_votes & has_vote_data][chart_columns]


	# In[10]:


	# Both vote columns are stored as integers from here on.
	for column in ('vote_count', 'vote_average'):
	    qualified[column] = qualified[column].astype(int)


	# In[11]:


	def weighted_rating(x, min_votes=None, mean_vote=None):
	    """Return the weighted rating ``v/(v+m) * R + m/(v+m) * C`` for one movie.

	    Args:
	        x: Mapping-like row providing ``'vote_count'`` (v) and
	            ``'vote_average'`` (R), e.g. a DataFrame row passed by
	            ``DataFrame.apply(..., axis=1)``.
	        min_votes: Value used for ``m``. Defaults to the module-level ``m``.
	        mean_vote: Value used for ``C``. Defaults to the module-level ``C``.

	    Returns:
	        The weighted rating as a float.
	    """
	    # Fall back to the chart-wide statistics only when no override is given,
	    # so existing ``apply(weighted_rating, axis=1)`` calls behave as before.
	    if min_votes is None:
	        min_votes = m
	    if mean_vote is None:
	        mean_vote = C
	    v = x['vote_count']
	    R = x['vote_average']
	    return (v / (v + min_votes) * R) + (min_votes / (min_votes + v) * mean_vote)


	# In[12]:


	# Score every eligible movie, then keep the 250 with the highest weighted rating.
	qualified['wr'] = qualified.apply(weighted_rating, axis=1)
	qualified = qualified.sort_values(by='wr', ascending=False).iloc[:250]


	# In[13]:


	# Build one row per (movie, genre) pair: spread each genre list over columns,
	# stack those columns into a single Series keyed by the original row label,
	# then join that Series back onto the metadata without its list column.
	genre_columns = md.apply(lambda row: pd.Series(row['genres']), axis=1)
	s = genre_columns.stack().reset_index(level=1, drop=True)
	s.name = 'genre'
	gen_md = md.drop(columns='genres').join(s)


	# In[14]:


	def build_chart(genre, percentile=0.85, data=None):
	    """Return the top-250 movies of one genre ranked by weighted rating.

	    Args:
	        genre: Genre name to select (matched against the ``'genre'`` column).
	        percentile: Quantile of the genre's vote counts used as the minimum
	            number of votes ``m`` a movie needs to be listed.
	        data: Optional frame with one row per (movie, genre) pair and the
	            columns ``title``, ``year``, ``vote_count``, ``vote_average``,
	            ``popularity`` and ``genre``. Defaults to the module-level
	            ``gen_md``.

	    Returns:
	        A DataFrame with the columns ``title``, ``year``, ``vote_count``,
	        ``vote_average``, ``popularity`` and ``wr``, sorted by ``wr`` in
	        descending order and limited to 250 rows. It is empty when no movie
	        of that genre qualifies.
	    """
	    source = gen_md if data is None else data
	    df = source[source['genre'] == genre]  # rows of the requested genre only

	    vote_counts = df['vote_count'].dropna().astype('int')
	    vote_averages = df['vote_average'].dropna().astype('int')
	    C = vote_averages.mean()
	    m = vote_counts.quantile(percentile)

	    # A NaN vote count never satisfies ``>= m``, so only the average needs an
	    # explicit null check. ``.copy()`` detaches the result from ``source``
	    # before columns are assigned on it.
	    eligible = (df['vote_count'] >= m) & df['vote_average'].notnull()
	    columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
	    qualified = df.loc[eligible, columns].copy()
	    qualified['vote_count'] = qualified['vote_count'].astype('int')
	    qualified['vote_average'] = qualified['vote_average'].astype('int')

	    # Column-wise arithmetic also works on an empty selection, where a
	    # row-wise ``apply`` returns a DataFrame that cannot be assigned to 'wr'.
	    v = qualified['vote_count']
	    R = qualified['vote_average']
	    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

	    return qualified.sort_values('wr', ascending=False).head(250)


	# In[15]:




	# <b> Content Based Recommender/ Filtering <b>
	#
	# In this section we personalize the movie recommendations by building content-based recommenders from:
	#
	# Movie Overviews and Taglines <br>
	# Movie Cast, Crew, Keywords and Genre
	#

	# In[16]:


	# After this step ``links`` is a Series of integer TMDB ids; rows of
	# links_small.csv without a ``tmdbId`` are dropped.
	links = pd.read_csv(path+'/links_small.csv')
	links = links['tmdbId'].dropna().astype(int)


	# In[17]:


	# These three rows are removed before the integer cast below
	# (presumably their 'id' values are not numeric -- verify against the CSV).
	md = md.drop(index=[19730, 29503, 35587])


	# In[18]:


	md['id'] = md['id'].astype(int)


	# In[19]:


	# Getting the movies that their IDs exist in "links"
	smd = md[md['id'].isin(links)]
	smd.shape


	# In[20]:


	smd['tagline'] = smd['tagline'].fillna('')
	smd['description'] = smd['overview'] + smd['tagline']
	smd['description'] = smd['description'].fillna('')


	# <b><font size="3"> This is where things get exciting!</font></b>
	#
	# [1] Convert a collection of raw documents to a matrix of TF-IDF features -- TF-IDF: term frequency–inverse document frequency <br>
	# <b>Term frequency is how many times a word appears in a document; inverse document frequency down-weights words that appear in many documents of the set.</b> <br>
	#
	# [2] ngram_range: All values of n such that min_n <= n <= max_n will be used. For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, So we're using both unigrams and bigrams <br>
	#
	# [3] A 1-gram (or unigram) is a one-word sequence. ... A 2-gram (or bigram) is a two-word sequence of words, like “I love”, “love reading”, or “Analytics Vidhya”. And a 3-gram (or trigram) is a three-word sequence of words like “I love reading”

	# In[21]:


	# TF-IDF over the unigrams and bigrams of each description, with English stop
	# words removed. ``min_df=1`` keeps every term that occurs at all; an integer
	# ``min_df`` of 0 means the same thing but is rejected by the parameter
	# validation of recent scikit-learn releases.
	tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
	tfidf_matrix = tf.fit_transform(smd['description'])


	# In[22]:


	# TfidfVectorizer L2-normalises its rows by default, so the plain dot product
	# computed by ``linear_kernel`` is the cosine similarity here.
	cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


	# In[23]:


	# Renumber the rows 0..n-1 (the old labels are kept in an 'index' column) and
	# lower-case the titles used as lookup keys.
	smd = smd.reset_index()
	smd['title'] = smd['title'].apply(str.lower)


	# In[24]:


	# ``indices`` maps a lower-cased title to its row position in ``smd``.
	titles = smd['title']
	indices = pd.Series(smd.index, index=titles)


	# In[25]:







	# In[28]:





	# In[27]:




	# <b> <font size="3"> Adding the metadata to the rec system <font> <b>

	# In[42]:


	credits = pd.read_csv(path+'/credits.csv')
	keywords = pd.read_csv(path+'/keywords.csv')


	# In[43]:


	# The join key must be an integer in all three frames.
	for frame in (keywords, credits, md):
	    frame['id'] = frame['id'].astype('int')


	# In[44]:


	# Attach cast/crew and keywords to the metadata, then restrict the result to
	# the movies listed in ``links``.
	md = md.merge(credits, on='id').merge(keywords, on='id')
	in_links = md['id'].isin(links)
	smd = md[in_links]


	# In[45]:


	#smd.shape


	# In[46]:


	# The three columns hold Python literals stored as text; parse them into
	# real objects before anything inspects their contents.
	for column in ('cast', 'crew', 'keywords'):
	    smd[column] = smd[column].apply(literal_eval)

	# Number of entries in the parsed cast and crew of each movie.
	smd['cast_size'] = smd['cast'].apply(len)
	smd['crew_size'] = smd['crew'].apply(len)


	# In[47]:


	def get_director(x):
	    """Return the name of the first crew record whose job is 'Director'.

	    Args:
	        x: Sequence of crew records, each a dict with ``'job'`` and
	            ``'name'`` keys.

	    Returns:
	        The director's name, or ``np.nan`` when ``x`` is not a list or tuple,
	        contains no director record, or the record has no name.
	    """
	    # Anything that is not a parsed record list (e.g. a NaN cell) has no director.
	    if not isinstance(x, (list, tuple)):
	        return np.nan
	    for member in x:
	        # ``.get`` skips malformed records instead of raising KeyError.
	        if isinstance(member, dict) and member.get('job') == 'Director':
	            return member.get('name', np.nan)
	    return np.nan


	# In[48]:


	smd['director'] = smd['crew'].apply(get_director)
	# Reduce each cast to the names of its first three entries (fewer if the
	# cast is shorter; slicing already handles that case).
	smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x][:3] if isinstance(x, list) else [])


	# In[49]:


	smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])


	# In[50]:


	# Lower-case the names and remove spaces so each person becomes one token.
	smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])


	# In[51]:


	# The director token is repeated three times to give it more weight.
	# Movies without a director get an empty list: casting the missing value to
	# str would yield the token 'nan' and make all director-less movies look
	# as if they shared a director.
	smd['director'] = smd['director'].apply(
	    lambda x: [str.lower(x.replace(" ", ""))] * 3 if isinstance(x, str) else []
	)


	# In[52]:


	# Count how often each keyword occurs across all movies and keep only the
	# keywords that occur more than once. ``s`` is indexed by keyword.
	keyword_columns = smd.apply(lambda row: pd.Series(row['keywords']), axis=1)
	s = keyword_columns.stack().reset_index(level=1, drop=True)
	s.name = 'keyword'
	s = s.value_counts()
	s = s[s > 1]


	# In[53]:


	stemmer = SnowballStemmer('english')


	# In[54]:


	stemmer.stem('')  # notebook cell kept from the export; the result is discarded


	# In[55]:


	# In one pass per movie: keep the keywords present in the index of ``s``
	# (those that occur more than once), stem them, remove spaces, lower-case.
	smd['keywords'] = smd['keywords'].apply(
	    lambda x: [str.lower(stemmer.stem(i).replace(" ", "")) for i in x if i in s]
	)


	# In[56]:


	# The "soup" is every metadata token of a movie joined into one string.
	smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
	smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))


	# In[57]:


	# ``min_df=1`` keeps every term that occurs at all; an integer ``min_df`` of 0
	# means the same thing but is rejected by the parameter validation of recent
	# scikit-learn releases.
	count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
	count_matrix = count.fit_transform(smd['soup'])


	# In[58]:


	# Raw count vectors are not normalised, so a plain dot product would favour
	# movies with long soups and would not guarantee that a movie is most similar
	# to itself. ``cosine_similarity`` normalises the rows first.
	cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


	# In[59]:


	# Renumber the rows 0..n-1 so that a row's position matches its row in
	# ``cosine_sim2``, then rebuild the lower-cased title lookup.
	smd = smd.reset_index()
	smd['title'] = smd['title'].apply(str.lower)
	titles = smd['title']
	indices = pd.Series(smd.index, index=titles)


	# In[45]:


	#cosine_sim2.shape


	# In[60]:





	#get_recommendations('The Avengers',cosine_sim2)


	# <font size="3"> This recommendation system works a lot better than the first, but it doesn't take popularity into account. <font>

	# In[75]:


	def improved_recommendations(title):
	    """Recommend up to ten movies similar to ``title``, best rated first.

	    The 25 movies most similar to the query according to ``cosine_sim2`` are
	    the candidates. Candidates whose vote count reaches the 60th percentile of
	    the candidate vote counts are ranked by the weighted rating
	    ``v/(v+m) * R + m/(v+m) * C``, where ``m`` is that percentile and ``C`` is
	    the mean (integer-cast) vote average of the candidates.

	    Args:
	        title: Movie title. Matching ignores case and surrounding whitespace.

	    Returns:
	        A list of at most ten title-cased movie titles in descending order of
	        weighted rating. The list is empty when the title is not a key of
	        ``indices``.
	    """
	    key = str(title).strip().lower()
	    try:
	        idx = indices[key]
	    except KeyError:
	        return []
	    # Several movies can share a title; the lookup then yields a Series of
	    # positions instead of a single one. Use the first match.
	    if isinstance(idx, pd.Series):
	        idx = idx.iloc[0]

	    ranked = sorted(enumerate(cosine_sim2[idx]), key=lambda pair: pair[1], reverse=True)
	    # Exclude the query movie by position instead of assuming it sorts first.
	    movie_indices = [i for i, _ in ranked if i != idx][:25]

	    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
	    vote_counts = movies['vote_count'].dropna().astype('int')
	    vote_averages = movies['vote_average'].dropna().astype('int')
	    mean_vote = vote_averages.mean()
	    min_votes = vote_counts.quantile(0.60)

	    # A NaN vote count never satisfies ``>= min_votes``, so only the average
	    # needs an explicit null check. ``.copy()`` detaches the selection from
	    # ``smd`` before columns are assigned on it.
	    eligible = (movies['vote_count'] >= min_votes) & movies['vote_average'].notnull()
	    qualified = movies[eligible].copy()
	    qualified['vote_count'] = qualified['vote_count'].astype('int')
	    qualified['vote_average'] = qualified['vote_average'].astype('int')

	    # Weight with the statistics of this candidate set, matching the threshold
	    # used for the filter above.
	    v = qualified['vote_count']
	    R = qualified['vote_average']
	    qualified['wr'] = (v / (v + min_votes) * R) + (min_votes / (min_votes + v) * mean_vote)

	    qualified = qualified.sort_values('wr', ascending=False).head(10)
	    return [name.title() for name in qualified['title']]


	# In[76]:


	#list(improved_recommendations('Mean Girls'))


	# In[81]:


	# Number of text boxes shown in the UI. The callback has to return exactly
	# one value per output component.
	N_RECOMMENDATION_BOXES = 8


	def _recommendations_for_ui(title):
	    """Return exactly ``N_RECOMMENDATION_BOXES`` strings for the interface.

	    ``improved_recommendations`` returns a list of varying length; the list
	    is truncated to the number of boxes, or padded with empty strings.
	    """
	    found = list(improved_recommendations(title))[:N_RECOMMENDATION_BOXES]
	    return found + [''] * (N_RECOMMENDATION_BOXES - len(found))


	iface = gr.Interface(
	    fn=_recommendations_for_ui,
	    title="Enter movie title for recommendations",
	    inputs="text",
	    outputs=["text"] * N_RECOMMENDATION_BOXES,
	    examples=['The Dark Knight', 'Mean Girls', 'Avatar', 'The Godfather', 'Top Gun', 'Toy Story'],
	)
	iface.launch()


	# In[83]: