|
import numpy as np |
|
import pandas as pd |
|
import os |
|
import random |
|
from collections import defaultdict |
|
import surprise |
|
from surprise.reader import Reader |
|
from surprise import Dataset |
|
from surprise.model_selection import GridSearchCV |
|
from surprise.model_selection import cross_validate |
|
from surprise import SVD |
|
from surprise import NMF |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import linear_kernel |
|
|
|
|
|
|
|
# Per-feature weights used when the individual similarity matrices are blended
# into the final content-based similarity. The eighteen weights add up to 1.0.

# Collection, genre, language and free-text features.
W_belongs_to_collection = 0.16
W_genres = 0.10
W_original_language = 0.01
W_title = 0.11
W_overview = 0.08
W_tagline = 0.10
W_keywords = 0.10

# Production features.
W_production_countries = 0.01
W_production_companies = 0.02

# Crew and cast features.
W_Director = 0.03
W_Writer = 0.02
W_Cast = 0.02
W_Top_Cast = 0.03

# Categorised numeric features.
W_budget_categorized = 0.01
W_length = 0.02
W_average_vote_categorized = 0.08
W_count_vote_categorized = 0.07
W_era = 0.03
|
|
|
# One shared TF-IDF vectorizer with English stop words removed. Every later
# call to ``fit_transform`` re-fits it on the column being vectorized.
tfidf = TfidfVectorizer(stop_words='english')

# Input tables, read from CSV files relative to the current working directory.
# Despite the ``*_filename`` names, these hold the loaded DataFrames.
movies_filename = pd.read_csv('movies_metadata.csv', low_memory=False)
ratings_filename = pd.read_csv('ratings_small.csv', low_memory=False)
df_popular_popularity = pd.read_csv('df_popular_popularity.csv', low_memory=False)
df_popular_WR_Q = pd.read_csv('df_popular_WR_Q.csv', low_memory=False)
df_cbf_Q = pd.read_csv('df_cbf_Q.csv', low_memory=False)

# Replace missing values with empty strings in these columns of the
# content-based table.
for _text_column in (
    'belongs_to_collection',
    'overview',
    'spoken_languages',
    'tagline',
    'Director',
    'Writer',
):
    df_cbf_Q[_text_column] = df_cbf_Q[_text_column].fillna("")

# Both names refer to the same DataFrame object as df_cbf_Q (no copy is made).
df_cbf1 = df_cbf2 = df_cbf_Q

# Working names for the ratings and movie metadata tables.
ratings = ratings_filename
movie_md = movies_filename
|
|
|
|
|
|
|
# Keep only the movies whose vote_count is greater than 100.
movie_md = movie_md.loc[movie_md['vote_count'] > 100]

# Keep only the ratings of users that have at least 10 rating rows.
ratings = ratings[ratings.groupby("userId")['userId'].transform('count') >= 10]

# Ids of the remaining movies as Python ints, so they can be matched against
# the ratings table's movieId column.
movie_ids = list(map(int, movie_md['id'].values))

# Drop ratings of movies that did not survive the vote_count filter.
ratings = ratings[ratings['movieId'].isin(movie_ids)]

# Renumber the rows 0..n-1 after filtering, discarding the old index.
ratings = ratings.reset_index(drop=True)
|
|
|
|
|
|
|
def _tfidf_similarity(text_column):
    """Vectorize one column and compare every row with every other row.

    The shared module-level ``tfidf`` vectorizer is re-fitted on
    ``text_column``.

    Args:
        text_column: Column of ``df_cbf1`` to vectorize.

    Returns:
        A ``(matrix, similarity)`` tuple: the output of
        ``tfidf.fit_transform`` and the ``linear_kernel`` of that matrix
        with itself.
    """
    matrix = tfidf.fit_transform(text_column)
    return matrix, linear_kernel(matrix, matrix)


# One TF-IDF matrix and one pairwise similarity matrix per feature column.
# The columns are processed in the same order as before, so ``tfidf`` ends up
# fitted on the 'era' column.
df_cbf_tfidf_belongs_to_collection, cosine_sim_belongs_to_collection = (
    _tfidf_similarity(df_cbf1['belongs_to_collection']))
df_cbf_tfidf_genres, cosine_sim_genres = (
    _tfidf_similarity(df_cbf1['genres']))
df_cbf_tfidf_original_language, cosine_sim_original_language = (
    _tfidf_similarity(df_cbf1['original_language']))
df_cbf_tfidf_title, cosine_sim_title = (
    _tfidf_similarity(df_cbf1['title']))
df_cbf_tfidf_overview, cosine_sim_overview = (
    _tfidf_similarity(df_cbf1['overview']))
df_cbf_tfidf_pruduction_countries, cosine_sim_pruduction_countries = (
    _tfidf_similarity(df_cbf1['production_countries']))
df_cbf_tfidf_pruduction_companies, cosine_sim_pruduction_companies = (
    _tfidf_similarity(df_cbf1['production_companies']))
df_cbf_tfidf_tagline, cosine_sim_tagline = (
    _tfidf_similarity(df_cbf1['tagline']))
df_cbf_tfidf_keywords, cosine_sim_keywords = (
    _tfidf_similarity(df_cbf1['keywords']))
df_cbf_tfidf_Director, cosine_sim_Director = (
    _tfidf_similarity(df_cbf1['Director']))
df_cbf_tfidf_Writer, cosine_sim_Writer = (
    _tfidf_similarity(df_cbf1['Writer']))
df_cbf_tfidf_Cast, cosine_sim_Cast = (
    _tfidf_similarity(df_cbf1['Cast']))
df_cbf_tfidf_Top_Cast, cosine_sim_Top_Cast = (
    _tfidf_similarity(df_cbf1['Top Cast']))
df_cbf_tfidf_budget_categorized, cosine_sim_budget_categorized = (
    _tfidf_similarity(df_cbf1['budget_categorized']))
df_cbf_tfidf_Length, cosine_sim_Length = (
    _tfidf_similarity(df_cbf1['Length']))
df_cbf_tfidf_average_vote_categorized, cosine_sim_average_vote_categorized = (
    _tfidf_similarity(df_cbf1['average_vote_categorized']))
df_cbf_tfidf_count_vote_categorized, cosine_sim_count_vote_categorized = (
    _tfidf_similarity(df_cbf1['count_vote_categorized']))
df_cbf_tfidf_era, cosine_sim_era = (
    _tfidf_similarity(df_cbf1['era']))
|
|
|
|
|
|
|
# Blend the per-feature similarity matrices into a single matrix: each one is
# scaled by its weight and the scaled matrices are added left to right.
cosin_sim_final = (
    cosine_sim_belongs_to_collection * W_belongs_to_collection
    + cosine_sim_genres * W_genres
    + cosine_sim_original_language * W_original_language
    + cosine_sim_title * W_title
    + cosine_sim_overview * W_overview
    + cosine_sim_pruduction_countries * W_production_countries
    + cosine_sim_pruduction_companies * W_production_companies
    + cosine_sim_tagline * W_tagline
    + cosine_sim_keywords * W_keywords
    + cosine_sim_Director * W_Director
    + cosine_sim_Writer * W_Writer
    + cosine_sim_Cast * W_Cast
    + cosine_sim_Top_Cast * W_Top_Cast
    + cosine_sim_budget_categorized * W_budget_categorized
    + cosine_sim_Length * W_length
    + cosine_sim_average_vote_categorized * W_average_vote_categorized
    + cosine_sim_count_vote_categorized * W_count_vote_categorized
    + cosine_sim_era * W_era
)

# Lookup from movie title to the row label of that movie in df_cbf2.
df_cbf2_indices = pd.Series(data=df_cbf2.index, index=df_cbf2['title'])
|
|
|
|
|
|
|
def final_recommender_hot_picks_now(Watched_list):
    """Return the titles of the first ten rows of ``df_popular_popularity``.

    Args:
        Watched_list: Accepted so all recommenders share one signature; it is
            not used here.

    Returns:
        A list with the ``title`` values of the first ten rows, in table
        order. Fewer titles are returned when the table has fewer than ten
        rows.
    """
    # head() selects by position, so a short table no longer raises the
    # KeyError that the label-based ``.loc[i, 'title']`` lookups produced.
    return df_popular_popularity['title'].head(10).tolist()
|
|
|
|
|
|
|
def final_recommender_hot_picks_of_all_time(Watched_list):
    """Return the titles of the first ten rows of ``df_popular_WR_Q``.

    Args:
        Watched_list: Accepted so all recommenders share one signature; it is
            not used here.

    Returns:
        A list with the ``title`` values of the first ten rows, in table
        order. Fewer titles are returned when the table has fewer than ten
        rows.
    """
    # head() selects by position, so a short table no longer raises the
    # KeyError that the label-based ``.loc[i, 'title']`` lookups produced.
    return df_popular_WR_Q['title'].head(10).tolist()
|
|
|
|
|
|
|
def final_recommender_for_you(Watched_list):
    """Recommend titles similar to the three most recently watched movies.

    With fewer than three watched titles the first ten titles of
    ``df_popular_WR_Q`` are returned unchanged. Otherwise, for each of the
    last three entries of ``Watched_list`` the 15 highest-scoring rows of
    ``cosin_sim_final`` (after the top-ranked row, which is normally the
    movie itself) are collected, every title in ``Watched_list`` is dropped,
    and up to 15 of the remaining titles are returned in random order.

    Args:
        Watched_list: Titles the user has watched, oldest first. Titles must
            exist in ``df_cbf2_indices``.

    Returns:
        A list of at most 15 titles (at most 10 in the fallback case).

    Raises:
        KeyError: If one of the last three watched titles is not a key of
            ``df_cbf2_indices``.
    """
    if len(Watched_list) < 3:
        # Not enough history for content-based matching: fall back to the
        # popularity table. head() avoids a KeyError on a short table.
        return df_popular_WR_Q['title'].head(10).tolist()

    watched = set(Watched_list)
    candidates = set()
    for title in Watched_list[-3:]:
        # NOTE(review): assumes titles are unique in df_cbf2; a duplicated
        # title makes this lookup return several rows instead of one label.
        row = df_cbf2_indices[title]
        ranked = sorted(
            enumerate(cosin_sim_final[row]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        # Skip rank 0 (the best match) and keep the next 15 rows.
        for movie_idx, _score in ranked[1:16]:
            candidates.add(df_cbf2.loc[movie_idx, 'title'])

    # Filtering by membership replaces list.remove(), which raised ValueError
    # whenever a watched title was duplicated in Watched_list or was simply
    # not among the candidates.
    recommended_list = [title for title in candidates if title not in watched]
    random.shuffle(recommended_list)
    return recommended_list[:15]
|
|
|
|
|
|
|
|
|
def recommender_svd(watch_list):
    """Recommend up to 15 titles for a new user with an SVD model.

    The watched titles are added to the ratings as 5-star ratings by a new
    user id, an ``SVD`` model is fitted on the combined ratings, and every
    movie in ``ratings`` that the new user has not watched is scored.

    Args:
        watch_list: Titles the user has watched. Titles that are not present
            in ``movie_md`` are ignored; duplicates count once.

    Returns:
        Up to 15 movie titles, highest estimated rating first. An empty list
        is returned when ``ratings`` has no rows.
    """
    if ratings.empty:
        return []

    # A fresh user id derived from the data instead of a hard-coded row label.
    new_user_id = int(ratings['userId'].max()) + 1

    # Build both lookups once; setdefault keeps the first row per key, which
    # matches a first-match lookup on the table.
    id_by_title = {}
    title_by_id = {}
    for raw_id, title in zip(movie_md['id'], movie_md['title']):
        movie_id = int(raw_id)
        id_by_title.setdefault(title, movie_id)
        title_by_id.setdefault(movie_id, title)

    watched_ids = {
        id_by_title[title] for title in watch_list if title in id_by_title
    }

    training = ratings[['userId', 'movieId', 'rating']]
    if watched_ids:
        new_rows = pd.DataFrame({
            'userId': new_user_id,
            'movieId': sorted(watched_ids),
            'rating': 5.0,
        })
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        training = pd.concat([training, new_rows], ignore_index=True)

    # Only rating_scale matters when the data comes from a DataFrame.
    reader = Reader(rating_scale=(0, 5))

    # Fit on the combined frame so the new user's ratings are part of the
    # model; fitting on ``ratings`` alone left that user unknown to it.
    data = Dataset.load_from_df(training, reader=reader)
    svd = SVD()
    svd.fit(data.build_full_trainset())

    # Candidates in ascending id order, so equal estimates keep a stable order.
    candidate_ids = sorted(set(ratings['movieId'].unique().tolist()) - watched_ids)

    scored = []
    for item_id in candidate_ids:
        title = title_by_id.get(int(item_id))
        if title is None:
            # No metadata row for this id, so there is no title to show.
            continue
        scored.append((title, svd.predict(new_user_id, item_id).est))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [title for title, _estimate in scored[:15]]
|
|
|
|