import itertools
import sys

import numpy as np
import pandas as pd
import requests
import yaml
from tqdm import tqdm

import oasis
from arango import ArangoClient

import torch
import torch.nn.functional as F
from torch.nn import Linear

import torch_geometric.transforms as T
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected

from sentence_transformers import SentenceTransformer

def node_mappings(path, index_col):
    """Map each unique value of `index_col` in the CSV at `path` to a consecutive integer id."""
    df = pd.read_csv(path, index_col=index_col)
    mapping = {index: i for i, index in enumerate(df.index.unique())}

    return mapping

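# Usage sketch (illustrative values): applied to ratings_small.csv below, this maps every
# original id in the chosen column to a consecutive integer index, e.g.
#
#   user_mapping = node_mappings('./sampled_movie_dataset/ratings_small.csv', index_col='userId')
#   # -> {1: 0, 2: 1, 3: 2, ...}
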
def convert_int(x):
    """Cast to int, returning NaN for values that cannot be parsed (e.g. missing tmdbIds)."""
    try:
        return int(x)
    except (TypeError, ValueError):
        return np.nan

def remove_movies(m_id, id_map):
    """Return the movie ids in `m_id` that have no metadata entry in `id_map`."""
    no_metadata = []
    for idx in range(len(m_id)):
        tmdb_id = id_map.loc[id_map['movieId'] == m_id[idx]]

        if tmdb_id.size == 0:
            no_metadata.append(m_id[idx])

    return no_metadata

def populate_user_collection(movie_rec_db, user_mapping, total_users):
    """Bulk-insert one vertex per user into the `Users` collection."""
    batch = []
    BATCH_SIZE = 50
    batch_idx = 1
    index = 0
    user_ids = list(user_mapping.keys())
    user_collection = movie_rec_db["Users"]
    for idx in tqdm(range(total_users)):
        insert_doc = {}

        insert_doc["_id"] = "Users/" + str(user_mapping[user_ids[idx]])
        insert_doc["original_id"] = str(user_ids[idx])

        batch.append(insert_doc)
        index += 1
        last_record = (idx == (total_users - 1))
        if index % BATCH_SIZE == 0:
            batch_idx += 1
            user_collection.import_bulk(batch)
            batch = []
        if last_record and len(batch) > 0:
            print("Inserting the last batch!")
            user_collection.import_bulk(batch)

def create_ratings_graph(movie_rec_db, user_id, movie_id, ratings,
                         user_mapping, movie_mappings, no_metadata):
    """Bulk-insert `Ratings` edges from `Users` vertices to `Movie` vertices."""
    batch = []
    BATCH_SIZE = 100
    batch_idx = 1
    index = 0
    edge_collection = movie_rec_db["Ratings"]
    for idx in tqdm(range(ratings.shape[0])):

        # Skip ratings whose movie has no metadata (those movies were never loaded).
        if movie_id[idx] in no_metadata:
            print('Removing edges with no metadata', movie_id[idx])
        else:
            insert_doc = {"_id": "Ratings/user-" + str(user_mapping[user_id[idx]])
                                 + "-r-movie-" + str(movie_mappings[movie_id[idx]]),
                          "_from": "Users/" + str(user_mapping[user_id[idx]]),
                          "_to": "Movie/" + str(movie_mappings[movie_id[idx]]),
                          "_rating": float(ratings[idx])}

            batch.append(insert_doc)
            index += 1

            if index % BATCH_SIZE == 0:
                batch_idx += 1
                edge_collection.import_bulk(batch)
                batch = []

    # Flush whatever is left once the loop is done (this also covers the case where the
    # final rating was skipped for missing metadata).
    if len(batch) > 0:
        print("Inserting the last batch!")
        edge_collection.import_bulk(batch)

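# For reference, one edge document produced above looks like this (ids illustrative):
#
#   {"_id": "Ratings/user-0-r-movie-5",
#    "_from": "Users/0",
#    "_to": "Movie/5",
#    "_rating": 3.5}
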
def make_graph():
    """Read the sampled MovieLens/TMDB CSVs and build the mappings needed for loading."""
    metadata_path = './sampled_movie_dataset/movies_metadata.csv'
    df = pd.read_csv(metadata_path)
    df = df.drop([19730, 29503, 35587])  # rows whose 'id' cannot be cast to int
    df['id'] = df['id'].astype('int')

    links_small = pd.read_csv('./sampled_movie_dataset/links_small.csv')
    links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

    sampled_md = df[df['id'].isin(links_small)].copy()
    sampled_md['tagline'] = sampled_md['tagline'].fillna('')
    sampled_md['description'] = sampled_md['overview'] + sampled_md['tagline']
    sampled_md['description'] = sampled_md['description'].fillna('')
    sampled_md = sampled_md.reset_index()

    indices = pd.Series(sampled_md.index, index=sampled_md['title'])
    ind_gen = pd.Series(sampled_md.index, index=sampled_md['genres'])

    ratings_path = './sampled_movie_dataset/ratings_small.csv'
    ratings_df = pd.read_csv(ratings_path)
    m_id = ratings_df['movieId'].tolist()
    m_id = list(dict.fromkeys(m_id))  # de-duplicate while preserving order

    user_mapping = node_mappings(ratings_path, index_col='userId')
    movie_mapping = node_mappings(ratings_path, index_col='movieId')  # superseded by movie_mappings below

    id_map = pd.read_csv('./sampled_movie_dataset/links_small.csv')[['movieId', 'tmdbId']]
    id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
    id_map.columns = ['movieId', 'id']
    id_map = id_map.merge(sampled_md[['title', 'id']], on='id').set_index('title')
    indices_map = id_map.set_index('id')

    no_metadata = remove_movies(m_id, id_map)

    for element in no_metadata:
        if element in m_id:
            print("ids with no metadata information:", element)
            m_id.remove(element)

    # Re-map the remaining movie ids to consecutive integers.
    movie_mappings = {}
    for idx, m in enumerate(m_id):
        movie_mappings[m] = idx

    # Return everything the ArangoDB loading step needs, so no module-level globals are required.
    return {
        "sampled_md": sampled_md,
        "ratings_df": ratings_df,
        "m_id": m_id,
        "id_map": id_map,
        "user_mapping": user_mapping,
        "movie_mappings": movie_mappings,
        "no_metadata": no_metadata,
    }

def login_ArangoDB():
    """Request temporary tutorial credentials from ArangoDB Oasis and print them."""
    login = oasis.getTempCredentials(
        tutorialName="MovieRecommendations",
        credentialProvider="https://tutorials.arangodb.cloud:8529/_db/_system/tutorialDB/tutorialDB")

    print("https://" + login["hostname"] + ":" + str(login["port"]))
    print("Username: " + login["username"])
    print("Password: " + login["password"])
    print("Database: " + login["dbName"])
    return login

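# The returned credentials also work with python-arango's ArangoClient directly, if you
# prefer not to go through oasis.connect_python_arango (a sketch, not used below):
#
#   client = ArangoClient(hosts="https://" + login["hostname"] + ":" + str(login["port"]))
#   db = client.db(login["dbName"], username=login["username"], password=login["password"])
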
def load_data_to_ArangoDB(login, graph_data):
    """Create the Movie/Users vertex collections, the Ratings edge collection and the
    movie_rating_graph, then bulk-load the data prepared by make_graph()."""
    sampled_md = graph_data["sampled_md"]
    ratings_df = graph_data["ratings_df"]
    m_id = graph_data["m_id"]
    id_map = graph_data["id_map"]
    user_mapping = graph_data["user_mapping"]
    movie_mappings = graph_data["movie_mappings"]
    no_metadata = graph_data["no_metadata"]

    movie_rec_db = oasis.connect_python_arango(login)
    if not movie_rec_db.has_collection("Movie"):
        movie_rec_db.create_collection("Movie", replication_factor=3)

    batch = []
    BATCH_SIZE = 128
    batch_idx = 1
    index = 0
    movie_collection = movie_rec_db["Movie"]

    for idx in tqdm(range(len(m_id))):
        insert_doc = {}
        tmdb_id = id_map.loc[id_map['movieId'] == m_id[idx]]

        if tmdb_id.size == 0:
            print('No metadata information at:', m_id[idx])
        else:
            tmdb_id = int(tmdb_id.iloc[0]['id'])
            emb_id = "Movie/" + str(movie_mappings[m_id[idx]])
            insert_doc["_id"] = emb_id
            m_meta = sampled_md.loc[sampled_md['id'] == tmdb_id]

            m_title = m_meta.iloc[0]['title']
            m_poster = m_meta.iloc[0]['poster_path']
            m_description = m_meta.iloc[0]['description']
            m_language = m_meta.iloc[0]['original_language']
            m_genre = m_meta.iloc[0]['genres']
            # `genres` is stored as a stringified list of dicts; BaseLoader parses it
            # into plain lists/dicts of strings.
            m_genre = yaml.load(m_genre, Loader=yaml.BaseLoader)
            genres = [g['name'] for g in m_genre]

            insert_doc["movieId"] = m_id[idx]
            insert_doc["mapped_movieId"] = movie_mappings[m_id[idx]]
            insert_doc["tmdbId"] = tmdb_id
            insert_doc['movie_title'] = m_title

            insert_doc['description'] = m_description
            insert_doc['genres'] = genres
            insert_doc['language'] = m_language

            if str(m_poster) == "nan":
                insert_doc['poster_path'] = "No poster path available"
            else:
                insert_doc['poster_path'] = m_poster

            batch.append(insert_doc)
            index += 1
            if index % BATCH_SIZE == 0:
                batch_idx += 1
                movie_collection.import_bulk(batch)
                batch = []

    # Flush the remaining partial batch of movie documents.
    if len(batch) > 0:
        print("Inserting the last batch!")
        movie_collection.import_bulk(batch)

    if not movie_rec_db.has_collection("Users"):
        movie_rec_db.create_collection("Users", replication_factor=3)

    total_users = np.unique(ratings_df[['userId']].values.flatten()).shape[0]
    print("Total number of Users:", total_users)
    populate_user_collection(movie_rec_db, user_mapping, total_users)

    if not movie_rec_db.has_collection("Ratings"):
        movie_rec_db.create_collection("Ratings", edge=True, replication_factor=3)

    if not movie_rec_db.has_graph("movie_rating_graph"):
        movie_rec_db.create_graph('movie_rating_graph', smart=True)

    movie_rating_graph = movie_rec_db.graph("movie_rating_graph")

    if not movie_rating_graph.has_vertex_collection("Users"):
        movie_rating_graph.create_vertex_collection("Users")

    if not movie_rating_graph.has_vertex_collection("Movie"):
        movie_rating_graph.create_vertex_collection("Movie")

    if not movie_rating_graph.has_edge_definition("Ratings"):
        movie_rating_graph.create_edge_definition(
            edge_collection='Ratings',
            from_vertex_collections=['Users'],
            to_vertex_collections=['Movie']
        )

    user_id = ratings_df[['userId']].values.flatten()
    movie_id = ratings_df[['movieId']].values.flatten()
    ratings = ratings_df[['rating']].values.flatten()
    create_ratings_graph(movie_rec_db, user_id, movie_id, ratings,
                         user_mapping, movie_mappings, no_metadata)

    return movie_rec_db

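# Minimal end-to-end driver: a sketch that assumes the ./sampled_movie_dataset CSV files
# are present locally and that the Oasis tutorial service is reachable.
if __name__ == "__main__":
    graph_data = make_graph()
    login = login_ArangoDB()
    movie_rec_db = load_data_to_ArangoDB(login, graph_data)

    # Quick sanity check on what was loaded.
    print("Movies loaded: ", movie_rec_db.collection("Movie").count())
    print("Users loaded:  ", movie_rec_db.collection("Users").count())
    print("Ratings loaded:", movie_rec_db.collection("Ratings").count())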