|
import itertools
import sys
from pathlib import Path

import numpy as np
import oasis
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
import yaml
from arango import ArangoClient
from sentence_transformers import SentenceTransformer
from torch.nn import Linear
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
|
|
def node_mappings(path, index_col):
    """Map each distinct value of one CSV column to a 0-based integer.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        index_col: Column to load as the frame's index; its distinct values
            become the keys of the returned mapping.

    Returns:
        dict: ``{value: position}`` where ``position`` counts the distinct
        values in the order ``Index.unique()`` yields them.
    """
    frame = pd.read_csv(path, index_col=index_col)
    distinct_values = frame.index.unique()
    return {value: position for position, value in enumerate(distinct_values)}
|
|
|
|
|
def convert_int(x):
    """Convert ``x`` to ``int``, or return ``np.nan`` if it cannot be converted.

    Args:
        x: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Only the failures int() raises for bad values are treated as "missing";
    # a bare ``except`` would also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan
|
|
|
|
|
def remove_movies(m_id, id_map=None):
    """Return the ids from ``m_id`` that have no row in ``id_map``.

    Despite its name, this function removes nothing: it only reports the ids
    whose value never appears in the ``movieId`` column of ``id_map``.

    Args:
        m_id: Movie ids to check.
        id_map: DataFrame with a ``movieId`` column. When omitted, a
            module-level ``id_map`` is looked up instead, which is how the
            one-argument form of this function has always resolved it.

    Returns:
        list: The unmatched ids in input order; an id that occurs several
        times in ``m_id`` is reported once per occurrence.

    Raises:
        NameError: If ``id_map`` is omitted and no module-level ``id_map``
            exists.
    """
    if id_map is None:
        try:
            id_map = globals()["id_map"]
        except KeyError:
            raise NameError(
                "name 'id_map' is not defined; pass it as the second argument"
            ) from None

    # Build the lookup once instead of filtering the DataFrame for every id.
    known_ids = set(id_map["movieId"].tolist())
    return [movie_id for movie_id in m_id if movie_id not in known_ids]
|
|
|
|
|
|
|
|
|
def make_graph(data_dir='./sampled_movie_dataset'):
    """Index the rated movies that have metadata with contiguous integers.

    Reads ``movies_metadata.csv``, ``links_small.csv`` and
    ``ratings_small.csv`` from ``data_dir``. A rated ``movieId`` is kept when
    ``links_small.csv`` maps it to a ``tmdbId`` that also occurs as ``id`` in
    ``movies_metadata.csv``. Every other rated id is printed and skipped.

    Args:
        data_dir: Directory containing the three CSV files. Defaults to the
            location that used to be hard-coded.

    Returns:
        dict: ``{movieId: position}`` for the kept ids, numbered from 0 in
        the order the ids first appear in ``ratings_small.csv``.
    """
    data_dir = Path(data_dir)

    metadata = pd.read_csv(data_dir / 'movies_metadata.csv')
    # These row labels have always been dropped before the ``id`` cast;
    # presumably they are malformed rows of the stock file -- TODO confirm.
    # ``errors='ignore'`` keeps files that lack those labels loadable.
    metadata = metadata.drop([19730, 29503, 35587], errors='ignore')
    # Discard any remaining row whose id is not numeric rather than letting
    # the integer cast fail on it.
    metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')
    metadata = metadata.dropna(subset=['id'])
    metadata['id'] = metadata['id'].astype('int')

    # links_small.csv is read once; rows without a tmdbId can never match a
    # metadata id, so they are dropped before the integer cast.
    links = pd.read_csv(data_dir / 'links_small.csv')
    links = links[links['tmdbId'].notnull()]
    id_map = pd.DataFrame({
        'movieId': links['movieId'],
        'id': links['tmdbId'].astype('int'),
    })

    # .copy() makes the column assignments below act on an independent frame
    # instead of a slice of ``metadata``.
    sampled_md = metadata[metadata['id'].isin(id_map['id'])].copy()
    # The description column is prepared here but is not used by the mapping
    # returned from this function.
    sampled_md['tagline'] = sampled_md['tagline'].fillna('')
    sampled_md['description'] = (
        sampled_md['overview'] + sampled_md['tagline']
    ).fillna('')
    sampled_md = sampled_md.reset_index()

    # After the merge, id_map only holds movieIds whose metadata row exists.
    id_map = id_map.merge(sampled_md[['title', 'id']], on='id')
    ids_with_metadata = set(id_map['movieId'].tolist())

    ratings_df = pd.read_csv(data_dir / 'ratings_small.csv')
    movie_mappings = {}
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    for movie_id in dict.fromkeys(ratings_df['movieId'].tolist()):
        if movie_id in ids_with_metadata:
            movie_mappings[movie_id] = len(movie_mappings)
        else:
            print("ids with no metadata information:", movie_id)

    return movie_mappings
|
|
|
|
|
|
|
def load_data_to_ArangoDB(): |
|
|
|
|
|
|
|
|
|
|
|
|