Polo123 committed on
Commit
97de7f8
·
verified ·
1 Parent(s): 9a53806

Create logic.py

Browse files
Files changed (1) hide show
  1. logic.py +110 -0
logic.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from arango import ArangoClient
3
+ from tqdm import tqdm
4
+ import numpy as np
5
+ import itertools
6
+ import requests
7
+ import sys
8
+ import oasis
9
+ from arango import ArangoClient
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch.nn import Linear
14
+ from arango import ArangoClient
15
+ import torch_geometric.transforms as T
16
+ from torch_geometric.nn import SAGEConv, to_hetero
17
+ from torch_geometric.transforms import RandomLinkSplit, ToUndirected
18
+ from sentence_transformers import SentenceTransformer
19
+ from torch_geometric.data import HeteroData
20
+ import yaml
21
+
22
+ #-------------------------------------------------------------------------------------------
23
+ # Functions
24
+
25
+ # performs user and movie mappings
26
def node_mappings(path, index_col):
    """Number the distinct ids found in one column of a CSV file.

    The file at ``path`` is loaded with ``index_col`` as its index, and
    each distinct index value is paired with a consecutive integer
    starting at 0, in the order ``Index.unique()`` yields them.

    Args:
        path: CSV file to read.
        index_col: Name of the column whose values are to be numbered.

    Returns:
        dict mapping each distinct id to its zero-based position.
    """
    frame = pd.read_csv(path, index_col=index_col)
    distinct_ids = frame.index.unique()
    return dict(zip(distinct_ids, range(len(distinct_ids))))
31
+
32
+
33
def convert_int(x):
    """Convert ``x`` to ``int``, or return ``np.nan`` if it cannot be converted.

    Args:
        x: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        ``int(x)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Only the failures int() itself produces are treated as "not a number":
    # TypeError (e.g. None), ValueError (e.g. "abc", NaN) and OverflowError
    # (infinity). A bare ``except`` would also swallow KeyboardInterrupt and
    # SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan
38
+
39
+
40
def remove_movies(m_id, id_map=None):
    """Return the movie ids in ``m_id`` that have no metadata entry.

    Despite the name, nothing is removed here: the function only reports
    which ids are missing so that the caller can drop them.

    Args:
        m_id: Sequence of movie ids to check.
        id_map: DataFrame with a ``movieId`` column listing the ids that do
            have metadata. When omitted, a module-level ``id_map`` is looked
            up, which is what this function originally relied on.

    Returns:
        list of the ids from ``m_id`` (original order, duplicates kept) that
        do not occur in ``id_map['movieId']``.

    Raises:
        NameError: If ``id_map`` is not passed and no module-level
            ``id_map`` exists.
    """
    if id_map is None:
        # Backward-compatible fallback: earlier versions read ``id_map`` as a
        # global. Passing it explicitly is preferred, because a frame built
        # inside another function is not visible from here.
        id_map = globals().get('id_map')
        if id_map is None:
            raise NameError(
                "id_map is not defined; pass it as remove_movies(m_id, id_map)"
            )

    # Build the lookup once instead of scanning the whole frame for every id.
    known_ids = set(id_map['movieId'].tolist())
    return [movie for movie in m_id if movie not in known_ids]
52
+
53
+
54
+ #-------------------------------------------------------------------------------------------
55
+
56
def make_graph(data_dir='./sampled_movie_dataset'):
    """Build the ``movieId`` -> node-index mapping for rated movies with metadata.

    Reads ``movies_metadata.csv``, ``links_small.csv`` and
    ``ratings_small.csv`` from ``data_dir``. A rated movie is kept when its
    ``movieId`` can be linked, through ``tmdbId``, to a row of the metadata
    file; the kept movies are then numbered consecutively.

    Args:
        data_dir: Directory containing the three CSV files. Defaults to the
            location that used to be hard-coded.

    Returns:
        dict mapping each kept ``movieId`` to a zero-based index, in the
        order the ids first appear in the ratings file.
    """
    from pathlib import Path

    data_dir = Path(data_dir)

    df = pd.read_csv(data_dir / 'movies_metadata.csv')
    # NOTE(review): these rows are dropped by label; presumably they are the
    # rows whose ``id`` is not an integer (the cast below would fail on
    # them). Confirm against the dataset.
    df = df.drop([19730, 29503, 35587])
    df['id'] = df['id'].astype('int')

    # The links file is needed twice below; read it once.
    links = pd.read_csv(data_dir / 'links_small.csv')
    # tmdbId values of the links file, restricted to rows that have one
    links_small = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of ``df`` (SettingWithCopyWarning).
    sampled_md = df[df['id'].isin(links_small)].copy()
    sampled_md['tagline'] = sampled_md['tagline'].fillna('')
    sampled_md['description'] = (
        sampled_md['overview'] + sampled_md['tagline']
    ).fillna('')
    sampled_md = sampled_md.reset_index()

    ratings_df = pd.read_csv(data_dir / 'ratings_small.csv')
    # distinct rated movie ids, first-appearance order preserved
    m_id = list(dict.fromkeys(ratings_df['movieId'].tolist()))

    id_map = links[['movieId', 'tmdbId']].copy()
    id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
    id_map.columns = ['movieId', 'id']
    # tmdbId (from links_small) is the same identifier as ``id`` in sampled_md
    id_map = id_map.merge(sampled_md[['title', 'id']], on='id').set_index('title')

    # Filter inline against the local ``id_map``: a helper that reads
    # ``id_map`` as a module global cannot see this local frame.
    known_ids = set(id_map['movieId'].tolist())
    kept_ids = []
    for movie in m_id:
        if movie in known_ids:
            kept_ids.append(movie)
        else:
            print("ids with no metadata information:", movie)

    # number only the movies that have metadata information
    return {movie: idx for idx, movie in enumerate(kept_ids)}
102
+
103
+
104
+
105
def load_data_to_ArangoDB():
    """Placeholder; not implemented yet.

    Going by its name this is meant to load the dataset into ArangoDB, but
    no implementation exists. The definition previously had no body at all,
    which is a syntax error that prevents the whole module from being
    imported.

    Raises:
        NotImplementedError: Always, so that a call fails loudly instead of
            silently doing nothing.
    """
    raise NotImplementedError("load_data_to_ArangoDB is not implemented yet")