Update logic.py
Browse files
logic.py
CHANGED
@@ -50,6 +50,63 @@ def remove_movies(m_id):
|
|
50 |
#print('No Meta data information at:', m_id[idx])
|
51 |
return no_metadata
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
#-------------------------------------------------------------------------------------------
|
55 |
|
@@ -98,11 +155,120 @@ def make_graph():
|
|
98 |
for idx, m in enumerate(m_id):
|
99 |
movie_mappings[m] = idx
|
100 |
|
|
|
|
|
101 |
return movie_mappings
|
102 |
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
-
def load_data_to_ArangoDB():
|
106 |
|
107 |
|
108 |
|
|
|
50 |
#print('No Meta data information at:', m_id[idx])
|
51 |
return no_metadata
|
52 |
|
53 |
+
def populate_user_collection(total_users):
    """Bulk-insert the first ``total_users`` users into the "Users" collection.

    One document is created per user with:
      * ``_id``          -- "Users/<mapped id>", the mapped id taken from
                            ``user_mapping``
      * ``original_id``  -- the ``user_mapping`` key, stored as a string

    Documents are sent with ``import_bulk`` in chunks of ``BATCH_SIZE``;
    whatever is left over after the loop is sent as one final, smaller batch.

    Args:
        total_users: number of entries of ``user_mapping`` (in key order) to
            insert.  An ``IndexError`` is raised if it exceeds the number of
            keys in ``user_mapping``.

    NOTE(review): this function reads ``user_mapping``, ``movie_rec_db`` and
    ``tqdm`` from module scope.  In the visible part of this file
    ``movie_rec_db`` is only bound as a local inside
    ``load_data_to_ArangoDB`` -- confirm a module-level handle exists,
    otherwise this raises ``NameError``.
    """
    BATCH_SIZE = 50
    user_collection = movie_rec_db["Users"]
    user_ids = list(user_mapping.keys())

    batch = []
    for idx in tqdm(range(total_users)):
        original_id = user_ids[idx]
        batch.append({
            "_id": "Users/" + str(user_mapping[original_id]),
            "original_id": str(original_id),
        })

        # A document is appended on every iteration, so the batch is full
        # exactly every BATCH_SIZE records.
        if len(batch) == BATCH_SIZE:
            user_collection.import_bulk(batch)
            batch = []

    # Send the trailing partial batch, if any.
    if batch:
        print("Inserting batch the last batch!")
        user_collection.import_bulk(batch)
|
77 |
+
|
78 |
+
def create_ratings_graph(user_id, movie_id, ratings):
    """Bulk-insert user -> movie rating edges into the "Ratings" collection.

    The three arguments are parallel sequences: position ``i`` describes one
    rating given by ``user_id[i]`` to ``movie_id[i]`` with value
    ``ratings[i]``.  Rows whose movie id is listed in ``no_metadata`` are
    reported and skipped.  Every remaining row becomes an edge document with:
      * ``_id``     -- "Ratings/user-<mapped user>-r-movie-<mapped movie>"
      * ``_from``   -- "Users/<mapped user>"
      * ``_to``     -- "Movie/<mapped movie>"
      * ``_rating`` -- the rating converted to ``float``

    Edges are sent with ``import_bulk`` in chunks of ``BATCH_SIZE``.  The
    leftover partial batch is flushed after the loop, so it is imported even
    when the final rows of the input are skipped for missing metadata
    (previously the flush depended on the last row not being skipped).

    Args:
        user_id: raw user ids, indexable by position.
        movie_id: raw movie ids, indexable by position.
        ratings: rating values; must expose ``.shape[0]`` (the row count).

    NOTE(review): reads ``user_mapping``, ``movie_mappings``, ``no_metadata``,
    ``movie_rec_db`` and ``tqdm`` from module scope.  In the visible part of
    this file ``movie_rec_db`` is only bound as a local inside
    ``load_data_to_ArangoDB`` -- confirm a module-level handle exists.
    """
    BATCH_SIZE = 100
    edge_collection = movie_rec_db["Ratings"]

    batch = []
    for idx in tqdm(range(ratings.shape[0])):
        raw_movie_id = movie_id[idx]

        # removing edges (movies) with no metadata
        if raw_movie_id in no_metadata:
            print('Removing edges with no metadata', raw_movie_id)
            continue

        user_key = str(user_mapping[user_id[idx]])
        movie_key = str(movie_mappings[raw_movie_id])
        batch.append({
            "_id": "Ratings" + "/" + 'user-' + user_key + "-r-" + "movie-" + movie_key,
            "_from": "Users" + "/" + user_key,
            "_to": "Movie" + "/" + movie_key,
            "_rating": float(ratings[idx]),
        })

        if len(batch) == BATCH_SIZE:
            edge_collection.import_bulk(batch)
            batch = []

    # Flush whatever is left, regardless of whether the last row was skipped.
    if batch:
        print("Inserting batch the last batch!")
        edge_collection.import_bulk(batch)
|
109 |
+
|
110 |
|
111 |
#-------------------------------------------------------------------------------------------
|
112 |
|
|
|
155 |
for idx, m in enumerate(m_id):
|
156 |
movie_mappings[m] = idx
|
157 |
|
158 |
+
|
159 |
+
|
160 |
return movie_mappings
|
161 |
|
162 |
|
163 |
+
def login_ArangoDB():
    """Request temporary ArangoDB cloud credentials and print them.

    Calls ``oasis.getTempCredentials`` for the "MovieRecommendations"
    tutorial, prints the Web UI URL followed by the username, password and
    database name, and returns the credentials mapping unchanged.

    Returns:
        The object returned by ``oasis.getTempCredentials``; this function
        reads its "hostname", "port", "username", "password" and "dbName"
        entries.
    """
    # get temporary credentials for ArangoDB on cloud
    credential_provider = "https://tutorials.arangodb.cloud:8529/_db/_system/tutorialDB/tutorialDB"
    login = oasis.getTempCredentials(
        tutorialName="MovieRecommendations",
        credentialProvider=credential_provider,
    )

    # url to access the ArangoDB Web UI
    web_ui_url = "https://" + login["hostname"] + ":" + str(login["port"])
    print(web_ui_url)

    labelled_fields = (
        ("Username: ", "username"),
        ("Password: ", "password"),
        ("Database: ", "dbName"),
    )
    for label, field in labelled_fields:
        print(label + login[field])

    return login
|
172 |
+
|
173 |
+
def load_data_to_ArangoDB(login):
    """Load movies, users and ratings into ArangoDB and return the database.

    Steps, in order:
      1. Connect with ``oasis.connect_python_arango(login)``.
      2. Ensure the "Movie" collection exists and bulk-insert one document
         per entry of ``m_id`` that has a row in ``id_map``; entries without
         one are reported and skipped.
      3. Ensure the "Users" collection exists and fill it through
         ``populate_user_collection``.
      4. Ensure the "Ratings" edge collection, the "movie_rating_graph"
         graph and its Users -> Movie "Ratings" edge definition exist.
      5. Insert the rating edges through ``create_ratings_graph``.

    Args:
        login: credentials object accepted by ``oasis.connect_python_arango``
            (e.g. the value returned by ``login_ArangoDB``).

    Returns:
        The database handle returned by ``oasis.connect_python_arango``.

    Reads ``m_id``, ``id_map``, ``sampled_md``, ``movie_mappings``,
    ``ratings_df``, ``np``, ``yaml``, ``tqdm`` and ``oasis`` from module scope.

    NOTE(review): ``movie_rec_db`` is a local of this function, while
    ``populate_user_collection`` and ``create_ratings_graph`` look the same
    name up at module scope -- confirm a module-level handle is bound before
    this runs, otherwise steps 3 and 5 raise ``NameError``.
    """
    movie_rec_db = oasis.connect_python_arango(login)
    if not movie_rec_db.has_collection("Movie"):
        movie_rec_db.create_collection("Movie", replication_factor=3)

    BATCH_SIZE = 128
    movie_collection = movie_rec_db["Movie"]

    # loading movies metadata information into ArangoDB's Movie collection
    batch = []
    for movie in tqdm(m_id):
        id_rows = id_map.loc[id_map['movieId'] == movie]
        if id_rows.size == 0:
            print('No Meta data information at:', movie)
            continue

        # First matching row, second column, both by position.  The previous
        # `iloc[:, 1][0]` indexed the resulting Series with `[0]`, which is a
        # label lookup on an integer index and only found the value when the
        # matched row happened to carry index label 0.
        tmdb_id = int(id_rows.iloc[0, 1])
        mapped_id = movie_mappings[movie]

        # adding movie metadata information
        meta = sampled_md.loc[sampled_md['id'] == tmdb_id].iloc[0]
        # 'genres' is stored as text describing a list of mappings; BaseLoader
        # parses it without constructing arbitrary Python objects.
        genre_entries = yaml.load(meta['genres'], Loader=yaml.BaseLoader)
        poster = meta['poster_path']

        batch.append({
            "_id": "Movie/" + str(mapped_id),
            "movieId": movie,
            "mapped_movieId": mapped_id,
            "tmdbId": tmdb_id,
            "movie_title": meta['title'],
            "description": meta['description'],
            "genres": [g['name'] for g in genre_entries],
            "language": meta['original_language'],
            # a missing poster shows up as NaN, whose str() is "nan"
            "poster_path": "No poster path available" if str(poster) == "nan" else poster,
        })

        if len(batch) == BATCH_SIZE:
            movie_collection.import_bulk(batch)
            batch = []

    # Flush the trailing partial batch after the loop so it is not lost when
    # the last entries of m_id have no metadata.
    if batch:
        print("Inserting batch the last batch!")
        movie_collection.import_bulk(batch)

    if not movie_rec_db.has_collection("Users"):
        movie_rec_db.create_collection("Users", replication_factor=3)

    total_users = np.unique(ratings_df[['userId']].values.flatten()).shape[0]
    print("Total number of Users:", total_users)
    populate_user_collection(total_users)

    # Create the "Ratings" edge collection if it does not exist.
    if not movie_rec_db.has_collection("Ratings"):
        movie_rec_db.create_collection("Ratings", edge=True, replication_factor=3)

    if not movie_rec_db.has_graph("movie_rating_graph"):
        movie_rec_db.create_graph('movie_rating_graph', smart=True)

    # This returns an API wrapper for the graph created above.
    movie_rating_graph = movie_rec_db.graph("movie_rating_graph")

    # NOTE(review): in python-arango `vertex_collection()` appears to only
    # return an API wrapper and not register the collection with the graph
    # (`create_vertex_collection()` does that).  The calls are kept as they
    # were; the edge definition below names both collections -- confirm that
    # is what attaches them to the graph.
    if not movie_rating_graph.has_vertex_collection("Users"):
        movie_rating_graph.vertex_collection("Users")

    if not movie_rating_graph.has_vertex_collection("Movie"):
        movie_rating_graph.vertex_collection("Movie")

    if not movie_rating_graph.has_edge_definition("Ratings"):
        movie_rating_graph.create_edge_definition(
            edge_collection='Ratings',
            from_vertex_collections=['Users'],
            to_vertex_collections=['Movie'],
        )

    user_id = ratings_df[['userId']].values.flatten()
    movie_id = ratings_df[['movieId']].values.flatten()
    ratings = ratings_df[['rating']].values.flatten()
    create_ratings_graph(user_id, movie_id, ratings)

    return movie_rec_db
|
271 |
|
|
|
272 |
|
273 |
|
274 |
|