Spaces:

Polo123
/

PyG-ArangoDB-Movie-Recommendation

Sleeping

App Files Files Community

Polo123 committed on Mar 15, 2024

Commit

64e5385

verified ·

1 Parent(s): 97de7f8

Update logic.py

Browse files

Files changed (1) hide show

logic.py +167 -1

logic.py CHANGED Viewed

@@ -50,6 +50,63 @@ def remove_movies(m_id):
             #print('No Meta data information at:', m_id[idx])
     return no_metadata
 #-------------------------------------------------------------------------------------------
@@ -98,11 +155,120 @@ def make_graph():
     for idx, m in enumerate(m_id):
         movie_mappings[m] = idx
     return movie_mappings
-def load_data_to_ArangoDB():

             #print('No Meta data information at:', m_id[idx])
     return no_metadata
def populate_user_collection(total_users):
    """Bulk-insert one document per user into the ``Users`` collection.

    Reads the module-level ``user_mapping`` and ``movie_rec_db``. For each
    of the first ``total_users`` keys of ``user_mapping`` a document is
    built whose ``_id`` is ``"Users/<mapped id>"`` and whose ``original_id``
    is the key rendered as a string. Documents are sent with
    ``import_bulk`` in chunks of 50; a final partial chunk is sent once the
    last user has been processed.

    Args:
        total_users: Number of keys of ``user_mapping`` to insert.
    """
    CHUNK_SIZE = 50
    original_ids = list(user_mapping.keys())
    users = movie_rec_db["Users"]
    final_position = total_users - 1
    pending = []
    for position in tqdm(range(total_users)):
        original = original_ids[position]
        pending.append({
            "_id": "Users/" + str(user_mapping[original]),
            "original_id": str(original),
        })
        # ``pending`` is emptied on every flush, so reaching CHUNK_SIZE is
        # the same as the running document count being a multiple of it.
        if len(pending) == CHUNK_SIZE:
            users.import_bulk(pending)
            pending = []
        # Send whatever is left once the last user has been queued.
        if position == final_position and pending:
            print("Inserting batch the last batch!")
            users.import_bulk(pending)
def create_ratings_graph(user_id, movie_id, ratings):
    """Bulk-insert rating edges (user -> movie) into the ``Ratings`` collection.

    Reads the module-level ``movie_rec_db``, ``user_mapping``,
    ``movie_mappings`` and ``no_metadata``. The three arguments are indexed
    in lockstep: position ``i`` describes one rating. Ratings whose movie id
    is contained in ``no_metadata`` are skipped (and reported on stdout).
    Edges are sent with ``import_bulk`` in batches of 100.

    Args:
        user_id: Indexable of original user ids (keys of ``user_mapping``).
        movie_id: Indexable of original movie ids (keys of
            ``movie_mappings``).
        ratings: Array-like exposing ``.shape[0]``; each element must be
            convertible with ``float``.
    """
    BATCH_SIZE = 100
    batch = []
    edge_collection = movie_rec_db["Ratings"]
    for idx in tqdm(range(ratings.shape[0])):
        # Skip edges that point at movies without metadata.
        if movie_id[idx] in no_metadata:
            print('Removing edges with no metadata', movie_id[idx])
            continue
        mapped_user = str(user_mapping[user_id[idx]])
        mapped_movie = str(movie_mappings[movie_id[idx]])
        batch.append({
            "_id": "Ratings/user-" + mapped_user + "-r-movie-" + mapped_movie,
            "_from": "Users/" + mapped_user,
            "_to": "Movie/" + mapped_movie,
            "_rating": float(ratings[idx]),
        })
        if len(batch) == BATCH_SIZE:
            edge_collection.import_bulk(batch)
            batch = []
    # Flush the remainder after the loop. Doing this inside the loop, only
    # on the non-skipped branch, silently dropped the trailing partial batch
    # whenever the very last rating referred to a movie without metadata.
    if batch:
        print("Inserting batch the last batch!")
        edge_collection.import_bulk(batch)
 #-------------------------------------------------------------------------------------------
     for idx, m in enumerate(m_id):
         movie_mappings[m] = idx
     return movie_mappings
def login_ArangoDB():
    """Request temporary ArangoDB credentials and echo them to stdout.

    Calls ``oasis.getTempCredentials`` for the "MovieRecommendations"
    tutorial, prints the web UI URL followed by the username, password and
    database name, and returns the credentials mapping unchanged.

    Returns:
        The mapping returned by ``oasis.getTempCredentials``; the keys read
        here are ``hostname``, ``port``, ``username``, ``password`` and
        ``dbName``.
    """
    # Get temporary credentials for ArangoDB on cloud.
    credentials = oasis.getTempCredentials(
        tutorialName="MovieRecommendations",
        credentialProvider="https://tutorials.arangodb.cloud:8529/_db/_system/tutorialDB/tutorialDB",
    )
    # URL to access the ArangoDB Web UI.
    web_ui_url = "https://" + credentials["hostname"] + ":" + str(credentials["port"])
    print(web_ui_url)
    labelled_fields = (
        ("Username: ", "username"),
        ("Password: ", "password"),
        ("Database: ", "dbName"),
    )
    for label, key in labelled_fields:
        print(label + credentials[key])
    return credentials
def _build_movie_document(raw_movie_id):
    """Return the ``Movie`` document for ``raw_movie_id``, or ``None``.

    ``None`` is returned when the module-level ``id_map`` has no row whose
    ``movieId`` equals ``raw_movie_id``. Otherwise the metadata row is
    looked up in ``sampled_md`` by the mapped TMDB id.
    """
    id_rows = id_map.loc[id_map['movieId'] == raw_movie_id]
    if id_rows.size == 0:
        return None
    # Purely positional access (first matching row, second column). The
    # previous ``.iloc[:, 1][0]`` did a *label* lookup for 0 whenever the
    # frame had an integer index, and relied on a deprecated positional
    # fallback otherwise.
    tmdb_id = int(id_rows.iloc[0, 1])
    mapped_id = movie_mappings[raw_movie_id]
    meta = sampled_md.loc[sampled_md['id'] == tmdb_id].iloc[0]
    # ``genres`` is stored as text; BaseLoader parses it without
    # constructing arbitrary Python objects.
    parsed_genres = yaml.load(meta['genres'], Loader=yaml.BaseLoader)
    poster = meta['poster_path']
    return {
        "_id": "Movie/" + str(mapped_id),
        "movieId": raw_movie_id,
        "mapped_movieId": mapped_id,
        "tmdbId": tmdb_id,
        "movie_title": meta['title'],
        "description": meta['description'],
        "genres": [g['name'] for g in parsed_genres],
        "language": meta['original_language'],
        # A missing poster shows up as a float NaN, whose str() is "nan".
        "poster_path": (
            "No poster path available" if str(poster) == "nan" else poster
        ),
    }


def load_data_to_ArangoDB(login):
    """Load movies, users and rating edges into ArangoDB.

    Connects with ``oasis.connect_python_arango(login)``, creates the
    ``Movie``, ``Users`` and ``Ratings`` collections and the
    ``movie_rating_graph`` graph when they do not exist yet, and fills them
    from the module-level ``m_id``, ``id_map``, ``sampled_md``,
    ``movie_mappings`` and ``ratings_df``.

    Args:
        login: Credentials mapping accepted by
            ``oasis.connect_python_arango``.

    Returns:
        The database handle returned by ``oasis.connect_python_arango``.
    """
    movie_rec_db = oasis.connect_python_arango(login)
    if not movie_rec_db.has_collection("Movie"):
        movie_rec_db.create_collection("Movie", replication_factor=3)

    BATCH_SIZE = 128
    batch = []
    movie_collection = movie_rec_db["Movie"]
    # Load movie metadata into the Movie collection. ``m_id`` is indexed by
    # position, exactly as before, so its container type does not matter.
    for idx in tqdm(range(len(m_id))):
        movie_doc = _build_movie_document(m_id[idx])
        if movie_doc is None:
            print('No Meta data information at:', m_id[idx])
            continue
        batch.append(movie_doc)
        if len(batch) == BATCH_SIZE:
            movie_collection.import_bulk(batch)
            batch = []
    # Flush the remainder after the loop. Doing it only on the "has
    # metadata" branch dropped the trailing partial batch whenever the last
    # movie had no metadata.
    if batch:
        print("Inserting batch the last batch!")
        movie_collection.import_bulk(batch)

    if not movie_rec_db.has_collection("Users"):
        movie_rec_db.create_collection("Users", replication_factor=3)
    total_users = np.unique(ratings_df[['userId']].values.flatten()).shape[0]
    print("Total number of Users:", total_users)
    # NOTE(review): populate_user_collection and create_ratings_graph read a
    # module-level ``movie_rec_db``; the handle above is local to this
    # function. Confirm a global of that name exists elsewhere in the file.
    populate_user_collection(total_users)

    if not movie_rec_db.has_collection("Ratings"):
        movie_rec_db.create_collection("Ratings", edge=True, replication_factor=3)
    if not movie_rec_db.has_graph("movie_rating_graph"):
        movie_rec_db.create_graph('movie_rating_graph', smart=True)
    # API wrapper for the graph created (or found) above.
    movie_rating_graph = movie_rec_db.graph("movie_rating_graph")
    # NOTE(review): ``vertex_collection`` looks like it only returns an API
    # wrapper and its result is discarded here; ``create_vertex_collection``
    # may have been intended -- confirm against the python-arango docs.
    if not movie_rating_graph.has_vertex_collection("Users"):
        movie_rating_graph.vertex_collection("Users")
    if not movie_rating_graph.has_vertex_collection("Movie"):
        movie_rating_graph.vertex_collection("Movie")
    if not movie_rating_graph.has_edge_definition("Ratings"):
        movie_rating_graph.create_edge_definition(
            edge_collection='Ratings',
            from_vertex_collections=['Users'],
            to_vertex_collections=['Movie'],
        )

    user_id = ratings_df[['userId']].values.flatten()
    movie_id = ratings_df[['movieId']].values.flatten()
    ratings = ratings_df[['rating']].values.flatten()
    create_ratings_graph(user_id, movie_id, ratings)
    return movie_rec_db