Spaces:
Sleeping
Sleeping
Commit
·
9697a93
1
Parent(s):
d98a3d8
adding helper file
Browse files- Helpers.py +106 -0
- requirements.txt +1 -0
Helpers.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
from sklearn.metrics import mean_squared_error
|
5 |
+
from xgboost import XGBRegressor
|
6 |
+
|
7 |
+
|
8 |
+
def train_model(data, user_id, test=None, eval=False):
    """Fit an XGBoost regressor on the ratings of a single user.

    Args:
        data: Training DataFrame. It must contain the columns 'userId',
            'rating', 'Train' and 'title'; every remaining column is passed
            to the model as a feature.
        user_id: Value compared against ``data['userId']`` to select the
            rows used for training.
        test: DataFrame with the same columns as ``data``. Only read when
            ``eval`` is true.
        eval: When true, predict the user's rows in ``test`` and print the
            RMSE. (The name shadows the builtin but is kept because callers
            may pass it by keyword.)

    Returns:
        The fitted ``XGBRegressor``.

    Raises:
        ValueError: If ``eval`` is true but ``test`` is None.
    """
    # Identifier/target columns that must not be fed to the model.
    non_feature_columns = ['userId', 'rating', 'Train', 'title']

    # Select only this user's data.
    train_user = data[data['userId'] == user_id]
    X_train = train_user.drop(columns=non_feature_columns)
    y_train = train_user['rating']

    model = XGBRegressor()
    model.fit(X_train, y_train)

    if eval:
        if test is None:
            raise ValueError("eval=True requires a 'test' DataFrame")

        test_user = test[test['userId'] == user_id]
        if test_user.empty:
            # Nothing to score for this user; an RMSE over zero rows is
            # undefined, so report that instead of failing.
            print('RMSE: n/a (no test rows for this user)')
        else:
            X_test = test_user.drop(columns=non_feature_columns)
            y_test = test_user['rating']
            y_pred = model.predict(X_test)
            # Square root via ** 0.5 so the function does not depend on a
            # numpy import that the module does not provide.
            rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5
            print(f'RMSE: {rmse:.4f}')

    return model
|
34 |
+
|
35 |
+
def get_user_recommendation_XGBoost(all_moves, model, user_id, n=10):
    """Recommend unseen movies for a user, ranked by predicted rating.

    Args:
        all_moves: DataFrame containing the columns 'userId', 'rating',
            'Train' and 'title'; every remaining column is passed to
            ``model.predict`` as a feature.
        model: Object exposing ``predict(features)`` that returns one value
            per row of ``features``.
        user_id: Value compared against ``all_moves['userId']`` to find the
            titles the user has already rated.
        n: Maximum number of titles to return.

    Returns:
        A tuple ``(recommendations, user_seen_movies)``: a Series of at most
        ``n`` titles ordered by descending predicted rating, and the Series
        of titles the user has already rated.
    """
    # Identifier/target columns that must not be fed to the model.
    non_feature_columns = ['userId', 'rating', 'Train', 'title']

    # Titles the user has already rated.
    user_seen_movies = all_moves.loc[all_moves['userId'] == user_id, 'title']

    # One row per unseen title. The explicit copy makes the frame safe to
    # extend with a new column without touching (or warning about) all_moves.
    unseen_mask = ~all_moves['title'].isin(user_seen_movies)
    candidates = (
        all_moves.loc[unseen_mask]
        .drop_duplicates(subset=['title'])
        .copy()
    )

    if candidates.empty:
        # Nothing left to recommend; skip predicting on an empty frame.
        return candidates['title'], user_seen_movies

    candidates['Pred_rating'] = model.predict(
        candidates.drop(columns=non_feature_columns)
    )

    # Stable sort keeps the original row order among equal predictions, so
    # the result is deterministic.
    ranked = candidates.sort_values(
        by='Pred_rating', ascending=False, kind='stable'
    )
    recommendations = ranked.head(n)['title']
    return recommendations, user_seen_movies
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
def get_user_recommendation(DataBase, Matrix, user_id, l=10):
    """Recommend movies that the most similar users rated highly.

    Args:
        DataBase: DataFrame with at least the columns 'userId', 'title' and
            'rating'.
        Matrix: Similarity table where ``Matrix[user_id]`` yields a Series of
            similarity scores indexed by user id (presumably a square
            user-by-user DataFrame; confirm against the code that builds it).
        user_id: The user to recommend for.
        l: Maximum number of titles to return.

    Returns:
        A list of at most ``l`` distinct titles the user has not rated,
        gathered from the most similar users first and, for each of them,
        from their highest-rated movies first.

    Raises:
        KeyError: If ``user_id`` is not a key of ``Matrix``.
    """
    max_neighbours = 9       # how many similar users are consulted
    picks_per_neighbour = 4  # how many titles each of them may contribute
    min_rating = 4           # only movies rated at least this are suggested

    if l <= 0:
        return []

    # Most similar users first. The user is removed by label instead of
    # assuming they occupy the first position after sorting.
    similar_users = Matrix[user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(labels=user_id, errors='ignore')

    # Titles the user has already rated must not be recommended.
    seen_titles = set(DataBase.loc[DataBase['userId'] == user_id, 'title'])

    recommendations = []
    for neighbour in similar_users.index[:max_neighbours]:
        liked = DataBase[
            (DataBase['userId'] == neighbour)
            & (DataBase['rating'] >= min_rating)
        ]
        liked = liked[~liked['title'].isin(seen_titles)]
        # Sort by the rating itself (not by title) so each neighbour's
        # favourites come first; stable to keep ties deterministic.
        liked = liked.sort_values(by='rating', ascending=False, kind='stable')

        for movie in liked['title'].head(picks_per_neighbour):
            if movie in recommendations:
                continue
            recommendations.append(movie)
            # Stop as soon as the limit is hit so we never return more
            # than l titles.
            if len(recommendations) >= l:
                return recommendations

    return recommendations
|
85 |
+
|
86 |
+
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
if __name__ == '__main__':
    # Manual smoke test: load the ratings table and the user-similarity
    # matrix, then print the recommendations for one user.
    import argparse
    import pickle

    def load_similarity_matrix(path):
        """Return the object unpickled from ``path``.

        NOTE: pickle can execute arbitrary code while loading; only use
        this with files produced by this project.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

    # The defaults are the original author's local paths; pass the options
    # below to run the script on another machine.
    default_database = r"D:\Study\ITI\Recommender Systems\Final\Movies-Recommender-System\Data\XGBoost_database.csv"
    default_matrix = r"D:\Study\ITI\Recommender Systems\Final\Movies-Recommender-System\Models\user_based_matrix.pkl"

    parser = argparse.ArgumentParser(
        description='Print user-based movie recommendations.'
    )
    parser.add_argument('--database', default=default_database,
                        help='path to the ratings CSV file')
    parser.add_argument('--matrix', default=default_matrix,
                        help='path to the pickled similarity matrix')
    parser.add_argument('--user-id', type=int, default=1,
                        help='user to recommend for')
    args = parser.parse_args()

    # Load the data
    DataBase = pd.read_csv(args.database)
    # Load the similarity matrix
    Matrix = load_similarity_matrix(args.matrix)

    recommendations = get_user_recommendation(DataBase, Matrix, args.user_id)
    print(recommendations)
|
106 |
+
|
requirements.txt
CHANGED
@@ -3,3 +3,4 @@ requests==2.31.0
|
|
3 |
pandas==2.2.2
|
4 |
pickleshare==0.7.5
|
5 |
gdown==5.1.0
|
|
|
|
3 |
pandas==2.2.2
|
4 |
pickleshare==0.7.5
|
5 |
gdown==5.1.0
|
6 |
+
xgboost==2.0.3
|