Commit
·
7ec9545
1
Parent(s):
259bb98
model_trainers
Browse files
.gitignore
CHANGED
@@ -2,5 +2,4 @@ ars/
|
|
2 |
.env
|
3 |
Artifacts/
|
4 |
logs/
|
5 |
-
__pycache__/
|
6 |
-
model_trainer/
|
|
|
2 |
.env
|
3 |
Artifacts/
|
4 |
logs/
|
5 |
+
__pycache__/
|
|
anime_recommender/model_trainer/collaborative_modelling.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import pandas as pd
|
3 |
+
from anime_recommender.loggers.logging import logging
|
4 |
+
from anime_recommender.exception.exception import AnimeRecommendorException
|
5 |
+
|
6 |
+
from surprise import Reader, Dataset, SVD
|
7 |
+
from surprise.model_selection import cross_validate
|
8 |
+
from scipy.sparse import csr_matrix
|
9 |
+
from sklearn.neighbors import NearestNeighbors
|
10 |
+
from collections import Counter
|
11 |
+
|
12 |
+
class CollaborativeAnimeRecommender:
    """Collaborative-filtering recommender with three interchangeable models:
    SVD matrix factorization (surprise), item-based KNN, and user-based KNN
    (both cosine-similarity, brute-force).

    Expects a ratings DataFrame with at least the columns 'user_id',
    'anime_id', 'name', 'rating', 'genres', 'image url', 'average_rating'.
    """

    def __init__(self, df):
        """Store the ratings frame and build the derived structures.

        Args:
            df (pd.DataFrame): Ratings data (see class docstring for columns).
        """
        self.df = df
        self.svd = None
        self.knn_item_based = None
        self.knn_user_based = None
        self.prepare_data()

    def prepare_data(self):
        """Deduplicate ratings and build the surprise Dataset plus the two
        pivot tables consumed by the KNN models."""
        self.df = self.df.drop_duplicates()
        reader = Reader(rating_scale=(1, 10))
        self.data = Dataset.load_from_df(self.df[['user_id', 'anime_id', 'rating']], reader)
        # Rows = anime names, columns = users (item-based KNN input).
        self.anime_pivot = self.df.pivot_table(index='name', columns='user_id', values='rating').fillna(0)
        # Rows = users, columns = anime names (user-based KNN input).
        self.user_pivot = self.df.pivot_table(index='user_id', columns='name', values='rating').fillna(0)

    def train_svd(self):
        """Cross-validate (cv=5) and then fit SVD on the full training set."""
        self.svd = SVD()
        cross_validate(self.svd, self.data, cv=5)
        trainset = self.data.build_full_trainset()
        self.svd.fit(trainset)

    def train_knn_item_based(self):
        """Train the KNN model for item-based recommendations."""
        item_user_matrix = csr_matrix(self.anime_pivot.values)
        self.knn_item_based = NearestNeighbors(metric='cosine', algorithm='brute')
        self.knn_item_based.fit(item_user_matrix)

    def train_knn_user_based(self):
        """Train the KNN model for user-based recommendations."""
        user_item_matrix = csr_matrix(self.user_pivot.values)
        self.knn_user_based = NearestNeighbors(metric='cosine', algorithm='brute')
        self.knn_user_based.fit(user_item_matrix)

    def print_unique_user_ids(self):
        """Print unique user IDs from the dataset."""
        unique_user_ids = self.df['user_id'].unique()
        logging.info(f"Unique User IDs: {unique_user_ids}")
        return unique_user_ids

    def get_svd_recommendations(self, user_id, n=10, svd_model=None):
        """Recommend the top-n anime for a user by SVD-predicted rating.

        Args:
            user_id: ID of the user to recommend for.
            n (int): Number of recommendations to return.
            svd_model: Optional pre-trained SVD model; defaults to self.svd.

        Returns:
            pd.DataFrame of recommendations ordered by predicted rating, or
            an error string if the user is unknown.
        """
        # Use the provided SVD model or the trained self.svd model
        svd_model = svd_model or self.svd
        if svd_model is None:
            raise ValueError("SVD model is not provided or trained.")

        # Ensure user exists in the dataset
        if user_id not in self.df['user_id'].unique():
            return f"User ID '{user_id}' not found in the dataset."

        # Predict a rating for every distinct anime, best first.
        anime_ids = self.df['anime_id'].unique()
        predictions = [(anime_id, svd_model.predict(user_id, anime_id).est) for anime_id in anime_ids]
        predictions.sort(key=lambda x: x[1], reverse=True)

        # Extract top N anime IDs (already in predicted-rating order).
        recommended_anime_ids = [pred[0] for pred in predictions[:n]]

        # Get details of recommended anime
        recommended_anime = self.df[self.df['anime_id'].isin(recommended_anime_ids)].drop_duplicates(subset='anime_id')
        logging.info(f"Shape of recommended_anime: {recommended_anime.shape}")

        # BUG FIX: isin() keeps DataFrame order, not prediction order —
        # re-sort so the output is actually ranked by predicted rating.
        rank = {anime_id: i for i, anime_id in enumerate(recommended_anime_ids)}
        recommended_anime = recommended_anime.sort_values(by='anime_id', key=lambda s: s.map(rank))
        recommended_anime = recommended_anime.head(n)

        return pd.DataFrame({
            'Anime Name': recommended_anime['name'].values,
            'Genres': recommended_anime['genres'].values,
            'Image URL': recommended_anime['image url'].values,
            'Rating': recommended_anime['average_rating'].values
        })

    def get_item_based_recommendations(self, anime_name, n_recommendations=10, knn_item_model=None):
        """Recommend anime similar to `anime_name` via item-based KNN.

        Args:
            anime_name (str): Title to find neighbours for.
            n_recommendations (int): Number of recommendations to return.
            knn_item_model: Optional pre-trained KNN model; defaults to
                self.knn_item_based.

        Returns:
            pd.DataFrame of similar anime ordered by similarity, or an error
            string if the title is unknown.
        """
        # Use the provided model or fall back to self.knn_item_based
        knn_item_based = knn_item_model or self.knn_item_based
        if knn_item_based is None:
            raise ValueError("Item-based KNN model is not provided or trained.")

        # Ensure the anime name exists in the pivot table
        if anime_name not in self.anime_pivot.index:
            return f"Anime title '{anime_name}' not found in the dataset."

        # Get the index of the anime in the pivot table
        query_index = self.anime_pivot.index.get_loc(anime_name)

        # n_neighbors + 1 because the query anime itself is included
        distances, indices = knn_item_based.kneighbors(
            self.anime_pivot.iloc[query_index, :].values.reshape(1, -1),
            n_neighbors=n_recommendations + 1
        )
        # Skip element 0 (the query anime itself); neighbours come back
        # sorted by ascending distance, i.e. most similar first.
        recommendations = []
        for i in range(1, len(distances.flatten())):
            anime_title = self.anime_pivot.index[indices.flatten()[i]]
            distance = distances.flatten()[i]
            recommendations.append((anime_title, distance))

        # Fetch the recommended anime names (top n_recommendations)
        recommended_anime_titles = [rec[0] for rec in recommendations]
        logging.info(f"Top {n_recommendations} recommendations: {recommended_anime_titles}")
        filtered_df = self.df[self.df['name'].isin(recommended_anime_titles)].drop_duplicates(subset='name')
        logging.info(f"Shape of filtered df: {filtered_df.shape}")

        # BUG FIX: preserve similarity order instead of DataFrame order.
        order = {title: i for i, title in enumerate(recommended_anime_titles)}
        filtered_df = filtered_df.sort_values(by='name', key=lambda s: s.map(order))
        filtered_df = filtered_df.head(n_recommendations)

        return pd.DataFrame({
            'Anime Name': filtered_df['name'].values,
            'Image URL': filtered_df['image url'].values,
            'Genres': filtered_df['genres'].values,
            'Rating': filtered_df['average_rating'].values
        })

    def get_user_based_recommendations(self, user_id, n_recommendations=10, knn_user_model=None):
        """
        Recommend anime for a given user based on similar users' preferences using the provided or trained KNN model.

        Args:
            user_id (int): The ID of the user.
            n_recommendations (int): Number of recommendations to return.
            knn_user_model (NearestNeighbors, optional): Pre-trained KNN model. Defaults to None.

        Returns:
            pd.DataFrame: A DataFrame containing recommended anime titles and related information.
        """
        # Use the provided model or fall back to self.knn_user_based
        knn_user_based = knn_user_model or self.knn_user_based
        if knn_user_based is None:
            raise ValueError("User-based KNN model is not provided or trained.")

        # Ensure the user exists in the pivot table
        user_id = float(user_id)  # Convert to match pivot table index type
        if user_id not in self.user_pivot.index:
            return f"User ID '{user_id}' not found in the dataset."

        # Find the user's index in the pivot table
        user_idx = self.user_pivot.index.get_loc(user_id)

        # Include the user itself, hence + 1
        distances, indices = knn_user_based.kneighbors(
            self.user_pivot.iloc[user_idx, :].values.reshape(1, -1),
            n_neighbors=n_recommendations + 1
        )

        # Anime the user has already rated — excluded from recommendations.
        user_rated_anime = set(self.user_pivot.columns[self.user_pivot.iloc[user_idx, :] > 0])

        # Collect all anime rated by the nearest neighbours (skip index 0,
        # which is the query user itself).
        all_neighbor_ratings = []
        for i in range(1, len(distances.flatten())):
            neighbor_idx = indices.flatten()[i]
            neighbor_rated_anime = self.user_pivot.iloc[neighbor_idx, :]
            neighbor_ratings = neighbor_rated_anime[neighbor_rated_anime > 0]
            all_neighbor_ratings.extend(neighbor_ratings.index)

        # Count how frequently each anime is rated by neighbours.
        anime_counter = Counter(all_neighbor_ratings)

        # Recommend anime not already rated by the user, most frequent first.
        recommendations = [(anime, count) for anime, count in anime_counter.items() if anime not in user_rated_anime]
        recommendations.sort(key=lambda x: x[1], reverse=True)

        # Extract recommended anime names and their details
        recommended_anime_titles = [rec[0] for rec in recommendations[:n_recommendations]]
        filtered_df = self.df[self.df['name'].isin(recommended_anime_titles)].drop_duplicates(subset='name')
        logging.info(f"Shape of filtered df: {filtered_df.shape}")

        # BUG FIX: preserve frequency order instead of DataFrame order.
        order = {title: i for i, title in enumerate(recommended_anime_titles)}
        filtered_df = filtered_df.sort_values(by='name', key=lambda s: s.map(order))
        filtered_df = filtered_df.head(n_recommendations)

        return pd.DataFrame({
            'Anime Name': filtered_df['name'].values,
            'Image URL': filtered_df['image url'].values,
            'Genres': filtered_df['genres'].values,
            'Rating': filtered_df['average_rating'].values
        })
|
anime_recommender/model_trainer/content_based_modelling.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
import joblib
|
6 |
+
|
7 |
+
|
8 |
+
class ContentBasedRecommender:
    """
    A content-based recommender system using TF-IDF Vectorizer and Cosine Similarity.

    The TF-IDF matrix is built from the 'genres' column; the similarity of two
    anime is the cosine similarity of their genre vectors.
    """

    def __init__(self, df):
        """
        Args:
            df (pd.DataFrame): Anime catalogue with at least the columns
                'name', 'genres', 'image url', and 'average_rating'.
        """
        try:
            # Drop missing values from the DataFrame.
            # BUG FIX: reset the index too — self.indices (below) stores
            # self.df.index values that get_rec_cosine later uses as
            # *positional* offsets into self.cosine_sim and .iloc lookups.
            # Without reset_index those labels are misaligned whenever
            # dropna() removed any rows.
            self.df = df.dropna().reset_index(drop=True)

            # Map each anime name to its row position.
            self.indices = pd.Series(self.df.index, index=self.df['name']).drop_duplicates()

            # TF-IDF over genre tokens, uni- to tri-grams.
            self.tfv = TfidfVectorizer(
                min_df=3,
                strip_accents='unicode',
                analyzer='word',
                token_pattern=r'\w{1,}',
                ngram_range=(1, 3),
                stop_words='english'
            )
            self.tfv_matrix = self.tfv.fit_transform(self.df['genres'])

            self.cosine_sim = cosine_similarity(self.tfv_matrix, self.tfv_matrix)

        except Exception:
            # Bare raise preserves the original traceback ("raise e" resets it).
            raise

    def save_model(self, model_path):
        """Save the trained model (TF-IDF and Cosine Similarity Matrix) to a file."""
        try:
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            with open(model_path, 'wb') as f:
                joblib.dump((self.tfv, self.cosine_sim), f)
        except Exception:
            raise

    def get_rec_cosine(self, title, model_path, n_recommendations=5):
        """Get recommendations based on cosine similarity for a given anime title.

        Args:
            title (str): Anime name to find neighbours for.
            model_path (str): Path to a model previously saved by save_model().
            n_recommendations (int): Number of titles to return.

        Returns:
            pd.DataFrame of recommended titles (name/image/genres/rating), or
            an error string when the title is not in the dataset.
        """
        try:
            # Load the model (TF-IDF and cosine similarity matrix)
            with open(model_path, 'rb') as f:
                self.tfv, self.cosine_sim = joblib.load(f)

            # Check if the DataFrame is loaded
            if self.df is None:
                raise ValueError("The DataFrame is not loaded, cannot make recommendations.")

            if title not in self.indices.index:
                return f"Anime title '{title}' not found in the dataset."

            # Rank every row by similarity to the query, skipping slot 0
            # (the title itself, similarity 1.0).
            idx = self.indices[title]
            cosinesim_scores = list(enumerate(self.cosine_sim[idx]))
            cosinesim_scores = sorted(cosinesim_scores, key=lambda x: x[1], reverse=True)[1:n_recommendations + 1]
            anime_indices = [i[0] for i in cosinesim_scores]

            return pd.DataFrame({
                'Anime name': self.df['name'].iloc[anime_indices].values,
                'Image URL': self.df['image url'].iloc[anime_indices].values,
                'Genres': self.df['genres'].iloc[anime_indices].values,
                'Rating': self.df['average_rating'].iloc[anime_indices].values
            })
        except Exception:
            raise
|
71 |
+
|
anime_recommender/model_trainer/top_anime_filtering.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
from anime_recommender.exception.exception import AnimeRecommendorException
|
5 |
+
|
6 |
+
class PopularityBasedFiltering:
    """Non-personalized, popularity-based anime rankings.

    Expects a DataFrame with the columns 'name', 'image url', 'genres',
    'average_rating', 'popularity', 'rank', 'favorites', and 'members'.
    """

    def __init__(self, df):
        """Coerce average_rating to numeric and fill missing values with the median.

        Args:
            df (pd.DataFrame): Anime catalogue (see class docstring).

        Raises:
            AnimeRecommendorException: If preprocessing fails.
        """
        try:
            self.df = df
            self.df['average_rating'] = pd.to_numeric(self.df['average_rating'], errors='coerce')
            # BUG FIX: fillna() returns a new Series; the original call
            # discarded the result, leaving NaN ratings in place.
            self.df['average_rating'] = self.df['average_rating'].fillna(self.df['average_rating'].median())
        except Exception as e:
            raise AnimeRecommendorException(e, sys)

    @staticmethod
    def _format(top_n_anime):
        """Project selected rows into the standard output columns."""
        return pd.DataFrame({
            'Anime name': top_n_anime['name'].values,
            'Image URL': top_n_anime['image url'].values,
            'Genres': top_n_anime['genres'].values,
            'Rating': top_n_anime['average_rating'].values
        })

    def popular_animes(self, n=10):
        """Top-n by the 'popularity' rank (lower value = more popular)."""
        sorted_df = self.df.sort_values(by=['popularity'], ascending=True)
        return self._format(sorted_df.head(n))

    def top_ranked_animes(self, n=10):
        """Top-n by 'rank' ascending; 'UNKNOWN' ranks become NaN and are excluded."""
        self.df['rank'] = self.df['rank'].replace('UNKNOWN', np.nan).astype(float)
        df_filtered = self.df[self.df['rank'] > 1]
        sorted_df = df_filtered.sort_values(by=['rank'], ascending=True)
        return self._format(sorted_df.head(n))

    def overall_top_rated_animes(self, n=10):
        """Top-n by average rating, highest first."""
        sorted_df = self.df.sort_values(by=['average_rating'], ascending=False)
        return self._format(sorted_df.head(n))

    def favorite_animes(self, n=10):
        """Top-n by favourites count, highest first."""
        sorted_df = self.df.sort_values(by=['favorites'], ascending=False)
        return self._format(sorted_df.head(n))

    def top_animes_members(self, n=10):
        """Top-n by member count, highest first."""
        sorted_df = self.df.sort_values(by=['members'], ascending=False)
        return self._format(sorted_df.head(n))

    def popular_anime_among_members(self, n=10):
        """Top-n distinct titles by member count, rating as tiebreaker.

        BUG FIX: the original selected ['name'] (a Series) and then indexed it
        with column names, which raises KeyError; keep the full frame instead.
        """
        sorted_df = self.df.sort_values(
            by=['members', 'average_rating'], ascending=[False, False]
        ).drop_duplicates(subset='name')
        return self._format(sorted_df.head(n))

    def top_avg_rated(self, n=10):
        """Top-n distinct titles by average rating (NaN filled with the median)."""
        self.df['average_rating'] = pd.to_numeric(self.df['average_rating'], errors='coerce')
        median_rating = self.df['average_rating'].median()
        # BUG FIX: assign the filled Series back (the original discarded it).
        self.df['average_rating'] = self.df['average_rating'].fillna(median_rating)
        # Select top N animes by average rating
        top_animes = (
            self.df.drop_duplicates(subset='name')
            .nlargest(n, 'average_rating')[['name', 'average_rating', 'image url', 'genres']]
        )
        return self._format(top_animes)
|