krishnaveni76 committed
Commit 7ec9545 · 1 Parent(s): 259bb98

model_trainers

.gitignore CHANGED
@@ -2,5 +2,4 @@ ars/
 .env
 Artifacts/
 logs/
-__pycache__/
-model_trainer/
+__pycache__/
anime_recommender/model_trainer/collaborative_modelling.py ADDED
@@ -0,0 +1,183 @@
import sys
import pandas as pd
from anime_recommender.loggers.logging import logging
from anime_recommender.exception.exception import AnimeRecommendorException

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from collections import Counter

class CollaborativeAnimeRecommender:
    def __init__(self, df):
        self.df = df
        self.svd = None
        self.knn_item_based = None
        self.knn_user_based = None
        self.prepare_data()

    def prepare_data(self):
        """Deduplicate ratings and build the Surprise dataset and the two pivot tables used by the models."""
        self.df = self.df.drop_duplicates()
        reader = Reader(rating_scale=(1, 10))
        self.data = Dataset.load_from_df(self.df[['user_id', 'anime_id', 'rating']], reader)
        self.anime_pivot = self.df.pivot_table(index='name', columns='user_id', values='rating').fillna(0)
        self.user_pivot = self.df.pivot_table(index='user_id', columns='name', values='rating').fillna(0)

    def train_svd(self):
        """Train the SVD matrix-factorisation model on the full rating set."""
        self.svd = SVD()
        cross_validate(self.svd, self.data, cv=5)  # 5-fold cross-validation; scores are computed but not stored here
        trainset = self.data.build_full_trainset()
        self.svd.fit(trainset)

    def train_knn_item_based(self):
        """Train the KNN model for item-based recommendations."""
        item_user_matrix = csr_matrix(self.anime_pivot.values)
        self.knn_item_based = NearestNeighbors(metric='cosine', algorithm='brute')
        self.knn_item_based.fit(item_user_matrix)

    def train_knn_user_based(self):
        """Train the KNN model for user-based recommendations."""
        user_item_matrix = csr_matrix(self.user_pivot.values)
        self.knn_user_based = NearestNeighbors(metric='cosine', algorithm='brute')
        self.knn_user_based.fit(user_item_matrix)

    def print_unique_user_ids(self):
        """Log and return the unique user IDs in the dataset."""
        unique_user_ids = self.df['user_id'].unique()
        logging.info(f"Unique User IDs: {unique_user_ids}")
        return unique_user_ids

    def get_svd_recommendations(self, user_id, n=10, svd_model=None):
        """Recommend the top-n anime for a user, ranked by the SVD model's predicted ratings."""
        # Use the provided SVD model or fall back to the trained self.svd model
        svd_model = svd_model or self.svd
        if svd_model is None:
            raise ValueError("SVD model is not provided or trained.")

        # Ensure the user exists in the dataset
        if user_id not in self.df['user_id'].unique():
            return f"User ID '{user_id}' not found in the dataset."

        # Predict ratings for every unique anime for the given user and sort by predicted rating
        anime_ids = self.df['anime_id'].unique()
        predictions = [(anime_id, svd_model.predict(user_id, anime_id).est) for anime_id in anime_ids]
        predictions.sort(key=lambda x: x[1], reverse=True)

        # Extract the top-n anime IDs
        recommended_anime_ids = [pred[0] for pred in predictions[:n]]

        # Get details of the recommended anime, limited to n rows
        recommended_anime = self.df[self.df['anime_id'].isin(recommended_anime_ids)].drop_duplicates(subset='anime_id')
        logging.info(f"Shape of recommended_anime: {recommended_anime.shape}")
        recommended_anime = recommended_anime.head(n)

        return pd.DataFrame({
            'Anime Name': recommended_anime['name'].values,
            'Genres': recommended_anime['genres'].values,
            'Image URL': recommended_anime['image url'].values,
            'Rating': recommended_anime['average_rating'].values
        })

    def get_item_based_recommendations(self, anime_name, n_recommendations=10, knn_item_model=None):
        """Recommend anime similar to a given title using the item-based KNN model."""
        # Use the provided model or fall back to self.knn_item_based
        knn_item_based = knn_item_model or self.knn_item_based
        if knn_item_based is None:
            raise ValueError("Item-based KNN model is not provided or trained.")

        # Ensure the anime name exists in the pivot table
        if anime_name not in self.anime_pivot.index:
            return f"Anime title '{anime_name}' not found in the dataset."

        # Get the index of the anime in the pivot table
        query_index = self.anime_pivot.index.get_loc(anime_name)

        # Find the nearest neighbours (n_recommendations + 1 because the query anime itself is included)
        distances, indices = knn_item_based.kneighbors(
            self.anime_pivot.iloc[query_index, :].values.reshape(1, -1),
            n_neighbors=n_recommendations + 1
        )

        # Collect neighbours, starting from 1 to exclude the query anime itself
        recommendations = []
        for i in range(1, len(distances.flatten())):
            anime_title = self.anime_pivot.index[indices.flatten()[i]]
            distance = distances.flatten()[i]
            recommendations.append((anime_title, distance))

        # Fetch details for the recommended titles, limited to n_recommendations rows
        recommended_anime_titles = [rec[0] for rec in recommendations]
        logging.info(f"Top {n_recommendations} recommendations: {recommended_anime_titles}")
        filtered_df = self.df[self.df['name'].isin(recommended_anime_titles)].drop_duplicates(subset='name')
        logging.info(f"Shape of filtered df: {filtered_df.shape}")
        filtered_df = filtered_df.head(n_recommendations)

        return pd.DataFrame({
            'Anime Name': filtered_df['name'].values,
            'Image URL': filtered_df['image url'].values,
            'Genres': filtered_df['genres'].values,
            'Rating': filtered_df['average_rating'].values
        })

    def get_user_based_recommendations(self, user_id, n_recommendations=10, knn_user_model=None):
        """
        Recommend anime for a given user based on similar users' preferences using the provided or trained KNN model.

        Args:
            user_id (int): The ID of the user.
            n_recommendations (int): Number of recommendations to return.
            knn_user_model (NearestNeighbors, optional): Pre-trained KNN model. Defaults to None.

        Returns:
            pd.DataFrame: A DataFrame containing recommended anime titles and related information.
        """
        # Use the provided model or fall back to self.knn_user_based
        knn_user_based = knn_user_model or self.knn_user_based
        if knn_user_based is None:
            raise ValueError("User-based KNN model is not provided or trained.")

        # Ensure the user exists in the pivot table (cast to match the pivot table index type)
        user_id = float(user_id)
        if user_id not in self.user_pivot.index:
            return f"User ID '{user_id}' not found in the dataset."

        # Find the user's index in the pivot table
        user_idx = self.user_pivot.index.get_loc(user_id)

        # Find the nearest neighbours (n_recommendations + 1 because the user itself is included)
        distances, indices = knn_user_based.kneighbors(
            self.user_pivot.iloc[user_idx, :].values.reshape(1, -1),
            n_neighbors=n_recommendations + 1
        )

        # Anime the user has already rated (excluded from the recommendations below)
        user_rated_anime = set(self.user_pivot.columns[self.user_pivot.iloc[user_idx, :] > 0])

        # Collect all anime rated by the nearest neighbours, starting from 1 to exclude the user itself
        all_neighbor_ratings = []
        for i in range(1, len(distances.flatten())):
            neighbor_idx = indices.flatten()[i]
            neighbor_rated_anime = self.user_pivot.iloc[neighbor_idx, :]
            neighbor_ratings = neighbor_rated_anime[neighbor_rated_anime > 0]
            all_neighbor_ratings.extend(neighbor_ratings.index)

        # Count how frequently each anime is rated by the neighbours
        anime_counter = Counter(all_neighbor_ratings)

        # Recommend anime the user has not rated yet, most frequently rated by neighbours first
        recommendations = [(anime, count) for anime, count in anime_counter.items() if anime not in user_rated_anime]
        recommendations.sort(key=lambda x: x[1], reverse=True)

        # Extract recommended anime names and their details, limited to n_recommendations rows
        recommended_anime_titles = [rec[0] for rec in recommendations[:n_recommendations]]
        filtered_df = self.df[self.df['name'].isin(recommended_anime_titles)].drop_duplicates(subset='name')
        logging.info(f"Shape of filtered df: {filtered_df.shape}")
        filtered_df = filtered_df.head(n_recommendations)

        return pd.DataFrame({
            'Anime Name': filtered_df['name'].values,
            'Image URL': filtered_df['image url'].values,
            'Genres': filtered_df['genres'].values,
            'Rating': filtered_df['average_rating'].values
        })
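
A minimal usage sketch for CollaborativeAnimeRecommender, assuming a merged ratings DataFrame with the columns the class expects (user_id, anime_id, rating, name, genres, image url, average_rating); the CSV path, user ID, and anime title below are hypothetical placeholders:

import pandas as pd
from anime_recommender.model_trainer.collaborative_modelling import CollaborativeAnimeRecommender

# Hypothetical merged ratings/metadata table
df = pd.read_csv("Artifacts/merged_anime_ratings.csv")

recommender = CollaborativeAnimeRecommender(df)
recommender.train_svd()                # matrix factorisation (Surprise SVD)
recommender.train_knn_item_based()     # item-item cosine KNN
recommender.train_knn_user_based()     # user-user cosine KNN

print(recommender.get_svd_recommendations(user_id=123, n=5))
print(recommender.get_item_based_recommendations("Naruto", n_recommendations=5))
print(recommender.get_user_based_recommendations(123, n_recommendations=5))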
anime_recommender/model_trainer/content_based_modelling.py ADDED
@@ -0,0 +1,71 @@
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib


class ContentBasedRecommender:
    """
    A content-based recommender system using a TF-IDF vectorizer and cosine similarity.
    """
    def __init__(self, df):
        try:
            # Drop missing values and reset the index so row positions align with the similarity matrix
            self.df = df.dropna().reset_index(drop=True)

            # Create a Series mapping anime names to their row indices
            self.indices = pd.Series(self.df.index, index=self.df['name']).drop_duplicates()

            # Initialize and fit the TF-IDF vectorizer on the 'genres' column
            self.tfv = TfidfVectorizer(
                min_df=3,
                strip_accents='unicode',
                analyzer='word',
                token_pattern=r'\w{1,}',
                ngram_range=(1, 3),
                stop_words='english'
            )
            self.tfv_matrix = self.tfv.fit_transform(self.df['genres'])

            # Pairwise cosine similarity between all anime genre vectors
            self.cosine_sim = cosine_similarity(self.tfv_matrix, self.tfv_matrix)

        except Exception as e:
            raise e

    def save_model(self, model_path):
        """Save the trained model (TF-IDF vectorizer and cosine similarity matrix) to a file."""
        try:
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            with open(model_path, 'wb') as f:
                joblib.dump((self.tfv, self.cosine_sim), f)
        except Exception as e:
            raise e

    def get_rec_cosine(self, title, model_path, n_recommendations=5):
        """Get recommendations based on cosine similarity for a given anime title."""
        try:
            # Load the model (TF-IDF vectorizer and cosine similarity matrix)
            with open(model_path, 'rb') as f:
                self.tfv, self.cosine_sim = joblib.load(f)

            # Check that the DataFrame is loaded
            if self.df is None:
                raise ValueError("The DataFrame is not loaded, cannot make recommendations.")

            if title not in self.indices.index:
                return f"Anime title '{title}' not found in the dataset."

            # Rank all other titles by similarity to the query, skipping the query itself
            idx = self.indices[title]
            cosinesim_scores = list(enumerate(self.cosine_sim[idx]))
            cosinesim_scores = sorted(cosinesim_scores, key=lambda x: x[1], reverse=True)[1:n_recommendations + 1]
            anime_indices = [i[0] for i in cosinesim_scores]

            return pd.DataFrame({
                'Anime name': self.df['name'].iloc[anime_indices].values,
                'Image URL': self.df['image url'].iloc[anime_indices].values,
                'Genres': self.df['genres'].iloc[anime_indices].values,
                'Rating': self.df['average_rating'].iloc[anime_indices].values
            })
        except Exception as e:
            raise e
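
A minimal usage sketch for ContentBasedRecommender, assuming an anime metadata DataFrame with at least the name, genres, image url and average_rating columns; the CSV path, model path, and query title are hypothetical placeholders:

import pandas as pd
from anime_recommender.model_trainer.content_based_modelling import ContentBasedRecommender

# Hypothetical anime metadata table
df = pd.read_csv("Artifacts/anime_metadata.csv")

recommender = ContentBasedRecommender(df)

# Persist the fitted TF-IDF vectorizer and cosine-similarity matrix, then query it
model_path = "Artifacts/content_based_model.pkl"
recommender.save_model(model_path)
print(recommender.get_rec_cosine("Steins;Gate", model_path, n_recommendations=5))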
anime_recommender/model_trainer/top_anime_filtering.py ADDED
@@ -0,0 +1,93 @@
import sys
import numpy as np
import pandas as pd
from anime_recommender.exception.exception import AnimeRecommendorException

class PopularityBasedFiltering:
    def __init__(self, df):
        try:
            self.df = df
            self.df['average_rating'] = pd.to_numeric(self.df['average_rating'], errors='coerce')
            # Assign the result back: fillna() returns a new Series rather than modifying in place
            self.df['average_rating'] = self.df['average_rating'].fillna(self.df['average_rating'].median())
        except Exception as e:
            raise AnimeRecommendorException(e, sys)

    def popular_animes(self, n=10):
        """Top-n anime by popularity rank (a lower popularity value means more popular)."""
        sorted_df = self.df.sort_values(by=['popularity'], ascending=True)
        top_n_anime = sorted_df.head(n)
        return pd.DataFrame({
            'Anime name': top_n_anime['name'].values,
            'Image URL': top_n_anime['image url'].values,
            'Genres': top_n_anime['genres'].values,
            'Rating': top_n_anime['average_rating'].values
        })

    def top_ranked_animes(self, n=10):
        """Top-n anime by overall rank, ignoring unknown ranks."""
        self.df['rank'] = self.df['rank'].replace('UNKNOWN', np.nan).astype(float)
        df_filtered = self.df[self.df['rank'] > 1]
        sorted_df = df_filtered.sort_values(by=['rank'], ascending=True)
        top_n_anime = sorted_df.head(n)
        return pd.DataFrame({
            'Anime name': top_n_anime['name'].values,
            'Image URL': top_n_anime['image url'].values,
            'Genres': top_n_anime['genres'].values,
            'Rating': top_n_anime['average_rating'].values
        })

    def overall_top_rated_animes(self, n=10):
        """Top-n anime by average rating."""
        sorted_df = self.df.sort_values(by=['average_rating'], ascending=False)
        top_n_anime = sorted_df.head(n)
        return pd.DataFrame({
            'Anime name': top_n_anime['name'].values,
            'Image URL': top_n_anime['image url'].values,
            'Genres': top_n_anime['genres'].values,
            'Rating': top_n_anime['average_rating'].values
        })

    def favorite_animes(self, n=10):
        """Top-n anime by number of favorites."""
        sorted_df = self.df.sort_values(by=['favorites'], ascending=False)
        top_n_anime = sorted_df.head(n)
        return pd.DataFrame({
            'Anime name': top_n_anime['name'].values,
            'Image URL': top_n_anime['image url'].values,
            'Genres': top_n_anime['genres'].values,
            'Rating': top_n_anime['average_rating'].values
        })

    def top_animes_members(self, n=10):
        """Top-n anime by member count."""
        sorted_df = self.df.sort_values(by=['members'], ascending=False)
        top_n_anime = sorted_df.head(n)
        return pd.DataFrame({
            'Anime name': top_n_anime['name'].values,
            'Image URL': top_n_anime['image url'].values,
            'Genres': top_n_anime['genres'].values,
            'Rating': top_n_anime['average_rating'].values
        })

    def popular_anime_among_members(self, n=10):
        """Top-n anime by member count, breaking ties by average rating."""
        # Keep the full rows; selecting only the 'name' column here would break the column lookups below
        sorted_df = self.df.sort_values(by=['members', 'average_rating'], ascending=[False, False]).drop_duplicates(subset='name')
        popular_animes = sorted_df.head(n)
        return pd.DataFrame({
            'Anime name': popular_animes['name'].values,
            'Image URL': popular_animes['image url'].values,
            'Genres': popular_animes['genres'].values,
            'Rating': popular_animes['average_rating'].values
        })

    def top_avg_rated(self, n=10):
        """Top-n unique anime by average rating, with missing ratings imputed by the median."""
        self.df['average_rating'] = pd.to_numeric(self.df['average_rating'], errors='coerce')

        # Replace NaN values with the median (assign back; fillna() does not modify in place)
        median_rating = self.df['average_rating'].median()
        self.df['average_rating'] = self.df['average_rating'].fillna(median_rating)

        # Select top-n anime by average rating
        top_animes = (
            self.df.drop_duplicates(subset='name')
            .nlargest(n, 'average_rating')[['name', 'average_rating', 'image url', 'genres']]
        )
        return pd.DataFrame({
            'Anime name': top_animes['name'].values,
            'Image URL': top_animes['image url'].values,
            'Genres': top_animes['genres'].values,
            'Rating': top_animes['average_rating'].values
        })
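
A minimal usage sketch for PopularityBasedFiltering, assuming a metadata DataFrame with name, genres, image url, average_rating, popularity, rank, favorites and members columns; the CSV path is a hypothetical placeholder:

import pandas as pd
from anime_recommender.model_trainer.top_anime_filtering import PopularityBasedFiltering

# Hypothetical anime metadata table
df = pd.read_csv("Artifacts/anime_metadata.csv")

filtering = PopularityBasedFiltering(df)
print(filtering.popular_animes(n=10))        # by popularity rank
print(filtering.top_ranked_animes(n=10))     # by overall rank
print(filtering.top_avg_rated(n=10))         # by average rating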