import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from anime_recommender.exception.exception import AnimeRecommendorException
from anime_recommender.loggers.logging import logging


class ContentBasedRecommender:
    """
    A content-based recommender system using TF-IDF Vectorizer and Cosine Similarity.

    The TF-IDF vectorizer is fitted on the 'genres' column of the supplied
    DataFrame, and a pairwise cosine-similarity matrix over all rows is kept
    in memory. Recommendations are the rows most similar to a given title.

    Attributes:
        df: The input DataFrame with every row containing a missing value removed.
        indices: Series mapping an anime name to its row *position* in ``df``
            (first occurrence wins when a name appears more than once).
        tfv: The fitted ``TfidfVectorizer``.
        tfv_matrix: TF-IDF matrix produced from ``df['genres']``.
        cosine_sim: Pairwise cosine-similarity matrix of ``tfv_matrix``.
    """

    def __init__(self, df):
        """
        Fit the TF-IDF model and similarity matrix on ``df``.

        Args:
            df: DataFrame that provides at least the 'name' and 'genres'
                columns ('image url' and 'average_rating' are also read by
                ``get_rec_cosine``).

        Raises:
            AnimeRecommendorException: Wraps any error raised while fitting.
        """
        try:
            self.df = df.dropna()

            # Map anime names to row POSITIONS (0..n-1), not index labels.
            # After dropna() the labels may have gaps, while both the cosine
            # matrix and the later .iloc lookups are positional.
            name_to_position = pd.Series(range(len(self.df)), index=self.df['name'])
            # Keep only the first row of each name so a lookup by title always
            # yields a single scalar position.
            self.indices = name_to_position[
                ~name_to_position.index.duplicated(keep='first')
            ]

            # Initialize and fit the TF-IDF Vectorizer on the 'genres' column
            self.tfv = TfidfVectorizer(
                min_df=3,
                strip_accents='unicode',
                analyzer='word',
                token_pattern=r'\w{1,}',
                ngram_range=(1, 3),
                stop_words='english',
            )
            self.tfv_matrix = self.tfv.fit_transform(self.df['genres'])
            self.cosine_sim = cosine_similarity(self.tfv_matrix, self.tfv_matrix)
        except Exception as e:
            raise AnimeRecommendorException(e) from e

    def save_model(self, model_path):
        """
        Save the trained model (TF-IDF and Cosine Similarity Matrix) to a file.

        Args:
            model_path: Destination file path. Its parent directory is created
                when the path has a directory component.

        Raises:
            AnimeRecommendorException: Wraps any error raised while saving.
        """
        try:
            logging.info(f"Saving model to {model_path}")
            # os.makedirs('') raises FileNotFoundError, so only create the
            # parent when the path actually names one.
            model_dir = os.path.dirname(model_path)
            if model_dir:
                os.makedirs(model_dir, exist_ok=True)
            with open(model_path, 'wb') as f:
                joblib.dump((self.tfv, self.cosine_sim), f)
            logging.info("Content recommender Model saved successfully")
        except Exception as e:
            raise AnimeRecommendorException(e) from e

    def get_rec_cosine(self, title, model_path, n_recommendations=5):
        """
        Get recommendations based on cosine similarity for a given anime title.

        The model stored at ``model_path`` is loaded on every call and replaces
        the instance's ``tfv`` and ``cosine_sim`` attributes.

        Args:
            title: Anime name to look up in ``self.indices``.
            model_path: Path of a file previously written by ``save_model``.
            n_recommendations: Maximum number of rows to return; negative
                values are treated as 0.

        Returns:
            A DataFrame with the columns 'Anime name', 'Image URL', 'Genres'
            and 'Rating' for the most similar titles, or a plain string
            message when ``title`` is not in the dataset.

        Raises:
            AnimeRecommendorException: Wraps any error raised while loading
                the model or building the recommendations.
        """
        try:
            logging.info(f"Loading model from {model_path}")
            # Load the model (TF-IDF and cosine similarity matrix)
            # NOTE(review): joblib.load unpickles the file; only load model
            # files from a trusted location.
            with open(model_path, 'rb') as f:
                self.tfv, self.cosine_sim = joblib.load(f)
            logging.info("Model loaded successfully")

            # Check if the DataFrame is loaded
            if self.df is None:
                logging.error("The DataFrame is not loaded, cannot make recommendations.")
                raise ValueError("The DataFrame is not loaded, cannot make recommendations.")

            if title not in self.indices.index:
                logging.warning(f"Anime title '{title}' not found in dataset")
                return f"Anime title '{title}' not found in the dataset."

            query_position = int(self.indices[title])
            similarity_row = self.cosine_sim[query_position]

            # Rank every other row by similarity. The query row is excluded
            # explicitly: titles with identical genres tie at the top score,
            # so the query is not guaranteed to be the first sorted element.
            ranked_positions = sorted(
                (pos for pos in range(len(similarity_row)) if pos != query_position),
                key=lambda pos: similarity_row[pos],
                reverse=True,
            )
            anime_indices = ranked_positions[:max(n_recommendations, 0)]

            logging.info("Recommendations generated successfully")
            recommended = self.df.iloc[anime_indices]
            return pd.DataFrame({
                'Anime name': recommended['name'].values,
                'Image URL': recommended['image url'].values,
                'Genres': recommended['genres'].values,
                'Rating': recommended['average_rating'].values,
            })
        except Exception as e:
            raise AnimeRecommendorException(e) from e