File size: 5,312 Bytes
b4f6ffc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2773c3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import sys 
import numpy as np
import pandas as pd 
from anime_recommender.loggers.logging import logging
from anime_recommender.exception.exception import AnimeRecommendorException
from anime_recommender.utils.main_utils.utils import export_data_to_dataframe
from anime_recommender.constant import *
from anime_recommender.entity.config_entity import DataTransformationConfig
from anime_recommender.entity.artifact_entity import DataIngestionArtifact,DataTransformationArtifact

class DataTransformation:
    """
    Merge, clean and filter the ingested anime and user-rating data.

    The pipeline reads the two CSV files referenced by the data ingestion
    artifact, inner-joins them on ``anime_id``, converts ``average_rating``
    to a numeric column (imputing missing values with the median), keeps
    only rows rated above a threshold, drops the columns listed in
    ``_COLS_TO_DROP`` and hands the result to ``export_data_to_dataframe``.
    """

    # Columns of the merged frame that clean_filter_data discards.
    _COLS_TO_DROP = (
        'username', 'overview', 'type', 'episodes', 'producers',
        'licensors', 'studios', 'source', 'rank', 'popularity',
        'favorites', 'scored by', 'members',
    )

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact,
                 data_transformation_config: DataTransformationConfig):
        """
        Store the ingestion artifact and the transformation configuration.

        Args:
            data_ingestion_artifact (DataIngestionArtifact): Artifact whose
                ``feature_store_anime_file_path`` and
                ``feature_store_userrating_file_path`` attributes are read
                by ``initiate_data_transformation``.
            data_transformation_config (DataTransformationConfig):
                Configuration whose ``merged_file_path`` attribute is used
                as the export destination.
        """
        self.data_ingestion_artifact = data_ingestion_artifact
        self.data_transformation_config = data_transformation_config

    @staticmethod
    def read_data(file_path: str) -> pd.DataFrame:
        """
        Read a CSV file into a DataFrame.

        Args:
            file_path (str): Path to the CSV file.

        Returns:
            pd.DataFrame: The parsed contents of the file.

        Raises:
            AnimeRecommendorException: If ``pd.read_csv`` fails.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise AnimeRecommendorException(e, sys) from e

    @staticmethod
    def merge_data(anime_df: pd.DataFrame, rating_df: pd.DataFrame) -> pd.DataFrame:
        """
        Inner-join the rating and anime DataFrames on ``anime_id``.

        The rating frame is the left operand, so its columns come first in
        the result.

        Args:
            anime_df (pd.DataFrame): DataFrame containing anime information.
            rating_df (pd.DataFrame): DataFrame containing user ratings.

        Returns:
            pd.DataFrame: Rows whose ``anime_id`` appears in both inputs.

        Raises:
            AnimeRecommendorException: If the merge fails (for example
                when ``anime_id`` is missing from either input).
        """
        try:
            merged_df = pd.merge(rating_df, anime_df, on="anime_id", how="inner")
            logging.info(f"Shape of the Merged dataframe:{merged_df.shape}")
            logging.info(f"Column names: {merged_df.columns}")
            return merged_df
        except Exception as e:
            raise AnimeRecommendorException(e, sys) from e

    @staticmethod
    def clean_filter_data(merged_df: pd.DataFrame, min_rating: float = 6) -> pd.DataFrame:
        """
        Clean ``average_rating``, filter by rating and drop unused columns.

        Non-numeric ratings such as ``'UNKNOWN'`` become NaN, NaN values are
        replaced by the median of the numeric ratings, and only rows whose
        rating is strictly greater than ``min_rating`` are kept. The input
        frame is left unmodified; a new DataFrame is returned.

        Args:
            merged_df (pd.DataFrame): Merged DataFrame to clean and filter.
                It must contain ``average_rating`` and every column named
                in ``_COLS_TO_DROP``.
            min_rating (float): Exclusive lower bound on ``average_rating``.
                Defaults to 6.

        Returns:
            pd.DataFrame: Cleaned and filtered copy with a numeric
            ``average_rating`` column and without the dropped columns.

        Raises:
            AnimeRecommendorException: If a required column is missing or
                any pandas operation fails.
        """
        try:
            # errors='coerce' already maps 'UNKNOWN' (and any other
            # non-numeric token) to NaN, so no separate replace is needed.
            ratings = pd.to_numeric(merged_df['average_rating'], errors='coerce')
            # fillna returns a new Series; the result must be kept,
            # otherwise the imputation is silently lost.
            ratings = ratings.fillna(ratings.median())
            keep = ratings > min_rating

            # Build the result from the filtered rows so the caller's frame
            # is never written to; assign() keeps the column's position.
            cleaned_df = (
                merged_df.loc[keep]
                .drop(columns=list(DataTransformation._COLS_TO_DROP))
                .assign(average_rating=ratings[keep])
            )
            logging.info(f"Shape of the cleaned dataframe:{cleaned_df.shape}")
            logging.info(f"Column names: {cleaned_df.columns}")
            logging.info(f"Preview of the cleaned DataFrame:\n{cleaned_df.head()}")
            return cleaned_df
        except Exception as e:
            raise AnimeRecommendorException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """
        Run the full transformation: read, merge, clean/filter and export.

        Returns:
            DataTransformationArtifact: Artifact carrying
            ``merged_file_path``, the path passed to
            ``export_data_to_dataframe`` for the transformed data.

        Raises:
            AnimeRecommendorException: If any step of the pipeline fails.
        """
        logging.info("Entering initiate_data_transformation method of DataTransformation class.")
        try:
            anime_df = DataTransformation.read_data(self.data_ingestion_artifact.feature_store_anime_file_path)
            rating_df = DataTransformation.read_data(self.data_ingestion_artifact.feature_store_userrating_file_path)
            merged_df = DataTransformation.merge_data(anime_df, rating_df)
            transformed_df = DataTransformation.clean_filter_data(merged_df)

            # NOTE(review): export_data_to_dataframe is defined elsewhere;
            # presumably it writes the frame to merged_file_path - confirm.
            export_data_to_dataframe(transformed_df, self.data_transformation_config.merged_file_path)
            data_transformation_artifact = DataTransformationArtifact(
                merged_file_path=self.data_transformation_config.merged_file_path
            )

            return data_transformation_artifact
        except Exception as e:
            raise AnimeRecommendorException(e, sys) from e