File size: 3,688 Bytes
92b63f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
import logging
import os
from sklearn.preprocessing import LabelEncoder

# Configure the root logger at import time: INFO and above, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class DataPreprocessor:
    """Clean and encode a customer churn DataFrame.

    The full pipeline (``process_data``) drops the ``CustomerID`` column,
    drops rows containing nulls, label-encodes ``Subscription Type`` and
    ``Contract Length``, maps ``Gender`` to 1/0 and writes the result to CSV.

    The DataFrame passed to the constructor is stored by reference and is
    modified in place by the individual steps, so the caller's object changes
    as well.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initialize the DataPreprocessor with data.

        Parameters
        ----------
        data : pd.DataFrame
            The customer churn data to preprocess. It is not copied.
        """
        self.data = data
        # Fitted LabelEncoder per encoded column name, filled in by
        # encode_categorical_columns().
        self.label_encoders = {}
        logging.info("DataPreprocessor initialized with data of shape: %s", data.shape)

    def drop_customer_id(self) -> None:
        """Drop the CustomerID column if it exists in the DataFrame."""
        if 'CustomerID' in self.data.columns:
            self.data.drop(columns=['CustomerID'], inplace=True)
            logging.info("Dropped CustomerID column.")
        else:
            logging.warning("CustomerID column not found.")

    def drop_null_values(self) -> None:
        """Drop rows with any null values in the DataFrame.

        The number logged is the number of rows actually removed, not the
        number of null cells (a single row may contain several nulls).
        """
        rows_before = len(self.data)
        # dropna is a no-op when there are no nulls, so it is safe to call
        # unconditionally and derive the count from the length difference.
        self.data.dropna(inplace=True)
        rows_dropped = rows_before - len(self.data)
        if rows_dropped > 0:
            logging.info("Dropped %d rows with null values.", rows_dropped)
        else:
            logging.info("No null values to drop.")

    def encode_categorical_columns(self) -> None:
        """
        Encode categorical features: Subscription Type and Contract Length.

        Each column that is present is cast to ``str`` and replaced by the
        output of its own ``LabelEncoder``; the fitted encoder is kept in
        ``self.label_encoders`` under the column name. Missing columns are
        skipped with a warning.
        """
        for column in ['Subscription Type', 'Contract Length']:
            if column in self.data.columns:
                le = LabelEncoder()
                self.data[column] = le.fit_transform(self.data[column].astype(str))
                self.label_encoders[column] = le
                logging.info("Encoded %s with labels: %s", column, le.classes_)
            else:
                logging.warning("%s column not found for encoding.", column)

    def map_gender(self) -> None:
        """Map Gender to binary values: Male - 1, Female - 0.

        Any other value becomes NaN (the behaviour of ``Series.map`` with a
        dict); a warning reports how many non-null values were lost that way.
        """
        if 'Gender' in self.data.columns:
            original = self.data['Gender']
            mapped = original.map({'Male': 1, 'Female': 0})
            # Values that were present before mapping but are NaN afterwards
            # did not match either key.
            unmapped = int((original.notna() & mapped.isna()).sum())
            self.data['Gender'] = mapped
            logging.info("Mapped Gender: Male - 1, Female - 0.")
            if unmapped:
                logging.warning(
                    "%d Gender value(s) were neither 'Male' nor 'Female' and were set to NaN.",
                    unmapped,
                )
        else:
            logging.warning("Gender column not found for mapping.")

    def save_processed_data(self, output_directory='processed_data', filename='processed_data.csv') -> None:
        """
        Save the processed data to a CSV file.

        The output directory is created if it does not exist; the row index
        is not written.

        Parameters
        ----------
        output_directory : str, optional
            The directory to save the processed data (default is 'processed_data').
        filename : str, optional
            The name of the output CSV file (default is 'processed_data.csv').
        """
        os.makedirs(output_directory, exist_ok=True)
        processed_csv_path = os.path.join(output_directory, filename)
        self.data.to_csv(processed_csv_path, index=False)
        logging.info("Processed data saved to %s", processed_csv_path)

    def process_data(self) -> pd.DataFrame:
        """Execute the full preprocessing pipeline and return the processed DataFrame.

        As a side effect the result is also written to the default location
        used by ``save_processed_data``.
        """
        self.drop_customer_id()
        self.drop_null_values()
        self.encode_categorical_columns()
        self.map_gender()
        self.save_processed_data()
        logging.info("Data preprocessing completed.")
        return self.data

# Example usage, kept as a comment so that running this module directly
# performs no work:
#
#     df = pd.read_csv("extracted/customer_churn_dataset-training-master.csv")
#     preprocessor = DataPreprocessor(df)
#     cleaned_df = preprocessor.process_data()
#     print(cleaned_df.head())
if __name__ == "__main__":
    pass