|
import pandas as pd |
|
import logging |
|
import os |
|
from sklearn.preprocessing import LabelEncoder |
|
|
|
|
|
# Configure the root logger once at import time so every message emitted by
# this module carries a timestamp and a severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
|
|
class DataPreprocessor:
    """Clean and encode a customer churn DataFrame.

    The pipeline drops the ``CustomerID`` column, drops rows containing null
    values, label-encodes ``Subscription Type`` and ``Contract Length``, maps
    ``Gender`` to 1/0 and optionally writes the result to CSV.

    Note that the steps modify ``self.data`` in place, and ``self.data`` is the
    very object handed to the constructor, so the caller's DataFrame changes
    as well. Pass ``df.copy()`` if the original must be preserved.

    Attributes
    ----------
    data : pd.DataFrame
        The DataFrame being processed.
    label_encoders : dict
        Fitted ``LabelEncoder`` objects keyed by the column they encoded.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initializes the DataPreprocessor with data.

        Parameters
        ----------
        data : pd.DataFrame
            The customer churn data to preprocess.
        """
        self.data = data
        self.label_encoders = {}
        logging.info("DataPreprocessor initialized with data of shape: %s", data.shape)

    def drop_customer_id(self) -> None:
        """Drop the CustomerID column if it exists in the DataFrame."""
        if 'CustomerID' in self.data.columns:
            self.data.drop(columns=['CustomerID'], inplace=True)
            logging.info("Dropped CustomerID column.")
        else:
            logging.warning("CustomerID column not found.")

    def drop_null_values(self) -> None:
        """Drop rows with any null values in the DataFrame.

        The logged figure is the number of rows removed, not the number of
        null cells: a single row may hold several nulls.
        """
        rows_with_nulls = int(self.data.isnull().any(axis=1).sum())
        if rows_with_nulls:
            self.data.dropna(inplace=True)
            logging.info("Dropped %d rows with null values.", rows_with_nulls)
        else:
            logging.info("No null values to drop.")

    def encode_categorical_columns(self) -> None:
        """
        Encode categorical features: Subscription Type and Contract Length.

        Uses a separate LabelEncoder for each column and keeps the fitted
        encoder in ``self.label_encoders`` under the column name. Columns that
        are absent are skipped with a warning.
        """
        for column in ['Subscription Type', 'Contract Length']:
            if column not in self.data.columns:
                logging.warning("%s column not found for encoding.", column)
                continue

            le = LabelEncoder()
            # Values are cast to str first so the encoder always sees one
            # uniform type.
            self.data[column] = le.fit_transform(self.data[column].astype(str))
            self.label_encoders[column] = le
            logging.info("Encoded %s with labels: %s", column, le.classes_)

    def map_gender(self) -> None:
        """Map Gender to binary values: Male - 1, Female - 0.

        Any other value has no entry in the mapping and becomes NaN; a warning
        reports how many such values were found so they do not go unnoticed.
        """
        if 'Gender' not in self.data.columns:
            logging.warning("Gender column not found for mapping.")
            return

        original = self.data['Gender']
        mapped = original.map({'Male': 1, 'Female': 0})
        # Count only values that were present before mapping and lost by it.
        unmapped = int((mapped.isna() & original.notna()).sum())

        self.data['Gender'] = mapped
        logging.info("Mapped Gender: Male - 1, Female - 0.")
        if unmapped:
            logging.warning(
                "Gender has %d value(s) that could not be mapped and are now NaN.",
                unmapped,
            )

    def save_processed_data(self, output_directory='processed_data', filename='processed_data.csv'):
        """
        Save the processed data to a CSV file (without the index).

        Parameters
        ----------
        output_directory : str, optional
            The directory to save the processed data (default is
            'processed_data'). It is created if missing. An empty string
            writes into the current working directory.
        filename : str, optional
            The name of the output CSV file (default is 'processed_data.csv').
        """
        # os.makedirs('') raises, so only create a directory when one is named.
        if output_directory:
            os.makedirs(output_directory, exist_ok=True)
        processed_csv_path = os.path.join(output_directory, filename)
        self.data.to_csv(processed_csv_path, index=False)
        logging.info("Processed data saved to %s", processed_csv_path)

    def process_data(
        self,
        *,
        save: bool = True,
        output_directory: str = 'processed_data',
        filename: str = 'processed_data.csv',
    ) -> pd.DataFrame:
        """Execute the full preprocessing pipeline and return the processed DataFrame.

        Parameters
        ----------
        save : bool, optional
            Whether to write the result to CSV (default is True, which matches
            calling ``process_data()`` with no arguments).
        output_directory : str, optional
            Directory passed to ``save_processed_data`` when ``save`` is True.
        filename : str, optional
            File name passed to ``save_processed_data`` when ``save`` is True.

        Returns
        -------
        pd.DataFrame
            ``self.data`` after all steps have run.
        """
        self.drop_customer_id()
        self.drop_null_values()
        self.encode_categorical_columns()
        self.map_gender()
        if save:
            self.save_processed_data(output_directory=output_directory, filename=filename)
        logging.info("Data preprocessing completed.")
        return self.data
|
|
|
|
|
if __name__ == '__main__':
    # Nothing runs when the module is executed as a script; the class is
    # meant to be imported and driven by other code.
    pass
|
|