import logging

import numpy as np
import pandas as pd

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger(__name__)


class OutlierDetector:
    """Remove rows containing outliers from a DataFrame.

    Outlier statistics are computed on the numeric columns only; rows are
    then dropped from the full frame, so non-numeric columns are carried
    through untouched for the rows that survive.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initializes the OutlierDetector with data.

        Parameters:
        data : pd.DataFrame
            The data for outlier detection.
        """
        self.data = data
        logger.info("OutlierDetector initialized with data of shape: %s", data.shape)

    def _numeric_columns(self) -> pd.DataFrame:
        """Return the numeric columns of ``self.data`` (may have zero columns)."""
        return self.data.select_dtypes(include=[np.number])

    def _drop_flagged_rows(self, flags: pd.DataFrame, method: str) -> pd.DataFrame:
        """Log how many cells are flagged and return ``self.data`` without flagged rows.

        Parameters:
        flags : pd.DataFrame
            Boolean frame, indexed like ``self.data``, True where a cell is an outlier.
        method : str
            Method name inserted into the log message.
        """
        logger.info("Detected %d outliers using %s method.", int(flags.sum().sum()), method)
        if flags.shape[1] == 0:
            # No numeric columns means nothing can be flagged; keep every row.
            return self.data.copy()
        return self.data.loc[~flags.any(axis=1)]

    def z_score_outlier_detection(self, threshold: float = 3.0) -> pd.DataFrame:
        """Detect outliers using Z-Score method.

        Parameters:
        threshold : float
            A cell is an outlier when its absolute Z-Score exceeds this value.

        Returns:
        pd.DataFrame
            ``self.data`` without the rows that contain at least one outlier.
        """
        logger.info("Calculating Z-Scores for outlier detection.")
        numeric = self._numeric_columns()
        # A constant column has std == 0, giving NaN scores; NaN > threshold is
        # False, so such columns (and NaN cells) never flag a row.
        z_scores = np.abs((numeric - numeric.mean()) / numeric.std())
        return self._drop_flagged_rows(z_scores > threshold, "Z-Score")

    def iqr_outlier_detection(self, multiplier: float = 1.5) -> pd.DataFrame:
        """Detect outliers using IQR method.

        Parameters:
        multiplier : float
            Width of the fences in units of IQR. A cell is an outlier when it
            lies below ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR``.

        Returns:
        pd.DataFrame
            ``self.data`` without the rows that contain at least one outlier.
        """
        logger.info("Calculating IQR for outlier detection.")
        numeric = self._numeric_columns()
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - multiplier * iqr
        upper_fence = q3 + multiplier * iqr
        flags = (numeric < lower_fence) | (numeric > upper_fence)
        return self._drop_flagged_rows(flags, "IQR")

    def run_outlier_detection(self) -> pd.DataFrame:
        """Run all outlier detection methods and return cleaned data.

        Both methods are executed (each logs its own outlier count), but only
        the IQR-cleaned frame is returned. Call ``z_score_outlier_detection``
        directly to obtain the Z-Score-cleaned frame.

        Returns:
        pd.DataFrame
            ``self.data`` with IQR outlier rows removed.
        """
        logger.info("Starting outlier detection steps.")

        # Only numerical columns take part in outlier detection
        logger.info(
            "Selected numerical columns for outlier detection: %s",
            self._numeric_columns().columns.tolist(),
        )

        # Z-Score Method (run for its logged outlier count; result is not returned)
        self.z_score_outlier_detection()

        # IQR Method
        cleaned_data_iqr = self.iqr_outlier_detection()

        logger.info("Outlier detection completed.")

        # Return the IQR-cleaned DataFrame
        return cleaned_data_iqr


# Usage Example
if __name__ == '__main__':
    # Sample data
    # try:
    #     df = pd.read_csv("extracted/customer_churn_dataset-training-master.csv")
    #     logging.info("Loaded dataset with shape: %s", df.shape)

    #     # Initialize the outlier detector
    #     detector = OutlierDetector(df)

    #     # Each method returns its own cleaned DataFrame
    #     logging.info("Cleaned Data (Z-Score):")
    #     print(detector.z_score_outlier_detection().head())

    #     logging.info("Cleaned Data (IQR):")
    #     print(detector.iqr_outlier_detection().head())

    # except Exception as e:
    #     logging.error("An error occurred: %s", e)
    pass