File size: 3,131 Bytes
92b63f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import numpy as np
import logging

# Configure the root logger as a module-level side effect (runs on import):
# records at INFO level and above are emitted, each rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class OutlierDetector:
    """Drop rows that contain outliers in any numeric column.

    Detection is performed on the numeric columns only. Non-numeric columns
    (strings, categories, ...) are ignored while searching for outliers but
    are preserved in the rows that are returned.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initializes the OutlierDetector with data.

        Parameters:

        data : pd.DataFrame
            The data for outlier detection. It is stored by reference and is
            never modified by the detection methods.
        """
        self.data = data
        logging.info("OutlierDetector initialized with data of shape: %s", data.shape)

    def _numeric_data(self) -> pd.DataFrame:
        """Return only the numeric columns of the stored data."""
        return self.data.select_dtypes(include=[np.number])

    def z_score_outlier_detection(self, threshold: float = 3.0) -> pd.DataFrame:
        """Detect outliers using Z-Score method.

        Parameters:

        threshold : float
            A value is an outlier when its absolute Z-Score (distance from the
            column mean in units of the column's sample standard deviation)
            is strictly greater than this.

        Returns:

        pd.DataFrame
            The rows of the stored data (all columns) in which no numeric
            column holds an outlier.
        """
        logging.info("Calculating Z-Scores for outlier detection.")
        # Restrict the arithmetic to numeric columns: mean/std/subtraction
        # raise TypeError on string columns.
        numeric = self._numeric_data()
        # A constant column has std == 0 and yields NaN scores; NaN never
        # compares greater than the threshold, so such columns flag nothing.
        z_scores = ((numeric - numeric.mean()) / numeric.std()).abs()
        outliers = z_scores > threshold
        logging.info("Detected %d outliers using Z-Score method.", int(outliers.to_numpy().sum()))
        # The mask shares the index of self.data, so it filters the full frame.
        return self.data[~outliers.any(axis=1)]

    def iqr_outlier_detection(self, multiplier: float = 1.5) -> pd.DataFrame:
        """Detect outliers using IQR method.

        Parameters:

        multiplier : float
            Width of the accepted band in units of the inter-quartile range.
            A value is an outlier when it lies below Q1 - multiplier * IQR or
            above Q3 + multiplier * IQR.

        Returns:

        pd.DataFrame
            The rows of the stored data (all columns) in which no numeric
            column holds an outlier.
        """
        logging.info("Calculating IQR for outlier detection.")
        # Quantiles are only defined for numeric columns.
        numeric = self._numeric_data()
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - multiplier * iqr
        upper = q3 + multiplier * iqr
        outlier_condition = (numeric < lower) | (numeric > upper)
        logging.info("Detected %d outliers using IQR method.", int(outlier_condition.to_numpy().sum()))
        return self.data[~outlier_condition.any(axis=1)]

    def run_outlier_detection(self) -> pd.DataFrame:
        """Run all outlier detection methods and return cleaned data.

        Both methods are executed so that each one logs how many outliers it
        found, but only the IQR result is returned.

        Returns:

        pd.DataFrame
            The stored data with the rows flagged by the IQR method removed.
        """
        logging.info("Starting outlier detection steps.")

        # Report which columns take part in the detection.
        numeric_columns = self._numeric_data().columns.tolist()
        logging.info("Selected numerical columns for outlier detection: %s", numeric_columns)

        # Z-Score Method: run for its logged outlier count only; its cleaned
        # frame is not part of the return value.
        self.z_score_outlier_detection()

        # IQR Method
        cleaned_data_iqr = self.iqr_outlier_detection()

        logging.info("Outlier detection completed.")

        # Return the IQR-cleaned DataFrame
        return cleaned_data_iqr

# Usage example (disabled): running this module as a script currently does nothing.
if __name__ == '__main__':
    # The sketch below is kept for reference only. run_outlier_detection()
    # returns a single DataFrame, so the result is used directly rather than
    # being indexed by method name.
    #
    # try:
    #     df = pd.read_csv("extracted/customer_churn_dataset-training-master.csv")
    #     logging.info("Loaded dataset with shape: %s", df.shape)
    #
    #     # Initialize the outlier detector
    #     detector = OutlierDetector(df)
    #
    #     # Run the outlier detection
    #     cleaned_data = detector.run_outlier_detection()
    #
    #     # Display the cleaned DataFrame
    #     logging.info("Cleaned Data:")
    #     print(cleaned_data.head())
    #
    # except Exception as e:
    #     logging.error("An error occurred: %s", e)
    pass