# Source path: streamlitwebapp/src/outlier_detection.py
# Uploaded by: Sarathkumar1304ai
# Commit message: "Upload 91 files"
# Commit: 2953afe (verified)
import pandas as pd
import numpy as np
import logging
# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
class OutlierDetector:
    """Drop outlier rows from a DataFrame using Z-Score or IQR rules.

    Detection is carried out on the numeric columns only. Non-numeric
    columns are ignored while looking for outliers but are preserved in
    the rows that are returned.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initializes the OutlierDetector with data.

        Parameters:
        data : pd.DataFrame
            The data for outlier detection.
        """
        self.data = data
        logging.info("OutlierDetector initialized with data of shape: %s", data.shape)

    def _numeric_data(self) -> pd.DataFrame:
        """Return only the numeric columns of the stored data."""
        return self.data.select_dtypes(include=[np.number])

    def z_score_outlier_detection(self, threshold: float = 3.0) -> pd.DataFrame:
        """Detect outliers using Z-Score method.

        Parameters:
        threshold : float
            A value is an outlier when its absolute Z-Score (computed per
            numeric column) is strictly greater than this threshold.

        Returns:
        pd.DataFrame
            The rows of the original data (all columns) in which no
            numeric value was flagged as an outlier.
        """
        logging.info("Calculating Z-Scores for outlier detection.")
        numeric = self._numeric_data()
        if numeric.shape[1] == 0:
            # Nothing to evaluate: no numeric columns means no outliers.
            logging.info("Detected %d outliers using Z-Score method.", 0)
            return self.data.copy()

        # A constant column has std == 0 and yields NaN scores; NaN never
        # compares greater than the threshold, so such columns flag nothing.
        z_scores = ((numeric - numeric.mean()) / numeric.std()).abs()
        outliers = z_scores > threshold
        logging.info(
            "Detected %d outliers using Z-Score method.",
            int(outliers.to_numpy().sum()),
        )
        # Filter the full frame so non-numeric columns are kept.
        return self.data[~outliers.any(axis=1)]

    def iqr_outlier_detection(self, multiplier: float = 1.5) -> pd.DataFrame:
        """Detect outliers using IQR method.

        Parameters:
        multiplier : float
            Width of the fences in units of IQR. A value is an outlier
            when it lies below Q1 - multiplier * IQR or above
            Q3 + multiplier * IQR of its column.

        Returns:
        pd.DataFrame
            The rows of the original data (all columns) in which no
            numeric value was flagged as an outlier.
        """
        logging.info("Calculating IQR for outlier detection.")
        numeric = self._numeric_data()
        if numeric.shape[1] == 0:
            # Nothing to evaluate: no numeric columns means no outliers.
            logging.info("Detected %d outliers using IQR method.", 0)
            return self.data.copy()

        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - multiplier * iqr
        upper_fence = q3 + multiplier * iqr
        outlier_condition = (numeric < lower_fence) | (numeric > upper_fence)
        logging.info(
            "Detected %d outliers using IQR method.",
            int(outlier_condition.to_numpy().sum()),
        )
        # Filter the full frame so non-numeric columns are kept.
        return self.data[~outlier_condition.any(axis=1)]

    def run_outlier_detection(self) -> pd.DataFrame:
        """Run all outlier detection methods and return cleaned data.

        Both methods are executed (each logs how many outliers it found),
        but only the IQR-cleaned DataFrame is returned.

        Returns:
        pd.DataFrame
            The data with rows containing IQR outliers removed.
        """
        logging.info("Starting outlier detection steps.")

        numerical_columns = self._numeric_data().columns.tolist()
        logging.info("Selected numerical columns for outlier detection: %s", numerical_columns)

        # Z-Score Method: run for its logged outlier count; result is not returned.
        self.z_score_outlier_detection()

        # IQR Method
        cleaned_data_iqr = self.iqr_outlier_detection()

        logging.info("Outlier detection completed.")

        # Return the IQR-cleaned DataFrame
        return cleaned_data_iqr
# Usage example (disabled): running this module as a script is a no-op.
if __name__ == "__main__":
    # To try the detector by hand, load a dataset and run it, for example:
    #
    #     try:
    #         df = pd.read_csv("extracted/customer_churn_dataset-training-master.csv")
    #         logging.info("Loaded dataset with shape: %s", df.shape)
    #
    #         # Initialize the outlier detector
    #         detector = OutlierDetector(df)
    #
    #         # run_outlier_detection() returns a single DataFrame
    #         # (the IQR-cleaned rows), not a dictionary.
    #         cleaned_data = detector.run_outlier_detection()
    #         logging.info("Cleaned Data (IQR):")
    #         print(cleaned_data.head())
    #
    #         # The Z-Score result is available from its own method.
    #         logging.info("Cleaned Data (Z-Score):")
    #         print(detector.z_score_outlier_detection().head())
    #     except Exception as e:
    #         logging.error("An error occurred: %s", e)
    pass