|
import pandas as pd
|
|
import numpy as np
|
|
|
|
def preprocess_data(data):
    """Drop the bookkeeping columns and filter out outlier rows.

    The columns ``CustID``, ``Channel`` and ``Region`` are removed when they
    are present; any of them that is absent is simply skipped. The remaining
    frame is then passed through ``remove_outliers`` with its default
    threshold.

    Args:
        data: DataFrame to clean. It is not modified in place.

    Returns:
        A new DataFrame without the listed columns and without the rows
        rejected by ``remove_outliers``.
    """
    unwanted = ('CustID', 'Channel', 'Region')

    # Only ask pandas to drop labels that actually exist, so a frame that
    # lacks some (or all) of them passes through without raising.
    present = [label for label in unwanted if label in data.columns]
    if present:
        data = data.drop(columns=present)

    return remove_outliers(data)
|
|
|
|
def remove_outliers(df, threshold=3):
    """Return the rows of *df* whose numeric values all lie within *threshold* z-scores.

    For every float/int column the absolute z-score
    ``|x - mean| / std`` is computed (pandas' default sample standard
    deviation). A row is kept only if its z-score is strictly below
    *threshold* in every tested column.

    Columns whose standard deviation is zero or undefined (constant columns,
    all-NaN columns, or any column of a single-row frame) cannot produce a
    meaningful z-score and are excluded from the test. Without this, their
    NaN z-scores would compare False against the threshold and every row of
    the frame would be discarded.

    Args:
        df: Input DataFrame. Non-numeric columns are ignored by the test but
            are carried through to the result.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        A DataFrame containing the surviving rows with all original columns.
        Rows with a missing value in a tested column are dropped, because a
        NaN z-score never satisfies ``< threshold``.
    """
    numeric = df.select_dtypes(include=[float, int])

    spread = numeric.std()
    # NaN > 0 is False, so this mask rejects both zero and undefined spreads.
    # A positional boolean mask is used so empty frames are handled uniformly.
    has_spread = (spread > 0).to_numpy()
    numeric = numeric.loc[:, has_spread]
    spread = spread[has_spread]

    z_scores = ((numeric - numeric.mean()) / spread).abs()

    # With no testable columns, .all(axis=1) is True for every row, so the
    # frame is returned unfiltered.
    keep = (z_scores < threshold).all(axis=1)
    return df[keep]
|
|
|