"""Preprocessing helpers: drop selected columns and filter rows by z-score."""

import pandas as pd
import numpy as np

# Columns removed by preprocess_data before outlier filtering.
_DROPPED_COLUMNS = ("CustID", "Channel", "Region")


def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Drop the CustID/Channel/Region columns and remove outlier rows.

    Args:
        data: Input frame. Any of the ``CustID``, ``Channel`` and ``Region``
            columns that are present are removed; absent ones are ignored.

    Returns:
        A new frame without those columns, restricted to the rows kept by
        :func:`remove_outliers` with its default threshold. The input frame
        is not modified.
    """
    # errors="ignore" skips labels that are not present, replacing the
    # per-column membership checks.
    trimmed = data.drop(columns=list(_DROPPED_COLUMNS), errors="ignore")
    return remove_outliers(trimmed)


def remove_outliers(df: pd.DataFrame, threshold: float = 3) -> pd.DataFrame:
    """Keep rows whose float/int columns all lie within ``threshold`` z-scores.

    The z-score of each value is computed per column from the column mean and
    standard deviation (pandas defaults). A row is kept only when every
    checked column has ``|z| < threshold``.

    Columns whose standard deviation is zero or undefined (a constant column,
    an all-missing column, or a frame with fewer than two rows) cannot yield
    a meaningful z-score and are left out of the check; previously they
    produced NaN z-scores and caused every row to be discarded.

    Rows with a missing value in a checked column are dropped, because a NaN
    z-score never compares as below the threshold.

    Args:
        df: Frame to filter. Columns that are not float/int are never
            inspected but are retained in the result.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        The subset of ``df`` rows that pass the check, with all original
        columns and the original index labels.
    """
    df_numeric = df.select_dtypes(include=[float, int])

    std = df_numeric.std()
    # NaN > 0 is False, so this also filters out undefined deviations.
    # A positional mask avoids label alignment on the column axis.
    has_spread = (std > 0).to_numpy()
    df_checked = df_numeric.loc[:, has_spread]

    z_scores = np.abs((df_checked - df_checked.mean()) / std[has_spread])

    # With no checkable columns, all(axis=1) is True for every row, so the
    # frame is returned unfiltered.
    within_threshold = (z_scores < threshold).all(axis=1)
    return df[within_threshold]