Update data_preparation.py
Browse files- data_preparation.py +83 -20
data_preparation.py
CHANGED
|
@@ -1,20 +1,83 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import numpy as np
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
def data_imp():
    """Return the static metadata used by the app's three demo datasets.

    Returns
    -------
    tuple(dict, dict, dict, dict)
        ``(feature_descriptions, insurance_defaults, banking_defaults,
        retail_defaults)`` — human-readable descriptions of the insurance
        features, plus default input values for the insurance, banking
        and wholesale-retail datasets respectively.
    """
    # Human-readable explanation of each insurance-dataset column.
    feature_descriptions = {
        "CustID": "Unique identifier for each customer.",
        "FirstPolYear": "Year when the customer first bought an insurance policy.",
        "BirthYear": "Birth year of the customer, used to calculate age.",
        "EducDeg": "Highest educational degree obtained by the customer.",
        "MonthSal": "Monthly salary of the customer. (Numerical, float64)",
        "GeoLivArea": "Geographical area where the customer lives.",
        "Children": "Number of children the customer has.",
        "CustMonVal": "Total monetary value of the customer to the company.",
        "ClaimsRate": "Rate at which the customer files insurance claims.",
        "PremMotor": "Premium amount for motor insurance.",
        "PremHousehold": "Premium amount for household insurance.",
        "PremHealth": "Premium amount for health insurance.",
        "PremLife": "Premium amount for life insurance.",
        "PremWork": "Premium amount for work insurance.",
    }

    # Default input values for the insurance dataset.
    insurance_defaults = {
        "FirstPolYear": 1999,
        "BirthYear": 1980,
        "MonthSal": 1000,
        "GeoLivArea": 0,  # valid options: 0, 1, 2, 3
        "Children": 0,    # valid options: 0, 1, 2
        "CustMonVal": 100,
        "ClaimsRate": 2.33,
    }
    # Every premium column shares the same default, so add them in one go.
    insurance_defaults.update(dict.fromkeys(
        ("PremMotor", "PremHousehold", "PremHealth", "PremLife", "PremWork"),
        200,
    ))

    # Default input values for the banking (credit-card) dataset.
    banking_defaults = {
        "BALANCE": 2000,
        "BALANCE_FREQUENCY": 0.5,
        "PURCHASES": 500,
        "ONEOFF_PURCHASES": 0,
        "INSTALLMENTS_PURCHASES": 0,
        "CASH_ADVANCE": 200,
        "PURCHASES_FREQUENCY": 0.1,
        "ONEOFF_PURCHASES_FREQUENCY": 0.1,
        "PURCHASES_INSTALLMENTS_FREQUENCY": 0.5,
        "CASH_ADVANCE_FREQUENCY": 5,
        "CASH_ADVANCE_TRX": 5,
        "PURCHASES_TRX": 5,
        "CREDIT_LIMIT": 10000,
        "PAYMENTS": 500,
        "MINIMUM_PAYMENTS": 130,
        "PRC_FULL_PAYMENT": 0.22,
        "TENURE": 10,
    }

    # Default input values for the wholesale-retail dataset.
    retail_defaults = dict(
        Fresh=6000,
        Milk=9000,
        Grocery=9000,
        Frozen=4000,
        Detergents_Paper=4000,
        Delicassen=2000,
    )

    return feature_descriptions, insurance_defaults, banking_defaults, retail_defaults
|
| 66 |
+
|
| 67 |
+
def preprocess_data(data):
    """Drop identifier/grouping columns and remove outlier rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset. May contain any of the columns ``CustID``,
        ``Channel`` or ``Region``, which are identifiers rather than
        features and are removed when present.

    Returns
    -------
    pandas.DataFrame
        New frame (input is not mutated) without the identifier columns
        and without rows flagged as outliers by ``remove_outliers``.
    """
    # errors='ignore' makes the drop a no-op for columns that are not
    # present, collapsing the three repetitive `if col in data.columns`
    # checks into a single call with identical behavior.
    data = data.drop(columns=['CustID', 'Channel', 'Region'], errors='ignore')

    # Filter out rows that are extreme in any numeric column.
    data = remove_outliers(data)
    return data
|
| 78 |
+
|
| 79 |
+
def remove_outliers(df, threshold=3):
    """Drop rows whose numeric values are extreme under a z-score test.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only numeric (float/int) columns are tested.
    threshold : float, default 3
        Maximum absolute z-score a row may have in every numeric column
        to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose numeric values are all strictly within
        ``threshold`` standard deviations of the column mean.
    """
    df_numeric = df.select_dtypes(include=[float, int])
    mean = df_numeric.mean()
    std = df_numeric.std()
    # BUGFIX: a constant column has std == 0, which previously made every
    # z-score inf/NaN, so `(z < threshold)` was False everywhere and ALL
    # rows were silently dropped. Map zero std to 1 so constant columns
    # contribute a z-score of 0 and never disqualify a row.
    std = std.mask(std == 0, 1.0)
    z_scores = np.abs((df_numeric - mean) / std)
    # NOTE(review): rows with NaN in any numeric column compare False here
    # and are dropped as well — presumably intended; confirm with callers.
    return df[(z_scores < threshold).all(axis=1)]
|