Spaces:

huntrezz
/

LACityEmployeePayPredictor

Sleeping

App Files Community

huntrezz committed on Sep 18, 2024

Commit

060d492

verified ·

1 Parent(s): 402148f

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -44

app.py CHANGED Viewed

@@ -1,61 +1,45 @@
 import pandas as pd
 import numpy as np
 from sklearn.ensemble import VotingRegressor
 from sklearn.base import BaseEstimator, RegressorMixin
 import gradio as gr
 import joblib
-class FastAIWrapper(BaseEstimator, RegressorMixin):
-    def __init__(self, learn):
-        self.learn = learn
-    def fit(self, X, y):
-        return self
-    def predict(self, X):
-        dl = self.learn.dls.test_dl(X)
-        preds, _ = self.learn.get_preds(dl=dl)
-        return preds.numpy().flatten()
-# Load your data and trained model
 df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
 ensemble = joblib.load('ensemble_model.joblib')
 def predict_total_pay(gender, job_title, ethnicity):
-    # Function to predict total pay based on input parameters
-    # Parameters:
-    #   gender: str - The gender of the employee
-    #   job_title: str - The job title of the employee
-    #   ethnicity: str - The ethnicity of the employee
-    # Create a sample input DataFrame with the given parameters
-    # This will be used as input for the prediction model
     sample = pd.DataFrame({
         'GENDER': [gender],
         'JOB_TITLE': [job_title],
         'ETHNICITY': [ethnicity],
     })
-    # Filter the main DataFrame (df) to find exact matches for the input combination
-    # This creates a subset of data that matches all three input parameters
-    group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
     if len(group) > 0:
-        # If exact matches are found, use their statistics to populate the sample
-        # For categorical variables, use the mode (most frequent value)
         sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
         sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
         sample['MOU'] = [group['MOU'].mode().iloc[0]]
         sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
-        # For numerical variables, use the mean
         sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
         sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
         sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
     else:
-        # If no exact match is found, try to find a broader match based on job_title
         job_group = df[df['JOB_TITLE'] == job_title]
         if len(job_group) > 0:
-            # If job title matches are found, use their statistics
             sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
             sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
             sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
@@ -64,7 +48,6 @@ def predict_total_pay(gender, job_title, ethnicity):
             sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
             sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
         else:
-            # If no job title match is found, use overall statistics from the entire dataset
             sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
             sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
             sample['MOU'] = [df['MOU'].mode().iloc[0]]
@@ -72,26 +55,16 @@ def predict_total_pay(gender, job_title, ethnicity):
             sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
             sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
             sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
-    # Set PAY_YEAR to the most recent year in the dataset
-    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
-    # Calculate derived features
-    # PAY_RATIO: Ratio of regular pay to other types of pay
     sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
-    # TOTAL_NON_REGULAR_PAY: Sum of overtime pay and all other pay
     sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
-    # Ensure all categorical columns are of type 'object' to prevent type issues with the model
     categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
     for col in categorical_columns:
         sample[col] = sample[col].astype('object')
-    # Use the ensemble model to make a prediction
-    # The model takes the sample DataFrame as input and returns a predicted total pay
-    prediction = ensemble.predict(sample)[0]
-    # Return the predicted total pay
     return prediction
 def gradio_predict(gender, ethnicity, job_title):

 import pandas as pd
 import numpy as np
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.ensemble import VotingRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.tree import DecisionTreeRegressor
 from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
 import gradio as gr
 import joblib
+# Load data
 df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
+df = df.replace([np.inf, -np.inf], np.nan)
+# Define categorical and continuous variables
+cat_names = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'GENDER', 'ETHNICITY', 'JOB_TITLE', 'DEPARTMENT_NO']
+cont_names = ['PAY_YEAR', 'REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY', 'PAY_RATIO', 'TOTAL_NON_REGULAR_PAY']
+# Load the trained model
 ensemble = joblib.load('ensemble_model.joblib')
 def predict_total_pay(gender, job_title, ethnicity):
     sample = pd.DataFrame({
         'GENDER': [gender],
         'JOB_TITLE': [job_title],
         'ETHNICITY': [ethnicity],
     })
+    group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
     if len(group) > 0:
         sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
         sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
         sample['MOU'] = [group['MOU'].mode().iloc[0]]
         sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
         sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
         sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
         sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
     else:
         job_group = df[df['JOB_TITLE'] == job_title]
         if len(job_group) > 0:
             sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
             sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
             sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
             sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
             sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
         else:
             sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
             sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
             sample['MOU'] = [df['MOU'].mode().iloc[0]]
             sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
             sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
             sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
+    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
     sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
     sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
     categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
     for col in categorical_columns:
         sample[col] = sample[col].astype('object')
+    prediction = ensemble.predict(sample)[0]
     return prediction
 def gradio_predict(gender, ethnicity, job_title):