Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,61 +1,45 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
|
|
|
| 3 |
from sklearn.ensemble import VotingRegressor
|
|
|
|
|
|
|
| 4 |
from sklearn.base import BaseEstimator, RegressorMixin
|
|
|
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
import joblib
|
| 7 |
|
| 8 |
-
|
| 9 |
-
def __init__(self, learn):
|
| 10 |
-
self.learn = learn
|
| 11 |
-
|
| 12 |
-
def fit(self, X, y):
|
| 13 |
-
return self
|
| 14 |
-
|
| 15 |
-
def predict(self, X):
|
| 16 |
-
dl = self.learn.dls.test_dl(X)
|
| 17 |
-
preds, _ = self.learn.get_preds(dl=dl)
|
| 18 |
-
return preds.numpy().flatten()
|
| 19 |
-
|
| 20 |
-
# Load your data and trained model
|
| 21 |
df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
ensemble = joblib.load('ensemble_model.joblib')
|
| 23 |
|
| 24 |
def predict_total_pay(gender, job_title, ethnicity):
|
| 25 |
-
# Function to predict total pay based on input parameters
|
| 26 |
-
# Parameters:
|
| 27 |
-
# gender: str - The gender of the employee
|
| 28 |
-
# job_title: str - The job title of the employee
|
| 29 |
-
# ethnicity: str - The ethnicity of the employee
|
| 30 |
-
|
| 31 |
-
# Create a sample input DataFrame with the given parameters
|
| 32 |
-
# This will be used as input for the prediction model
|
| 33 |
sample = pd.DataFrame({
|
| 34 |
'GENDER': [gender],
|
| 35 |
'JOB_TITLE': [job_title],
|
| 36 |
'ETHNICITY': [ethnicity],
|
| 37 |
})
|
| 38 |
-
|
| 39 |
-
# Filter the main DataFrame (df) to find exact matches for the input combination
|
| 40 |
-
# This creates a subset of data that matches all three input parameters
|
| 41 |
-
group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
|
| 42 |
|
|
|
|
| 43 |
if len(group) > 0:
|
| 44 |
-
# If exact matches are found, use their statistics to populate the sample
|
| 45 |
-
# For categorical variables, use the mode (most frequent value)
|
| 46 |
sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
|
| 47 |
sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
|
| 48 |
sample['MOU'] = [group['MOU'].mode().iloc[0]]
|
| 49 |
sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
|
| 50 |
-
# For numerical variables, use the mean
|
| 51 |
sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
|
| 52 |
sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
|
| 53 |
sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
|
| 54 |
else:
|
| 55 |
-
# If no exact match is found, try to find a broader match based on job_title
|
| 56 |
job_group = df[df['JOB_TITLE'] == job_title]
|
| 57 |
if len(job_group) > 0:
|
| 58 |
-
# If job title matches are found, use their statistics
|
| 59 |
sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
|
| 60 |
sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
|
| 61 |
sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
|
|
@@ -64,7 +48,6 @@ def predict_total_pay(gender, job_title, ethnicity):
|
|
| 64 |
sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
|
| 65 |
sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
|
| 66 |
else:
|
| 67 |
-
# If no job title match is found, use overall statistics from the entire dataset
|
| 68 |
sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
|
| 69 |
sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
|
| 70 |
sample['MOU'] = [df['MOU'].mode().iloc[0]]
|
|
@@ -72,26 +55,16 @@ def predict_total_pay(gender, job_title, ethnicity):
|
|
| 72 |
sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
|
| 73 |
sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
|
| 74 |
sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
|
| 75 |
-
|
| 76 |
-
# Set PAY_YEAR to the most recent year in the dataset
|
| 77 |
-
sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
|
| 78 |
|
| 79 |
-
|
| 80 |
-
# PAY_RATIO: Ratio of regular pay to other types of pay
|
| 81 |
sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
|
| 82 |
-
# TOTAL_NON_REGULAR_PAY: Sum of overtime pay and all other pay
|
| 83 |
sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
|
| 84 |
-
|
| 85 |
-
# Ensure all categorical columns are of type 'object' to prevent type issues with the model
|
| 86 |
categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
|
| 87 |
for col in categorical_columns:
|
| 88 |
sample[col] = sample[col].astype('object')
|
| 89 |
-
|
| 90 |
-
# Use the ensemble model to make a prediction
|
| 91 |
-
# The model takes the sample DataFrame as input and returns a predicted total pay
|
| 92 |
-
prediction = ensemble.predict(sample)[0]
|
| 93 |
|
| 94 |
-
|
| 95 |
return prediction
|
| 96 |
|
| 97 |
def gradio_predict(gender, ethnicity, job_title):
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
| 4 |
from sklearn.ensemble import VotingRegressor
|
| 5 |
+
from sklearn.linear_model import LinearRegression
|
| 6 |
+
from sklearn.tree import DecisionTreeRegressor
|
| 7 |
from sklearn.base import BaseEstimator, RegressorMixin
|
| 8 |
+
from sklearn.compose import ColumnTransformer
|
| 9 |
+
from sklearn.pipeline import Pipeline
|
| 10 |
import gradio as gr
|
| 11 |
import joblib
|
| 12 |
|
| 13 |
+
# Load data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
|
| 15 |
+
df = df.replace([np.inf, -np.inf], np.nan)
|
| 16 |
+
|
| 17 |
+
# Define categorical and continuous variables
|
| 18 |
+
cat_names = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'GENDER', 'ETHNICITY', 'JOB_TITLE', 'DEPARTMENT_NO']
|
| 19 |
+
cont_names = ['PAY_YEAR', 'REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY', 'PAY_RATIO', 'TOTAL_NON_REGULAR_PAY']
|
| 20 |
+
|
| 21 |
+
# Load the trained model
|
| 22 |
ensemble = joblib.load('ensemble_model.joblib')
|
| 23 |
|
| 24 |
def predict_total_pay(gender, job_title, ethnicity):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
sample = pd.DataFrame({
|
| 26 |
'GENDER': [gender],
|
| 27 |
'JOB_TITLE': [job_title],
|
| 28 |
'ETHNICITY': [ethnicity],
|
| 29 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
|
| 32 |
if len(group) > 0:
|
|
|
|
|
|
|
| 33 |
sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
|
| 34 |
sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
|
| 35 |
sample['MOU'] = [group['MOU'].mode().iloc[0]]
|
| 36 |
sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
|
|
|
|
| 37 |
sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
|
| 38 |
sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
|
| 39 |
sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
|
| 40 |
else:
|
|
|
|
| 41 |
job_group = df[df['JOB_TITLE'] == job_title]
|
| 42 |
if len(job_group) > 0:
|
|
|
|
| 43 |
sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
|
| 44 |
sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
|
| 45 |
sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
|
|
|
|
| 48 |
sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
|
| 49 |
sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
|
| 50 |
else:
|
|
|
|
| 51 |
sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
|
| 52 |
sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
|
| 53 |
sample['MOU'] = [df['MOU'].mode().iloc[0]]
|
|
|
|
| 55 |
sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
|
| 56 |
sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
|
| 57 |
sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
|
|
|
|
| 60 |
sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
|
|
|
|
| 61 |
sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
|
| 62 |
+
|
|
|
|
| 63 |
categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
|
| 64 |
for col in categorical_columns:
|
| 65 |
sample[col] = sample[col].astype('object')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
prediction = ensemble.predict(sample)[0]
|
| 68 |
return prediction
|
| 69 |
|
| 70 |
def gradio_predict(gender, ethnicity, job_title):
|