# Source: app.py by huntrezz -- commit ad5a6b0 ("Update app.py", verified), 6.33 kB.
# (Hugging Face file-viewer header, kept as a comment so the module parses as Python.)
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import gradio as gr
import joblib
class FastAIWrapper(BaseEstimator, RegressorMixin):
    """Scikit-learn style regressor facade around an existing learner object.

    The wrapped ``learn`` object is only used for inference here; it must
    provide ``dls.test_dl(...)`` and ``get_preds(dl=...)``.
    NOTE(review): presumably a fastai tabular ``Learner`` -- confirm against
    the training code.
    """

    def __init__(self, learn):
        # Stored under the same name as the constructor argument.
        self.learn = learn

    def fit(self, X, y):
        """Do nothing and return ``self``; no training happens in this wrapper."""
        return self

    def predict(self, X):
        """Return the learner's predictions for ``X`` as a flat NumPy array."""
        test_loader = self.learn.dls.test_dl(X)
        # get_preds yields a pair; only the first element (predictions) is used.
        raw_preds, _unused = self.learn.get_preds(dl=test_loader)
        return raw_preds.numpy().flatten()
# Load the payroll dataset and the trained ensemble once, at import time.
# Both are module-level globals read by predict_total_pay on every request.
# low_memory=False turns off pandas' chunked parsing, so each column's dtype is
# inferred from the whole file rather than chunk by chunk.
df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code -- only load model files from a trusted source.
# Presumably the pickled ensemble references FastAIWrapper, which would be why
# that class is defined in this module before this line -- confirm.
ensemble = joblib.load('ensemble_model.joblib')
# Features the UI does not collect; they are imputed from payroll rows.
# The tuple order is the order in which the columns are added to the sample.
_MODE_IMPUTED_COLUMNS = ('EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO')
_MEAN_IMPUTED_COLUMNS = ('REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY')


def _most_frequent(series):
    """Return the most frequent non-null value of *series*, or NaN if it has none."""
    modes = series.mode()
    # mode() drops nulls, so an all-null column yields an empty result.
    return modes.iloc[0] if len(modes) > 0 else np.nan


def _first_available(scopes, column, statistic):
    """Return *statistic* of *column* from the first scope where it is not missing."""
    for scope in scopes:
        value = statistic(scope[column])
        if pd.notna(value):
            return value
    return np.nan


def predict_total_pay(gender, job_title, ethnicity):
    """Predict total pay for an employee profile using the loaded ensemble.

    Only three attributes are supplied by the caller; every other model
    feature is imputed from the payroll DataFrame ``df``. Imputation uses the
    narrowest non-empty set of rows, in this order:

    1. rows matching gender, job title and ethnicity exactly,
    2. rows matching the job title only,
    3. the whole dataset.

    Categorical features take the most frequent value and pay components take
    the mean. If a column has no usable value in one scope (all nulls), the
    next broader scope is consulted for that column instead of failing; if no
    scope has a value the feature is left as NaN.

    Args:
        gender: Value compared against the ``GENDER`` column.
        job_title: Value compared against the ``JOB_TITLE`` column.
        ethnicity: Value compared against the ``ETHNICITY`` column.

    Returns:
        The first element of ``ensemble.predict`` for the single-row sample.
    """
    sample = pd.DataFrame({
        'GENDER': [gender],
        'JOB_TITLE': [job_title],
        'ETHNICITY': [ethnicity],
    })

    # Filter by job title once and derive the exact match from that subset.
    same_job = df[df['JOB_TITLE'] == job_title]
    exact = same_job[(same_job['GENDER'] == gender) & (same_job['ETHNICITY'] == ethnicity)]
    scopes = [scope for scope in (exact, same_job, df) if not scope.empty]

    for column in _MODE_IMPUTED_COLUMNS:
        sample[column] = [_first_available(scopes, column, _most_frequent)]
    for column in _MEAN_IMPUTED_COLUMNS:
        sample[column] = [_first_available(scopes, column, pd.Series.mean)]

    # Always predict for the most recent year present in the dataset.
    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]

    # Derived features. The +1 keeps the ratio finite when there is no
    # overtime or other pay.
    non_regular_pay = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (non_regular_pay + 1)
    sample['TOTAL_NON_REGULAR_PAY'] = non_regular_pay

    # Force categorical columns to 'object' dtype to prevent type issues with
    # the model (e.g. a numeric-looking DEPARTMENT_NO).
    categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', *_MODE_IMPUTED_COLUMNS]
    for column in categorical_columns:
        sample[column] = sample[column].astype('object')

    return ensemble.predict(sample)[0]
# Prepare dropdown options from the non-null values present in the dataset.
genders = df['GENDER'].dropna().unique().tolist()
ethnicities = df['ETHNICITY'].dropna().unique().tolist()
job_titles = sorted(df['JOB_TITLE'].dropna().unique().tolist())


def _predict_from_ui(gender, ethnicity, job_title):
    """Forward the UI inputs to predict_total_pay with the arguments matched by name.

    Gradio passes component values to ``fn`` positionally, in the order of
    ``inputs`` (gender, ethnicity, job title). predict_total_pay takes
    (gender, job_title, ethnicity), so wiring it in directly swaps the
    ethnicity and job-title values. Passing by keyword removes the dependence
    on ordering.
    """
    return predict_total_pay(gender=gender, job_title=job_title, ethnicity=ethnicity)


# Create Gradio interface. The parameter order of _predict_from_ui must match
# the order of the components in ``inputs``.
iface = gr.Interface(
    fn=_predict_from_ui,
    inputs=[
        gr.Dropdown(choices=genders, label="Gender"),
        gr.Dropdown(choices=ethnicities, label="Ethnicity"),
        gr.Dropdown(choices=job_titles, label="Job Title"),
    ],
    outputs=gr.Textbox(label="Predicted Total Pay"),
    title="LA City Employee Pay Predictor",
    description="Predict the total pay for LA City employees based on gender, ethnicity, and job title.",
)
iface.launch()