File size: 6,326 Bytes
ea189f9
 
a14015e
5bbeebd
ea189f9
 
 
5bbeebd
 
 
 
 
 
 
 
 
 
 
 
a14015e
ea189f9
a14015e
ea189f9
 
a14015e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea189f9
ad5a6b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea189f9
 
 
 
 
 
 
 
ad5a6b0
ea189f9
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import gradio as gr
import joblib

class FastAIWrapper(BaseEstimator, RegressorMixin):
    """Expose a learner object through the scikit-learn regressor interface.

    The wrapped object (presumably a fastai ``Learner`` -- confirm against the
    code that built the saved model) is kept on ``learn`` and is only used for
    inference; ``fit`` performs no training.
    """

    def __init__(self, learn):
        # Stored under the same name as the constructor argument.
        self.learn = learn

    def fit(self, X, y):
        """Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def predict(self, X):
        """Return the learner's predictions for ``X`` as a flat NumPy array."""
        test_loader = self.learn.dls.test_dl(X)
        outputs = self.learn.get_preds(dl=test_loader)
        # get_preds yields a pair; only the first element (predictions) is used.
        predictions, _ = outputs
        return predictions.numpy().flatten()

# Load the payroll records and the previously trained ensemble model.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
df = pd.read_csv(
    'City_Employee_Payroll__Current__20240915.csv',
    low_memory=False,
)
ensemble = joblib.load('ensemble_model.joblib')

def _fallback_frames(gender, job_title, ethnicity):
    """Return the reference frames to draw feature values from, most specific first.

    The list holds, in order and only when non-empty: the rows matching all
    three inputs, then the rows matching ``job_title`` alone. The full
    dataset ``df`` is always the last entry.
    """
    same_job = df[df['JOB_TITLE'] == job_title]
    exact = same_job[
        (same_job['GENDER'] == gender) & (same_job['ETHNICITY'] == ethnicity)
    ]
    frames = [frame for frame in (exact, same_job) if not frame.empty]
    frames.append(df)
    return frames


def _most_frequent(column, frames):
    """Return the mode of ``column`` from the first frame that has one.

    ``Series.mode()`` is empty when every value in the column is missing, so
    the broader frames are tried in turn instead of failing on ``iloc[0]``.

    Raises:
        ValueError: if no frame contains a non-null value for ``column``.
    """
    for frame in frames:
        modes = frame[column].mode()
        if not modes.empty:
            return modes.iloc[0]
    raise ValueError(f"Column {column!r} has no non-null values to fall back on.")


def predict_total_pay(gender, job_title, ethnicity):
    """Predict total pay for an employee described by three attributes.

    The remaining model features are filled in from reference rows of ``df``:
    rows matching all three inputs if any exist, otherwise rows with the same
    job title, otherwise the whole dataset.

    Parameters:
        gender: value compared against the ``GENDER`` column.
        job_title: value compared against the ``JOB_TITLE`` column.
        ethnicity: value compared against the ``ETHNICITY`` column.

    Returns:
        The first element of ``ensemble.predict`` for the one-row sample.

    Raises:
        ValueError: if a categorical feature has no non-null value anywhere
            in ``df``.
    """
    frames = _fallback_frames(gender, job_title, ethnicity)
    reference = frames[0]

    # Columns are added in a fixed order; the fitted model may depend on it.
    sample = pd.DataFrame({
        'GENDER': [gender],
        'JOB_TITLE': [job_title],
        'ETHNICITY': [ethnicity],
    })

    # Categorical features: most frequent value among the reference rows.
    for column in ('EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO'):
        sample[column] = [_most_frequent(column, frames)]

    # Numerical features: mean over the reference rows (NaN if all are missing).
    for column in ('REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY'):
        sample[column] = [reference[column].mean()]

    # Set PAY_YEAR to the most recent year in the dataset.
    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]

    # Derived features.
    # PAY_RATIO: regular pay relative to all other pay; the +1 avoids
    # division by zero when there is no other pay.
    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (
        sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1
    )
    # TOTAL_NON_REGULAR_PAY: sum of overtime pay and all other pay.
    sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']

    # Ensure all categorical columns are of type 'object' to prevent type
    # issues with the model.
    categorical_columns = [
        'GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE',
        'JOB_STATUS', 'MOU', 'DEPARTMENT_NO',
    ]
    for col in categorical_columns:
        sample[col] = sample[col].astype('object')

    # The model returns one prediction per input row; the sample has one row.
    return ensemble.predict(sample)[0]

# Build the dropdown choices from the distinct non-null values in the data.
# Gender and ethnicity keep their order of first appearance.
genders, ethnicities = (
    df[column].dropna().unique().tolist() for column in ('GENDER', 'ETHNICITY')
)
# Job titles are sorted.
job_titles = df['JOB_TITLE'].dropna().unique().tolist()
job_titles.sort()

# Create Gradio interface
# gr.Interface passes the input components' values to `fn` positionally, so
# the components are listed in the same order as predict_total_pay's
# parameters: (gender, job_title, ethnicity).
iface = gr.Interface(
    fn=predict_total_pay,
    inputs=[
        gr.Dropdown(choices=genders, label="Gender"),
        gr.Dropdown(choices=job_titles, label="Job Title"),
        gr.Dropdown(choices=ethnicities, label="Ethnicity"),
    ],
    outputs=gr.Textbox(label="Predicted Total Pay"),
    title="LA City Employee Pay Predictor",
    description="Predict the total pay for LA City employees based on gender, ethnicity, and job title."
)

iface.launch()