Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -47,9 +47,78 @@ def predict_total_pay(gender, job_title, ethnicity):
|
|
47 |
prediction = ensemble.predict(sample)[0]
|
48 |
return prediction
|
49 |
|
50 |
-
def
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
# Prepare dropdown options
|
55 |
genders = df['GENDER'].dropna().unique().tolist()
|
@@ -58,7 +127,7 @@ job_titles = sorted(df['JOB_TITLE'].dropna().unique().tolist())
|
|
58 |
|
59 |
# Create Gradio interface
|
60 |
iface = gr.Interface(
|
61 |
-
fn=
|
62 |
inputs=[
|
63 |
gr.Dropdown(choices=genders, label="Gender"),
|
64 |
gr.Dropdown(choices=ethnicities, label="Ethnicity"),
|
|
|
47 |
prediction = ensemble.predict(sample)[0]
|
48 |
return prediction
|
49 |
|
50 |
+
def _first_usable_mode(frames, column):
    """Return the most frequent value of ``column`` from the first frame that has one.

    ``Series.mode()`` ignores missing values, so it comes back empty when a
    frame holds only NaN in ``column``; indexing that empty result with
    ``.iloc[0]`` would raise an opaque IndexError. Walking the frames in order
    lets the next (broader) frame supply the value instead.

    Parameters:
        frames: DataFrames to consult, most specific first.
        column: name of the column to summarise.

    Raises:
        ValueError: if no frame contains a non-missing value for ``column``.
    """
    for frame in frames:
        modes = frame[column].mode()
        if not modes.empty:
            return modes.iloc[0]
    raise ValueError(f"No non-missing values available for column {column!r}")


def _first_usable_mean(frames, column):
    """Return the mean of ``column`` from the first frame where it is not NaN.

    Parameters:
        frames: DataFrames to consult, most specific first.
        column: name of the numeric column to average.

    Returns NaN when every frame is empty or all-missing in ``column``.
    """
    for frame in frames:
        average = frame[column].mean()
        if pd.notna(average):
            return average
    return float('nan')


def predict_total_pay(gender, job_title, ethnicity):
    """Predict total pay for the given gender / job title / ethnicity.

    The three inputs alone do not cover every feature the model is fed, so the
    remaining columns are filled with summary statistics taken from the
    module-level DataFrame ``df``. Reference rows are consulted from most to
    least specific:

      1. rows matching gender, job title and ethnicity exactly,
      2. rows matching the job title only,
      3. the whole dataset.

    For each column the first level that yields a usable (non-missing) value
    wins, so a sparse exact-match group no longer crashes the prediction or
    injects NaN while broader data is available.

    Parameters:
        gender: value compared against ``df['GENDER']``.
        job_title: value compared against ``df['JOB_TITLE']``.
        ethnicity: value compared against ``df['ETHNICITY']``.

    Returns:
        The first element of ``ensemble.predict(sample)``.

    Raises:
        ValueError: if a categorical feature has no non-missing value at any
            fallback level.
    """
    # Single-row frame that is handed to the model; the order in which columns
    # are added below is kept stable on purpose.
    sample = pd.DataFrame({
        'GENDER': [gender],
        'JOB_TITLE': [job_title],
        'ETHNICITY': [ethnicity],
    })

    exact_match = df[
        (df['GENDER'] == gender)
        & (df['JOB_TITLE'] == job_title)
        & (df['ETHNICITY'] == ethnicity)
    ]
    same_job = df[df['JOB_TITLE'] == job_title]

    # Most specific first; empty selections are dropped up front.
    reference_frames = [
        frame for frame in (exact_match, same_job, df) if len(frame) > 0
    ]

    # Categorical features: most frequent value.
    for column in ('EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO'):
        sample[column] = [_first_usable_mode(reference_frames, column)]

    # Numerical features: mean.
    for column in ('REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY'):
        sample[column] = [_first_usable_mean(reference_frames, column)]

    # Use the most recent year present in the dataset.
    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]

    # Derived features. The +1 in the denominator avoids division by zero when
    # there is no overtime or other pay.
    non_regular_pay = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (non_regular_pay + 1)
    sample['TOTAL_NON_REGULAR_PAY'] = non_regular_pay

    # Force categorical columns to 'object' dtype to prevent type issues with
    # the model (e.g. a numeric DEPARTMENT_NO).
    categorical_columns = (
        'GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE',
        'JOB_STATUS', 'MOU', 'DEPARTMENT_NO',
    )
    for column in categorical_columns:
        sample[column] = sample[column].astype('object')

    return ensemble.predict(sample)[0]
|
122 |
|
123 |
# Prepare dropdown options
|
124 |
genders = df['GENDER'].dropna().unique().tolist()
|
|
|
127 |
|
128 |
# Create Gradio interface
|
129 |
iface = gr.Interface(
|
130 |
+
fn=predict_total_pay,
|
131 |
inputs=[
|
132 |
gr.Dropdown(choices=genders, label="Gender"),
|
133 |
gr.Dropdown(choices=ethnicities, label="Ethnicity"),
|