huntrezz committed on
Commit
060d492
·
verified ·
1 Parent(s): 402148f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -44
app.py CHANGED
@@ -1,61 +1,45 @@
1
  import pandas as pd
2
  import numpy as np
 
3
  from sklearn.ensemble import VotingRegressor
 
 
4
  from sklearn.base import BaseEstimator, RegressorMixin
 
 
5
  import gradio as gr
6
  import joblib
7
 
8
- class FastAIWrapper(BaseEstimator, RegressorMixin):
9
- def __init__(self, learn):
10
- self.learn = learn
11
-
12
- def fit(self, X, y):
13
- return self
14
-
15
- def predict(self, X):
16
- dl = self.learn.dls.test_dl(X)
17
- preds, _ = self.learn.get_preds(dl=dl)
18
- return preds.numpy().flatten()
19
-
20
- # Load your data and trained model
21
  df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
 
 
 
 
 
 
 
22
  ensemble = joblib.load('ensemble_model.joblib')
23
 
24
  def predict_total_pay(gender, job_title, ethnicity):
25
- # Function to predict total pay based on input parameters
26
- # Parameters:
27
- # gender: str - The gender of the employee
28
- # job_title: str - The job title of the employee
29
- # ethnicity: str - The ethnicity of the employee
30
-
31
- # Create a sample input DataFrame with the given parameters
32
- # This will be used as input for the prediction model
33
  sample = pd.DataFrame({
34
  'GENDER': [gender],
35
  'JOB_TITLE': [job_title],
36
  'ETHNICITY': [ethnicity],
37
  })
38
-
39
- # Filter the main DataFrame (df) to find exact matches for the input combination
40
- # This creates a subset of data that matches all three input parameters
41
- group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
42
 
 
43
  if len(group) > 0:
44
- # If exact matches are found, use their statistics to populate the sample
45
- # For categorical variables, use the mode (most frequent value)
46
  sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
47
  sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
48
  sample['MOU'] = [group['MOU'].mode().iloc[0]]
49
  sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
50
- # For numerical variables, use the mean
51
  sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
52
  sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
53
  sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
54
  else:
55
- # If no exact match is found, try to find a broader match based on job_title
56
  job_group = df[df['JOB_TITLE'] == job_title]
57
  if len(job_group) > 0:
58
- # If job title matches are found, use their statistics
59
  sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
60
  sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
61
  sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
@@ -64,7 +48,6 @@ def predict_total_pay(gender, job_title, ethnicity):
64
  sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
65
  sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
66
  else:
67
- # If no job title match is found, use overall statistics from the entire dataset
68
  sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
69
  sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
70
  sample['MOU'] = [df['MOU'].mode().iloc[0]]
@@ -72,26 +55,16 @@ def predict_total_pay(gender, job_title, ethnicity):
72
  sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
73
  sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
74
  sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
75
-
76
- # Set PAY_YEAR to the most recent year in the dataset
77
- sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
78
 
79
- # Calculate derived features
80
- # PAY_RATIO: Ratio of regular pay to other types of pay
81
  sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
82
- # TOTAL_NON_REGULAR_PAY: Sum of overtime pay and all other pay
83
  sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
84
-
85
- # Ensure all categorical columns are of type 'object' to prevent type issues with the model
86
  categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
87
  for col in categorical_columns:
88
  sample[col] = sample[col].astype('object')
89
-
90
- # Use the ensemble model to make a prediction
91
- # The model takes the sample DataFrame as input and returns a predicted total pay
92
- prediction = ensemble.predict(sample)[0]
93
 
94
- # Return the predicted total pay
95
  return prediction
96
 
97
  def gradio_predict(gender, ethnicity, job_title):
 
1
  import pandas as pd
2
  import numpy as np
3
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
4
  from sklearn.ensemble import VotingRegressor
5
+ from sklearn.linear_model import LinearRegression
6
+ from sklearn.tree import DecisionTreeRegressor
7
  from sklearn.base import BaseEstimator, RegressorMixin
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.pipeline import Pipeline
10
  import gradio as gr
11
  import joblib
12
 
13
+ # Load data
 
 
 
 
 
 
 
 
 
 
 
 
14
  df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
15
+ df = df.replace([np.inf, -np.inf], np.nan)
16
+
17
+ # Define categorical and continuous variables
18
+ cat_names = ['EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'GENDER', 'ETHNICITY', 'JOB_TITLE', 'DEPARTMENT_NO']
19
+ cont_names = ['PAY_YEAR', 'REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY', 'PAY_RATIO', 'TOTAL_NON_REGULAR_PAY']
20
+
21
+ # Load the trained model
22
  ensemble = joblib.load('ensemble_model.joblib')
23
 
24
  def predict_total_pay(gender, job_title, ethnicity):
 
 
 
 
 
 
 
 
25
  sample = pd.DataFrame({
26
  'GENDER': [gender],
27
  'JOB_TITLE': [job_title],
28
  'ETHNICITY': [ethnicity],
29
  })
 
 
 
 
30
 
31
+ group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
32
  if len(group) > 0:
 
 
33
  sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
34
  sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
35
  sample['MOU'] = [group['MOU'].mode().iloc[0]]
36
  sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
 
37
  sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
38
  sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
39
  sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
40
  else:
 
41
  job_group = df[df['JOB_TITLE'] == job_title]
42
  if len(job_group) > 0:
 
43
  sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
44
  sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
45
  sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
 
48
  sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
49
  sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
50
  else:
 
51
  sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
52
  sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
53
  sample['MOU'] = [df['MOU'].mode().iloc[0]]
 
55
  sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
56
  sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
57
  sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
 
 
 
58
 
59
+ sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
 
60
  sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
 
61
  sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
62
+
 
63
  categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
64
  for col in categorical_columns:
65
  sample[col] = sample[col].astype('object')
 
 
 
 
66
 
67
+ prediction = ensemble.predict(sample)[0]
68
  return prediction
69
 
70
  def gradio_predict(gender, ethnicity, job_title):