Spaces:

huntrezz
/

LACityEmployeePayPredictor

Sleeping

App Files Files Community

huntrezz committed on Sep 18, 2024

Commit

ad5a6b0

verified ·

1 Parent(s): 85e5e78

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -4

app.py CHANGED Viewed

@@ -47,9 +47,78 @@ def predict_total_pay(gender, job_title, ethnicity):
     prediction = ensemble.predict(sample)[0]
     return prediction
-def gradio_predict(gender, ethnicity, job_title):
-    predicted_pay = predict_total_pay(gender, job_title, ethnicity)
-    return f"${predicted_pay:.2f}"
 # Prepare dropdown options
 genders = df['GENDER'].dropna().unique().tolist()
@@ -58,7 +127,7 @@ job_titles = sorted(df['JOB_TITLE'].dropna().unique().tolist())
 # Create Gradio interface
 iface = gr.Interface(
-    fn=gradio_predict,
     inputs=[
         gr.Dropdown(choices=genders, label="Gender"),
         gr.Dropdown(choices=ethnicities, label="Ethnicity"),

     prediction = ensemble.predict(sample)[0]
     return prediction
+def predict_total_pay(gender, job_title, ethnicity):
+    # Predict total pay for one employee profile; returns ensemble.predict(sample)[0] (df and ensemble come from outside this function).
+    # Parameters (positional order matters to callers):
+    #   gender: value compared against df['GENDER']
+    #   job_title: value compared against df['JOB_TITLE']
+    #   ethnicity: value compared against df['ETHNICITY']
+    # NOTE(review): the Gradio inputs further down list Ethnicity second, but job_title is second here - confirm the order lines up.
+    # Start a one-row DataFrame from the three inputs; the remaining model features are filled in below.
+    sample = pd.DataFrame({
+        'GENDER': [gender],
+        'JOB_TITLE': [job_title],
+        'ETHNICITY': [ethnicity],
+    })
+    # Select the rows of df that match all three inputs exactly.
+    # Statistics of this subset stand in for the features the caller does not supply.
+    group = df[(df['GENDER'] == gender) & (df['JOB_TITLE'] == job_title) & (df['ETHNICITY'] == ethnicity)]
+    if len(group) > 0:
+        # Exact matches exist: fill each categorical feature with its most frequent value (first mode).
+        # NOTE(review): mode().iloc[0] raises IndexError if a column is entirely NaN in this subset - confirm that cannot happen.
+        sample['EMPLOYMENT_TYPE'] = [group['EMPLOYMENT_TYPE'].mode().iloc[0]]
+        sample['JOB_STATUS'] = [group['JOB_STATUS'].mode().iloc[0]]
+        sample['MOU'] = [group['MOU'].mode().iloc[0]]
+        sample['DEPARTMENT_NO'] = [group['DEPARTMENT_NO'].mode().iloc[0]]
+        # Fill each numerical pay feature with the mean over the matching rows
+        sample['REGULAR_PAY'] = [group['REGULAR_PAY'].mean()]
+        sample['OVERTIME_PAY'] = [group['OVERTIME_PAY'].mean()]
+        sample['ALL_OTHER_PAY'] = [group['ALL_OTHER_PAY'].mean()]
+    else:
+        # No exact match: fall back to every row that shares the job title, ignoring gender and ethnicity
+        job_group = df[df['JOB_TITLE'] == job_title]
+        if len(job_group) > 0:
+            # Same fill rule as the exact-match branch (mode for categoricals, mean for pay), over the job-title subset
+            sample['EMPLOYMENT_TYPE'] = [job_group['EMPLOYMENT_TYPE'].mode().iloc[0]]
+            sample['JOB_STATUS'] = [job_group['JOB_STATUS'].mode().iloc[0]]
+            sample['MOU'] = [job_group['MOU'].mode().iloc[0]]
+            sample['DEPARTMENT_NO'] = [job_group['DEPARTMENT_NO'].mode().iloc[0]]
+            sample['REGULAR_PAY'] = [job_group['REGULAR_PAY'].mean()]
+            sample['OVERTIME_PAY'] = [job_group['OVERTIME_PAY'].mean()]
+            sample['ALL_OTHER_PAY'] = [job_group['ALL_OTHER_PAY'].mean()]
+        else:
+            # Job title not present in df: last resort is the same fill rule over the entire dataset
+            sample['EMPLOYMENT_TYPE'] = [df['EMPLOYMENT_TYPE'].mode().iloc[0]]
+            sample['JOB_STATUS'] = [df['JOB_STATUS'].mode().iloc[0]]
+            sample['MOU'] = [df['MOU'].mode().iloc[0]]
+            sample['DEPARTMENT_NO'] = [df['DEPARTMENT_NO'].mode().iloc[0]]
+            sample['REGULAR_PAY'] = [df['REGULAR_PAY'].mean()]
+            sample['OVERTIME_PAY'] = [df['OVERTIME_PAY'].mean()]
+            sample['ALL_OTHER_PAY'] = [df['ALL_OTHER_PAY'].mean()]
+    # Use the largest PAY_YEAR present in df as the year for this prediction
+    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]
+    # Derived features, computed from the pay columns filled in above
+    # PAY_RATIO: regular pay / (overtime + other pay + 1); the +1 presumably guards against division by zero - TODO confirm it matches training
+    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY'] + 1)
+    # TOTAL_NON_REGULAR_PAY: overtime pay plus all other pay
+    sample['TOTAL_NON_REGULAR_PAY'] = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
+    # Cast every categorical column to 'object' dtype before the row reaches the model (presumably the dtype it was trained with - verify)
+    categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY', 'EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO']
+    for col in categorical_columns:
+        sample[col] = sample[col].astype('object')
+    # Run the ensemble on the single-row sample; predict() is indexed with [0]
+    # to pull out the one prediction for that row.
+    prediction = ensemble.predict(sample)[0]
+    # Returned as-is (no currency formatting is applied here)
+    return prediction
 # Prepare dropdown options
 genders = df['GENDER'].dropna().unique().tolist()
 # Create Gradio interface
 iface = gr.Interface(
+    fn=predict_total_pay,
     inputs=[
         gr.Dropdown(choices=genders, label="Gender"),
         gr.Dropdown(choices=ethnicities, label="Ethnicity"),