Spaces:

huntrezz
/

LACityEmployeePayPredictor

Sleeping

App Files Files Community

LACityEmployeePayPredictor / app.py

huntrezz

Update app.py

ad5a6b0 verified 10 months ago

raw

history blame

6.33 kB

	import pandas as pd
	import numpy as np
	from sklearn.ensemble import VotingRegressor
	from sklearn.base import BaseEstimator, RegressorMixin
	import gradio as gr
	import joblib

	class FastAIWrapper(BaseEstimator, RegressorMixin):
	    """Adapter giving a pre-trained learner the scikit-learn regressor interface.

	    The wrapped object only needs to expose ``dls.test_dl(X)`` and
	    ``get_preds(dl=...)``; presumably it is a fastai ``Learner`` (confirm
	    against the training code).
	    """

	    def __init__(self, learn):
	        # Stored under the same name as the constructor argument.
	        self.learn = learn

	    def fit(self, X, y):
	        """Do nothing and return ``self``; no training happens here."""
	        return self

	    def predict(self, X):
	        """Return the learner's predictions for ``X`` as a flat NumPy array."""
	        # Build a test-time data loader for X from the learner's own loaders.
	        test_loader = self.learn.dls.test_dl(X)
	        # get_preds returns a pair; only the first element (predictions) is used.
	        predictions, _unused = self.learn.get_preds(dl=test_loader)
	        as_array = predictions.numpy()
	        return as_array.flatten()

	# Load the payroll dataset and the trained ensemble model once, at import time.
	# Both file names are relative paths, so they are resolved against the
	# process's current working directory.
	# low_memory=False makes pandas parse the whole file in one pass instead of in
	# chunks, which avoids mixed-dtype inference within a column.
	# NOTE(review): joblib.load unpickles the file, so only load model files you
	# trust; presumably the pickle references FastAIWrapper, which is why the
	# class is defined above this line - confirm against the training script.
	df = pd.read_csv('City_Employee_Payroll__Current__20240915.csv', low_memory=False)
	ensemble = joblib.load('ensemble_model.joblib')

	# Features the user does not enter; they are filled in from rows of df.
	# Categorical columns take the most frequent value, numeric columns the mean.
	_IMPUTED_CATEGORICAL = ('EMPLOYMENT_TYPE', 'JOB_STATUS', 'MOU', 'DEPARTMENT_NO')
	_IMPUTED_NUMERIC = ('REGULAR_PAY', 'OVERTIME_PAY', 'ALL_OTHER_PAY')


	def _reference_scopes(gender, job_title, ethnicity):
	    """Return the non-empty subsets of df to impute from, narrowest first.

	    Order: rows matching all three inputs, rows matching only the job title,
	    then the whole dataset.
	    """
	    same_job = df['JOB_TITLE'] == job_title
	    exact = df[(df['GENDER'] == gender) & same_job & (df['ETHNICITY'] == ethnicity)]
	    candidates = (exact, df[same_job], df)
	    return [rows for rows in candidates if len(rows) > 0]


	def _impute(column, scopes, numeric):
	    """Return a fill value for ``column`` from the narrowest scope that has data.

	    A scope whose values for ``column`` are all missing is skipped, so an
	    all-NaN subset neither produces a NaN mean nor makes ``mode().iloc[0]``
	    raise IndexError on an empty result.
	    """
	    for rows in scopes:
	        observed = rows[column].dropna()
	        if len(observed) > 0:
	            return observed.mean() if numeric else observed.mode().iloc[0]
	    if numeric:
	        # Same value the plain mean() of an all-missing column would give.
	        return np.nan
	    raise ValueError(
	        'No non-missing values for {!r} in the dataset'.format(column)
	    )


	def predict_total_pay(gender, job_title, ethnicity):
	    """Predict total pay for one employee profile.

	    Parameters:
	        gender: value compared against the GENDER column of df.
	        job_title: value compared against the JOB_TITLE column of df.
	        ethnicity: value compared against the ETHNICITY column of df.

	    Returns:
	        The first element of ``ensemble.predict`` for a single-row DataFrame
	        built from the inputs plus imputed and derived features.

	    Raises:
	        ValueError: if a categorical feature has no non-missing value anywhere
	            in the dataset.

	    The remaining model features are imputed from the most specific group of
	    rows available: exact (gender, job title, ethnicity) matches, then rows
	    with the same job title, then the whole dataset.
	    """
	    scopes = _reference_scopes(gender, job_title, ethnicity)

	    # Column order below is kept stable because the model receives the frame
	    # as-is.
	    sample = pd.DataFrame({
	        'GENDER': [gender],
	        'JOB_TITLE': [job_title],
	        'ETHNICITY': [ethnicity],
	    })
	    for column in _IMPUTED_CATEGORICAL:
	        sample[column] = [_impute(column, scopes, numeric=False)]
	    for column in _IMPUTED_NUMERIC:
	        sample[column] = [_impute(column, scopes, numeric=True)]

	    # Set PAY_YEAR to the most recent year in the dataset
	    sample['PAY_YEAR'] = [df['PAY_YEAR'].max()]

	    # Derived features.
	    non_regular_pay = sample['OVERTIME_PAY'] + sample['ALL_OTHER_PAY']
	    # The +1 keeps the denominator non-zero when there is no non-regular pay.
	    sample['PAY_RATIO'] = sample['REGULAR_PAY'] / (non_regular_pay + 1)
	    sample['TOTAL_NON_REGULAR_PAY'] = non_regular_pay

	    # Ensure all categorical columns are of type 'object' to prevent type
	    # issues with the model (e.g. a numeric DEPARTMENT_NO).
	    categorical_columns = ['GENDER', 'JOB_TITLE', 'ETHNICITY'] + list(_IMPUTED_CATEGORICAL)
	    for col in categorical_columns:
	        sample[col] = sample[col].astype('object')

	    # Single-row input, so the prediction is the first (only) element.
	    return ensemble.predict(sample)[0]

	# Prepare dropdown options from the values present in the dataset
	# (missing values are dropped; job titles are sorted for easier lookup).
	genders = df['GENDER'].dropna().unique().tolist()
	ethnicities = df['ETHNICITY'].dropna().unique().tolist()
	job_titles = sorted(df['JOB_TITLE'].dropna().unique().tolist())

	# Create Gradio interface.
	# gr.Interface passes the input component values to `fn` positionally, so the
	# components must be listed in the same order as predict_total_pay's
	# parameters: (gender, job_title, ethnicity). Listing Ethnicity before
	# Job Title would feed each value into the other's parameter.
	iface = gr.Interface(
	    fn=predict_total_pay,
	    inputs=[
	        gr.Dropdown(choices=genders, label="Gender"),
	        gr.Dropdown(choices=job_titles, label="Job Title"),
	        gr.Dropdown(choices=ethnicities, label="Ethnicity"),
	    ],
	    outputs=gr.Textbox(label="Predicted Total Pay"),
	    title="LA City Employee Pay Predictor",
	    description="Predict the total pay for LA City employees based on gender, ethnicity, and job title."
	)

	# Start the web app when this file is run.
	iface.launch()