Spaces:
Runtime error
Runtime error
File size: 1,938 Bytes
2798386 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# -*- coding: utf-8 -*-
"""Copy of Lab06.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1eKsEZ2OurE_fRyVw_cxMPq5cuYpUkSJM
We will train an XGBoost model on the Adult Income dataset and deploy it on Hugging Face Spaces.
"""
# Fetch the training data. The notebook's "!wget" shell magic is IPython-only
# syntax and a SyntaxError in a plain .py file, so the download is done with
# the standard library instead.
import os
import urllib.request

# wget never overwrites an existing file (it would write Income.csv.1), so an
# Income.csv that is already present stays the file that gets read later.
if not os.path.exists('Income.csv'):
    urllib.request.urlretrieve('http://www.donlapark.cmustat.com/Income.csv',
                               'Income.csv')
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
# Ordinal encoding of education levels: the levels are listed from lowest to
# highest and numbered consecutively, starting at 1.
EDU_DICT = {
    level: rank
    for rank, level in enumerate(
        (
            'Preschool',
            '1st-4th',
            '5th-6th',
            '7th-8th',
            '9th',
            '10th',
            '11th',
            '12th',
            'HS-grad',
            'Some-college',
            'Assoc-voc',
            'Assoc-acdm',
            'Bachelors',
            'Masters',
            'Prof-school',
            'Doctorate',
        ),
        start=1,
    )
}
# Load the dataset; at this point the frame holds the features and the label.
X_train = pd.read_csv('Income.csv')

# Split off the label column and binarise it: 1 for ">50K", 0 otherwise.
y_train = X_train.pop("income")
y_train = (y_train == ">50K").astype(int)

# Encode education as an ordinal integer and assign the result back.
# Calling replace(..., inplace=True) on the column selection is a chained
# in-place update, which does not modify X_train under pandas Copy-on-Write,
# and replace() on an object column no longer guarantees a numeric dtype.
# map() yields integers when every level is a key of EDU_DICT; a level that is
# missing from EDU_DICT becomes NaN.
X_train['education'] = X_train['education'].map(EDU_DICT)
# Names of numerical features
num_col = X_train.select_dtypes(include=['int64', 'float64']).columns
# Names of categorical features
cat_col = X_train.select_dtypes(include=['object', 'bool']).columns

# Print num_col and cat_col
print(num_col)
print(cat_col)

# One-hot encoder that returns a dense array. scikit-learn 1.2 renamed the
# `sparse` keyword to `sparse_output` and 1.4 removed the old name, so try the
# new spelling first and fall back to the old one on older releases.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)

# Standardise the numerical columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer([("scaler", StandardScaler(), num_col),
                                  ("onehot", onehot, cat_col)])

# Chain preprocessing and the classifier into one estimator, then fit it on
# the training features and labels.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])
model.fit(X_train, y_train)
"""### Saving the model"""
import joblib

# Persist the fitted preprocessing + classifier pipeline.
joblib.dump(model, 'model.joblib')

# Persist the distinct values observed in each categorical column.
# NOTE(review): presumably consumed by the deployed app to populate its input
# choices — confirm against the app code.
unique_values = {col: X_train[col].unique() for col in cat_col}
joblib.dump(unique_values, 'unique_values.joblib')