Spaces:
Runtime error
Runtime error
Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Copy of Lab06.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1eKsEZ2OurE_fRyVw_cxMPq5cuYpUkSJM
|
| 8 |
+
|
| 9 |
+
We will train an XGBoost model on the Adult's Income dataset and deploy it on Hugging Face spaces.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
# Fetch the training data.  The notebook's `!wget ...` line is an IPython
# shell escape, which is a SyntaxError when this file runs as a plain Python
# script, so the download is done with the standard library instead.
import os
import urllib.request

# Skip the download when the file is already present, so an existing
# Income.csv is what gets read below.
if not os.path.exists('Income.csv'):
    urllib.request.urlretrieve('http://www.donlapark.cmustat.com/Income.csv',
                               'Income.csv')
|
| 13 |
+
|
| 14 |
+
import pandas as pd
|
| 15 |
+
from sklearn.compose import ColumnTransformer
|
| 16 |
+
from sklearn.pipeline import Pipeline
|
| 17 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
| 18 |
+
|
| 19 |
+
from xgboost import XGBClassifier
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Education labels listed in the order of their integer codes: a label's
# 1-based position in this tuple is the value it maps to in EDU_DICT.
_EDU_LEVELS = (
    'Preschool',
    '1st-4th',
    '5th-6th',
    '7th-8th',
    '9th',
    '10th',
    '11th',
    '12th',
    'HS-grad',
    'Some-college',
    'Assoc-voc',
    'Assoc-acdm',
    'Bachelors',
    'Masters',
    'Prof-school',
    'Doctorate',
)

# Maps each education label to its integer code (1 through 16).
EDU_DICT = {level: code for code, level in enumerate(_EDU_LEVELS, start=1)}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Load the raw table; the "income" column is the label and every other
# column is kept as a feature.
X_train = pd.read_csv('Income.csv')

# Split the label off the feature frame and binarise it:
# 1 where the value is exactly ">50K", 0 otherwise.
y_train = X_train.pop("income")
y_train = (y_train == ">50K").astype(int)

# Swap the education labels for their integer codes from EDU_DICT.
# The result is assigned back to the frame: calling replace(inplace=True)
# on a selected column acts on an intermediate object, which pandas warns
# about and which leaves X_train unchanged under copy-on-write.
# infer_objects() keeps the column integer-typed on pandas versions where
# replace() no longer downcasts an object column by itself.
X_train['education'] = X_train['education'].replace(EDU_DICT).infer_objects()
|
| 48 |
+
|
| 49 |
+
# Partition the feature columns by dtype and print both groups:
# int64/float64 columns are treated as numerical, object/bool columns
# as categorical.
num_col = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_col = X_train.select_dtypes(include=['object', 'bool']).columns

for column_group in (num_col, cat_col):
    print(column_group)
|
| 58 |
+
|
| 59 |
+
# One-hot encoder that returns a dense array.  The keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 and the old spelling was
# later removed, so try the current name first and fall back for older
# releases (an unknown keyword raises TypeError in the constructor).
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Standardise the numerical columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer([
    ("scaler", StandardScaler(), num_col),
    ("onehot", onehot_encoder, cat_col),
])

# Full pipeline: preprocessing followed by an XGBoost classifier with its
# default hyper-parameters.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier()),
])

# Fit the preprocessing steps and the classifier on the training data.
model.fit(X_train, y_train)
|
| 66 |
+
|
| 67 |
+
"""### Saving the model"""
|
| 68 |
+
|
| 69 |
+
import joblib
|
| 70 |
+
|
| 71 |
+
# Persist the fitted pipeline to disk.
joblib.dump(model, 'model.joblib')

# For every categorical column, record the distinct values seen in the
# training data and persist that mapping next to the model.
# NOTE(review): presumably consumed by the deployed app to offer valid
# input choices -- confirm against the serving code.
unique_values = dict(
    (column, X_train[column].unique()) for column in cat_col
)

joblib.dump(unique_values, 'unique_values.joblib')
|