Spaces:
Runtime error
Runtime error
Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Copy of Lab06.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1eKsEZ2OurE_fRyVw_cxMPq5cuYpUkSJM

We will train an XGBoost model on the Adult Income dataset and deploy it on Hugging Face Spaces.
"""
|
11 |
+
|
12 |
+
# Fetch the training data. The notebook's `!wget` shell magic is IPython-only
# syntax and raises SyntaxError when this file runs as a plain Python script,
# so the download is done with the standard library instead.
import os
import urllib.request

# Skip the download when the file is already present; the script always reads
# 'Income.csv', so an existing copy is what would be used anyway.
if not os.path.exists('Income.csv'):
    urllib.request.urlretrieve('http://www.donlapark.cmustat.com/Income.csv',
                               'Income.csv')
|
13 |
+
|
14 |
+
import pandas as pd
|
15 |
+
from sklearn.compose import ColumnTransformer
|
16 |
+
from sklearn.pipeline import Pipeline
|
17 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
18 |
+
|
19 |
+
from xgboost import XGBClassifier
|
20 |
+
|
21 |
+
|
22 |
+
# Maps each label of the 'education' column to an integer rank (1-16), in the
# order listed, so the column can be handled as a numeric feature.
EDU_DICT = {
    'Preschool': 1,
    '1st-4th': 2,
    '5th-6th': 3,
    '7th-8th': 4,
    '9th': 5,
    '10th': 6,
    '11th': 7,
    '12th': 8,
    'HS-grad': 9,
    'Some-college': 10,
    'Assoc-voc': 11,
    'Assoc-acdm': 12,
    'Bachelors': 13,
    'Masters': 14,
    'Prof-school': 15,
    'Doctorate': 16,
}
|
39 |
+
|
40 |
+
|
41 |
+
# Load the training data; the label column is split off below, leaving only
# feature columns in X_train.
X_train = pd.read_csv('Income.csv')

# Separate the label and binarise it: 1 where income is ">50K", 0 otherwise.
y_train = X_train.pop("income")
y_train = (y_train == ">50K").astype(int)

# Encode education labels as integers. The result is assigned back to the
# column instead of calling replace(..., inplace=True) on the selected Series:
# that chained in-place call operates on a temporary object and does not
# reliably update X_train under pandas copy-on-write.
# infer_objects() keeps the column integer-typed on pandas versions where
# replace() no longer downcasts an object-dtype result.
X_train['education'] = X_train['education'].replace(EDU_DICT).infer_objects()
|
48 |
+
|
49 |
+
# Names of numerical features
num_col = X_train.select_dtypes(include=['int64', 'float64']).columns
# Names of categorical features
cat_col = X_train.select_dtypes(include=['object', 'bool']).columns

# Print num_col and cat_col to show which columns fall in each group.
print(num_col)
print(cat_col)
|
58 |
+
|
59 |
+
# Build the dense one-hot encoder. The `sparse` argument was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so the new name is
# tried first and the old one is used only on releases that predate the rename.
try:
    _onehot = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    _onehot = OneHotEncoder(sparse=False)

# Standardise the numeric columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer([("scaler", StandardScaler(), num_col),
                                  ("onehot", _onehot, cat_col)])
|
61 |
+
|
62 |
+
# Chain the preprocessing step and the XGBoost classifier into one estimator,
# so the same transformations are applied at fit and at predict time.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])

# Fit the preprocessing and the classifier on the training data.
model.fit(X_train, y_train)
|
66 |
+
|
67 |
+
# ### Saving the model

import joblib

# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(model, 'model.joblib')

# Collect the distinct values seen in each categorical column and save them
# next to the model.
# NOTE(review): presumably used to populate the app's input choices - confirm.
unique_values = {col: X_train[col].unique() for col in cat_col}

joblib.dump(unique_values, 'unique_values.joblib')
|