# -*- coding: utf-8 -*-
"""Copy of Lab06.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1eKsEZ2OurE_fRyVw_cxMPq5cuYpUkSJM

We will train an XGBoost model on the Adult's Income dataset and deploy it on
Hugging Face spaces.

Running this script:
  1. downloads ``Income.csv`` if it is not already in the working directory,
  2. fits a preprocessing + XGBoost pipeline on it,
  3. writes ``model.joblib`` (the fitted pipeline) and
     ``unique_values.joblib`` (the distinct values of each categorical column).
"""

import os
import urllib.request

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

DATA_URL = 'http://www.donlapark.cmustat.com/Income.csv'
DATA_PATH = 'Income.csv'

# Maps each education label to an integer rank so the column can be treated
# as a numerical feature instead of being one-hot encoded.
EDU_DICT = {
    'Preschool': 1,
    '1st-4th': 2,
    '5th-6th': 3,
    '7th-8th': 4,
    '9th': 5,
    '10th': 6,
    '11th': 7,
    '12th': 8,
    'HS-grad': 9,
    'Some-college': 10,
    'Assoc-voc': 11,
    'Assoc-acdm': 12,
    'Bachelors': 13,
    'Masters': 14,
    'Prof-school': 15,
    'Doctorate': 16,
}

# The notebook used the IPython shell escape `!wget <url>`, which is a syntax
# error in a plain Python file. Fetch the file with the standard library
# instead, and skip the download when a local copy already exists.
if not os.path.exists(DATA_PATH):
    urllib.request.urlretrieve(DATA_URL, DATA_PATH)

X_train = pd.read_csv(DATA_PATH)

# Separate the target column and binarise it: 1 for ">50K", 0 otherwise.
y_train = X_train.pop("income")
y_train = (y_train == ">50K").astype(int)

# Assign the result back rather than calling `replace(..., inplace=True)` on
# the selected column: the in-place form acts on an intermediate object and is
# not guaranteed to update the DataFrame (see pandas copy-on-write notes).
# `infer_objects()` makes sure the replaced column ends up with a numeric
# dtype so that it is picked up by the numerical selection below.
X_train['education'] = X_train['education'].replace(EDU_DICT).infer_objects()

# Names of numerical features
num_col = X_train.select_dtypes(include=['int64', 'float64']).columns

# Names of categorical features
cat_col = X_train.select_dtypes(include=['object', 'bool']).columns

print(num_col)
print(cat_col)

# `OneHotEncoder(sparse=...)` was renamed to `sparse_output` in newer
# scikit-learn releases and the old keyword was later removed; try the new
# spelling first and fall back to the old one for older installations.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Standardise numerical columns and one-hot encode categorical columns.
preprocessor = ColumnTransformer([("scaler", StandardScaler(), num_col),
                                  ("onehot", onehot_encoder, cat_col)])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])

model.fit(X_train, y_train)

# ### Saving the model

joblib.dump(model, 'model.joblib')

# Distinct values of every categorical column, saved next to the model.
unique_values = {col: X_train[col].unique() for col in cat_col}

joblib.dump(unique_values, 'unique_values.joblib')