{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### TU257 - AutoML Demo"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Run this notebook and examine what is produced for each cell.\n",
"\n",
"#### Add your own comments to enrich the information given to make it more meaningful to you.\n",
"\n",
"#### In this notebook we will have a look at two different AutoML libraries. See the notes/website for links to posts illustrating other AutoML libraries."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" WorkClass | \n",
" Fnlwgt | \n",
" Education | \n",
" Edu_Num | \n",
" MaritalStatus | \n",
" Occupation | \n",
" Relationship | \n",
" Race | \n",
" Sex | \n",
" CapitalGain | \n",
" CapitalLoss | \n",
" HrPerWk | \n",
" Native | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 39 | \n",
" State-gov | \n",
" 77516 | \n",
" Bachelors | \n",
" 13 | \n",
" Never-married | \n",
" Adm-clerical | \n",
" Not-in-family | \n",
" White | \n",
" Male | \n",
" 2174 | \n",
" 0 | \n",
" 40 | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 1 | \n",
" 50 | \n",
" Self-emp-not-inc | \n",
" 83311 | \n",
" Bachelors | \n",
" 13 | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Husband | \n",
" White | \n",
" Male | \n",
" 0 | \n",
" 0 | \n",
" 13 | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 2 | \n",
" 38 | \n",
" Private | \n",
" 215646 | \n",
" HS-grad | \n",
" 9 | \n",
" Divorced | \n",
" Handlers-cleaners | \n",
" Not-in-family | \n",
" White | \n",
" Male | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 3 | \n",
" 53 | \n",
" Private | \n",
" 234721 | \n",
" 11th | \n",
" 7 | \n",
" Married-civ-spouse | \n",
" Handlers-cleaners | \n",
" Husband | \n",
" Black | \n",
" Male | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 4 | \n",
" 28 | \n",
" Private | \n",
" 338409 | \n",
" Bachelors | \n",
" 13 | \n",
" Married-civ-spouse | \n",
" Prof-specialty | \n",
" Wife | \n",
" Black | \n",
" Female | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" Cuba | \n",
" <=50K | \n",
"
\n",
" \n",
" 5 | \n",
" 37 | \n",
" Private | \n",
" 284582 | \n",
" Masters | \n",
" 14 | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Wife | \n",
" White | \n",
" Female | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 6 | \n",
" 49 | \n",
" Private | \n",
" 160187 | \n",
" 9th | \n",
" 5 | \n",
" Married-spouse-absent | \n",
" Other-service | \n",
" Not-in-family | \n",
" Black | \n",
" Female | \n",
" 0 | \n",
" 0 | \n",
" 16 | \n",
" Jamaica | \n",
" <=50K | \n",
"
\n",
" \n",
" 7 | \n",
" 52 | \n",
" Self-emp-not-inc | \n",
" 209642 | \n",
" HS-grad | \n",
" 9 | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Husband | \n",
" White | \n",
" Male | \n",
" 0 | \n",
" 0 | \n",
" 45 | \n",
" United-States | \n",
" >50K | \n",
"
\n",
" \n",
" 8 | \n",
" 31 | \n",
" Private | \n",
" 45781 | \n",
" Masters | \n",
" 14 | \n",
" Never-married | \n",
" Prof-specialty | \n",
" Not-in-family | \n",
" White | \n",
" Female | \n",
" 14084 | \n",
" 0 | \n",
" 50 | \n",
" United-States | \n",
" >50K | \n",
"
\n",
" \n",
" 9 | \n",
" 42 | \n",
" Private | \n",
" 159449 | \n",
" Bachelors | \n",
" 13 | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Husband | \n",
" White | \n",
" Male | \n",
" 5178 | \n",
" 0 | \n",
" 40 | \n",
" United-States | \n",
" >50K | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age WorkClass Fnlwgt Education Edu_Num \\\n",
"0 39 State-gov 77516 Bachelors 13 \n",
"1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
"2 38 Private 215646 HS-grad 9 \n",
"3 53 Private 234721 11th 7 \n",
"4 28 Private 338409 Bachelors 13 \n",
"5 37 Private 284582 Masters 14 \n",
"6 49 Private 160187 9th 5 \n",
"7 52 Self-emp-not-inc 209642 HS-grad 9 \n",
"8 31 Private 45781 Masters 14 \n",
"9 42 Private 159449 Bachelors 13 \n",
"\n",
" MaritalStatus Occupation Relationship Race \\\n",
"0 Never-married Adm-clerical Not-in-family White \n",
"1 Married-civ-spouse Exec-managerial Husband White \n",
"2 Divorced Handlers-cleaners Not-in-family White \n",
"3 Married-civ-spouse Handlers-cleaners Husband Black \n",
"4 Married-civ-spouse Prof-specialty Wife Black \n",
"5 Married-civ-spouse Exec-managerial Wife White \n",
"6 Married-spouse-absent Other-service Not-in-family Black \n",
"7 Married-civ-spouse Exec-managerial Husband White \n",
"8 Never-married Prof-specialty Not-in-family White \n",
"9 Married-civ-spouse Exec-managerial Husband White \n",
"\n",
" Sex CapitalGain CapitalLoss HrPerWk Native Target \n",
"0 Male 2174 0 40 United-States <=50K \n",
"1 Male 0 0 13 United-States <=50K \n",
"2 Male 0 0 40 United-States <=50K \n",
"3 Male 0 0 40 United-States <=50K \n",
"4 Female 0 0 40 Cuba <=50K \n",
"5 Female 0 0 40 United-States <=50K \n",
"6 Female 0 0 16 Jamaica <=50K \n",
"7 Male 0 0 45 United-States >50K \n",
"8 Female 14084 0 50 United-States >50K \n",
"9 Male 5178 0 40 United-States >50K "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the UCI Adult / Census Income dataset; the raw file has no header\n",
"# row, so column names are supplied explicitly.\n",
"# NOTE(review): string values in this CSV carry a leading space (e.g. ' <=50K');\n",
"# consider skipinitialspace=True if downstream string comparisons misbehave.\n",
"# TODO: make this path configurable/relative instead of an absolute local path.\n",
"DATA_FILE = '/Users/brendan.tierney/Dropbox/4-Datasets/adult.csv'\n",
"colnames = ['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target']\n",
"df = pd.read_csv(DATA_FILE, names=colnames, header=None)\n",
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().values.any()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows : 32561\n",
"Columns : 15\n",
"\n",
"Features : \n",
" ['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target']\n",
"\n",
"Missing values : 0\n",
"\n",
"Unique values : \n",
" Age 73\n",
"WorkClass 9\n",
"Fnlwgt 21648\n",
"Education 16\n",
"Edu_Num 16\n",
"MaritalStatus 7\n",
"Occupation 15\n",
"Relationship 6\n",
"Race 5\n",
"Sex 2\n",
"CapitalGain 119\n",
"CapitalLoss 92\n",
"HrPerWk 94\n",
"Native 42\n",
"Target 2\n",
"dtype: int64\n"
]
}
],
"source": [
"# Quick profile of the dataframe: dimensions, feature list,\n",
"# total missing values and per-column cardinality.\n",
"print('Rows : ', df.shape[0])\n",
"print('Columns : ', df.shape[1])\n",
"print('\\nFeatures : \\n', df.columns.tolist())\n",
"print('\\nMissing values : ', df.isnull().sum().values.sum())\n",
"print('\\nUnique values : \\n', df.nunique())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 32561 entries, 0 to 32560\n",
"Data columns (total 15 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Age 32561 non-null int64 \n",
" 1 WorkClass 32561 non-null object\n",
" 2 Fnlwgt 32561 non-null int64 \n",
" 3 Education 32561 non-null object\n",
" 4 Edu_Num 32561 non-null int64 \n",
" 5 MaritalStatus 32561 non-null object\n",
" 6 Occupation 32561 non-null object\n",
" 7 Relationship 32561 non-null object\n",
" 8 Race 32561 non-null object\n",
" 9 Sex 32561 non-null object\n",
" 10 CapitalGain 32561 non-null int64 \n",
" 11 CapitalLoss 32561 non-null int64 \n",
" 12 HrPerWk 32561 non-null int64 \n",
" 13 Native 32561 non-null object\n",
" 14 Target 32561 non-null object\n",
"dtypes: int64(6), object(9)\n",
"memory usage: 3.7+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" Fnlwgt | \n",
" Edu_Num | \n",
" CapitalGain | \n",
" CapitalLoss | \n",
" HrPerWk | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 32561.000000 | \n",
" 3.256100e+04 | \n",
" 32561.000000 | \n",
" 32561.000000 | \n",
" 32561.000000 | \n",
" 32561.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 38.581647 | \n",
" 1.897784e+05 | \n",
" 10.080679 | \n",
" 1077.648844 | \n",
" 87.303830 | \n",
" 40.437456 | \n",
"
\n",
" \n",
" std | \n",
" 13.640433 | \n",
" 1.055500e+05 | \n",
" 2.572720 | \n",
" 7385.292085 | \n",
" 402.960219 | \n",
" 12.347429 | \n",
"
\n",
" \n",
" min | \n",
" 17.000000 | \n",
" 1.228500e+04 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 28.000000 | \n",
" 1.178270e+05 | \n",
" 9.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 40.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 37.000000 | \n",
" 1.783560e+05 | \n",
" 10.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 40.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 48.000000 | \n",
" 2.370510e+05 | \n",
" 12.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 45.000000 | \n",
"
\n",
" \n",
" max | \n",
" 90.000000 | \n",
" 1.484705e+06 | \n",
" 16.000000 | \n",
" 99999.000000 | \n",
" 4356.000000 | \n",
" 99.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Fnlwgt Edu_Num CapitalGain CapitalLoss \\\n",
"count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000 \n",
"mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830 \n",
"std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219 \n",
"min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 \n",
"25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000 \n",
"50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000 \n",
"75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000 \n",
"max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000 \n",
"\n",
" HrPerWk \n",
"count 32561.000000 \n",
"mean 40.437456 \n",
"std 12.347429 \n",
"min 1.000000 \n",
"25% 40.000000 \n",
"50% 40.000000 \n",
"75% 45.000000 \n",
"max 99.000000 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Numerical feature of summary/description \n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age 0\n",
"WorkClass 0\n",
"Fnlwgt 0\n",
"Education 0\n",
"Edu_Num 0\n",
"MaritalStatus 0\n",
"Occupation 0\n",
"Relationship 0\n",
"Race 0\n",
"Sex 0\n",
"CapitalGain 0\n",
"CapitalLoss 0\n",
"HrPerWk 0\n",
"Native 0\n",
"Target 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count '?' placeholder values (this dataset's missing-value marker).\n",
"# BUG FIX: the raw CSV stores values with a leading space (e.g. ' ?', ' <=50K'),\n",
"# so a plain (df == '?') comparison matched nothing and reported 0 everywhere.\n",
"# Strip whitespace before comparing so the real '?' markers are counted.\n",
"df_missing = df.apply(lambda col: col.astype(str).str.strip().eq('?').sum())\n",
"df_missing"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" <=50K 0.75919\n",
" >50K 0.24081\n",
"Name: Target, dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Target'].value_counts()/len(df) #calculate percentages"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" WorkClass | \n",
" Education | \n",
" MaritalStatus | \n",
" Occupation | \n",
" Relationship | \n",
" Race | \n",
" Sex | \n",
" Native | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" State-gov | \n",
" Bachelors | \n",
" Never-married | \n",
" Adm-clerical | \n",
" Not-in-family | \n",
" White | \n",
" Male | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 1 | \n",
" Self-emp-not-inc | \n",
" Bachelors | \n",
" Married-civ-spouse | \n",
" Exec-managerial | \n",
" Husband | \n",
" White | \n",
" Male | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 2 | \n",
" Private | \n",
" HS-grad | \n",
" Divorced | \n",
" Handlers-cleaners | \n",
" Not-in-family | \n",
" White | \n",
" Male | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 3 | \n",
" Private | \n",
" 11th | \n",
" Married-civ-spouse | \n",
" Handlers-cleaners | \n",
" Husband | \n",
" Black | \n",
" Male | \n",
" United-States | \n",
" <=50K | \n",
"
\n",
" \n",
" 4 | \n",
" Private | \n",
" Bachelors | \n",
" Married-civ-spouse | \n",
" Prof-specialty | \n",
" Wife | \n",
" Black | \n",
" Female | \n",
" Cuba | \n",
" <=50K | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" WorkClass Education MaritalStatus Occupation \\\n",
"0 State-gov Bachelors Never-married Adm-clerical \n",
"1 Self-emp-not-inc Bachelors Married-civ-spouse Exec-managerial \n",
"2 Private HS-grad Divorced Handlers-cleaners \n",
"3 Private 11th Married-civ-spouse Handlers-cleaners \n",
"4 Private Bachelors Married-civ-spouse Prof-specialty \n",
"\n",
" Relationship Race Sex Native Target \n",
"0 Not-in-family White Male United-States <=50K \n",
"1 Husband White Male United-States <=50K \n",
"2 Not-in-family White Male United-States <=50K \n",
"3 Husband Black Male United-States <=50K \n",
"4 Wife Black Female Cuba <=50K "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# encode categorical variables using label Encoder\n",
"\n",
"# select all categorical variables\n",
"df_categorical = df.select_dtypes(include=['object'])\n",
"df_categorical.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" WorkClass | \n",
" Education | \n",
" MaritalStatus | \n",
" Occupation | \n",
" Relationship | \n",
" Race | \n",
" Sex | \n",
" Native | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 7 | \n",
" 9 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" 4 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 6 | \n",
" 9 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 4 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 11 | \n",
" 0 | \n",
" 6 | \n",
" 1 | \n",
" 4 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
" 6 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" 9 | \n",
" 2 | \n",
" 10 | \n",
" 5 | \n",
" 2 | \n",
" 0 | \n",
" 5 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" WorkClass Education MaritalStatus Occupation Relationship Race Sex \\\n",
"0 7 9 4 1 1 4 1 \n",
"1 6 9 2 4 0 4 1 \n",
"2 4 11 0 6 1 4 1 \n",
"3 4 1 2 6 0 2 1 \n",
"4 4 9 2 10 5 2 0 \n",
"\n",
" Native Target \n",
"0 39 0 \n",
"1 39 0 \n",
"2 39 0 \n",
"3 39 0 \n",
"4 5 0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# apply label encoder to df_categorical\n",
"le = preprocessing.LabelEncoder()\n",
"df_categorical = df_categorical.apply(le.fit_transform)\n",
"df_categorical.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" Fnlwgt | \n",
" Edu_Num | \n",
" CapitalGain | \n",
" CapitalLoss | \n",
" HrPerWk | \n",
" WorkClass | \n",
" Education | \n",
" MaritalStatus | \n",
" Occupation | \n",
" Relationship | \n",
" Race | \n",
" Sex | \n",
" Native | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 39 | \n",
" 77516 | \n",
" 13 | \n",
" 2174 | \n",
" 0 | \n",
" 40 | \n",
" 7 | \n",
" 9 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" 4 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 50 | \n",
" 83311 | \n",
" 13 | \n",
" 0 | \n",
" 0 | \n",
" 13 | \n",
" 6 | \n",
" 9 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 4 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 38 | \n",
" 215646 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" 4 | \n",
" 11 | \n",
" 0 | \n",
" 6 | \n",
" 1 | \n",
" 4 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 53 | \n",
" 234721 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
" 6 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 28 | \n",
" 338409 | \n",
" 13 | \n",
" 0 | \n",
" 0 | \n",
" 40 | \n",
" 4 | \n",
" 9 | \n",
" 2 | \n",
" 10 | \n",
" 5 | \n",
" 2 | \n",
" 0 | \n",
" 5 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Fnlwgt Edu_Num CapitalGain CapitalLoss HrPerWk WorkClass \\\n",
"0 39 77516 13 2174 0 40 7 \n",
"1 50 83311 13 0 0 13 6 \n",
"2 38 215646 9 0 0 40 4 \n",
"3 53 234721 7 0 0 40 4 \n",
"4 28 338409 13 0 0 40 4 \n",
"\n",
" Education MaritalStatus Occupation Relationship Race Sex Native \\\n",
"0 9 4 1 1 4 1 39 \n",
"1 9 2 4 0 4 1 39 \n",
"2 11 0 6 1 4 1 39 \n",
"3 1 2 6 0 2 1 39 \n",
"4 9 2 10 5 2 0 5 \n",
"\n",
" Target \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace the original string-valued categorical columns with their\n",
"# encoded versions: drop them from df, then concatenate the encoded frame.\n",
"# errors='ignore' keeps this cell idempotent — re-running it no longer\n",
"# raises a KeyError once the columns have already been dropped.\n",
"df = df.drop(columns=df_categorical.columns, errors='ignore')\n",
"df = pd.concat([df, df_categorical], axis=1)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" Fnlwgt | \n",
" Edu_Num | \n",
" CapitalGain | \n",
" CapitalLoss | \n",
" HrPerWk | \n",
" WorkClass | \n",
" Education | \n",
" MaritalStatus | \n",
" Occupation | \n",
" Relationship | \n",
" Race | \n",
" Sex | \n",
" Native | \n",
" Target | \n",
"
\n",
" \n",
" \n",
" \n",
" Age | \n",
" 1.000000 | \n",
" -0.076646 | \n",
" 0.036527 | \n",
" 0.077674 | \n",
" 0.057775 | \n",
" 0.068756 | \n",
" 0.003787 | \n",
" -0.010508 | \n",
" -0.266288 | \n",
" -0.020947 | \n",
" -0.263698 | \n",
" 0.028718 | \n",
" 0.088832 | \n",
" -0.001151 | \n",
" 0.234037 | \n",
"
\n",
" \n",
" Fnlwgt | \n",
" -0.076646 | \n",
" 1.000000 | \n",
" -0.043195 | \n",
" 0.000432 | \n",
" -0.010252 | \n",
" -0.018768 | \n",
" -0.016656 | \n",
" -0.028145 | \n",
" 0.028153 | \n",
" 0.001597 | \n",
" 0.008931 | \n",
" -0.021291 | \n",
" 0.026858 | \n",
" -0.051966 | \n",
" -0.009463 | \n",
"
\n",
" \n",
" Edu_Num | \n",
" 0.036527 | \n",
" -0.043195 | \n",
" 1.000000 | \n",
" 0.122630 | \n",
" 0.079923 | \n",
" 0.148123 | \n",
" 0.052085 | \n",
" 0.359153 | \n",
" -0.069304 | \n",
" 0.109697 | \n",
" -0.094153 | \n",
" 0.031838 | \n",
" 0.012280 | \n",
" 0.050840 | \n",
" 0.335154 | \n",
"
\n",
" \n",
" CapitalGain | \n",
" 0.077674 | \n",
" 0.000432 | \n",
" 0.122630 | \n",
" 1.000000 | \n",
" -0.031615 | \n",
" 0.078409 | \n",
" 0.033835 | \n",
" 0.030046 | \n",
" -0.043393 | \n",
" 0.025505 | \n",
" -0.057919 | \n",
" 0.011145 | \n",
" 0.048480 | \n",
" -0.001982 | \n",
" 0.223329 | \n",
"
\n",
" \n",
" CapitalLoss | \n",
" 0.057775 | \n",
" -0.010252 | \n",
" 0.079923 | \n",
" -0.031615 | \n",
" 1.000000 | \n",
" 0.054256 | \n",
" 0.012216 | \n",
" 0.016746 | \n",
" -0.034187 | \n",
" 0.017987 | \n",
" -0.061062 | \n",
" 0.018899 | \n",
" 0.045567 | \n",
" 0.000419 | \n",
" 0.150526 | \n",
"
\n",
" \n",
" HrPerWk | \n",
" 0.068756 | \n",
" -0.018768 | \n",
" 0.148123 | \n",
" 0.078409 | \n",
" 0.054256 | \n",
" 1.000000 | \n",
" 0.138962 | \n",
" 0.055510 | \n",
" -0.190519 | \n",
" 0.080383 | \n",
" -0.248974 | \n",
" 0.041910 | \n",
" 0.229309 | \n",
" -0.002671 | \n",
" 0.229689 | \n",
"
\n",
" \n",
" WorkClass | \n",
" 0.003787 | \n",
" -0.016656 | \n",
" 0.052085 | \n",
" 0.033835 | \n",
" 0.012216 | \n",
" 0.138962 | \n",
" 1.000000 | \n",
" 0.023513 | \n",
" -0.064731 | \n",
" 0.254892 | \n",
" -0.090461 | \n",
" 0.049742 | \n",
" 0.095981 | \n",
" -0.007690 | \n",
" 0.051604 | \n",
"
\n",
" \n",
" Education | \n",
" -0.010508 | \n",
" -0.028145 | \n",
" 0.359153 | \n",
" 0.030046 | \n",
" 0.016746 | \n",
" 0.055510 | \n",
" 0.023513 | \n",
" 1.000000 | \n",
" -0.038407 | \n",
" -0.021260 | \n",
" -0.010876 | \n",
" 0.014131 | \n",
" -0.027356 | \n",
" 0.064288 | \n",
" 0.079317 | \n",
"
\n",
" \n",
" MaritalStatus | \n",
" -0.266288 | \n",
" 0.028153 | \n",
" -0.069304 | \n",
" -0.043393 | \n",
" -0.034187 | \n",
" -0.190519 | \n",
" -0.064731 | \n",
" -0.038407 | \n",
" 1.000000 | \n",
" -0.009654 | \n",
" 0.185451 | \n",
" -0.068013 | \n",
" -0.129314 | \n",
" -0.023819 | \n",
" -0.199307 | \n",
"
\n",
" \n",
" Occupation | \n",
" -0.020947 | \n",
" 0.001597 | \n",
" 0.109697 | \n",
" 0.025505 | \n",
" 0.017987 | \n",
" 0.080383 | \n",
" 0.254892 | \n",
" -0.021260 | \n",
" -0.009654 | \n",
" 1.000000 | \n",
" -0.075607 | \n",
" 0.006763 | \n",
" 0.080296 | \n",
" -0.012543 | \n",
" 0.075468 | \n",
"
\n",
" \n",
" Relationship | \n",
" -0.263698 | \n",
" 0.008931 | \n",
" -0.094153 | \n",
" -0.057919 | \n",
" -0.061062 | \n",
" -0.248974 | \n",
" -0.090461 | \n",
" -0.010876 | \n",
" 0.185451 | \n",
" -0.075607 | \n",
" 1.000000 | \n",
" -0.116055 | \n",
" -0.582454 | \n",
" -0.005507 | \n",
" -0.250918 | \n",
"
\n",
" \n",
" Race | \n",
" 0.028718 | \n",
" -0.021291 | \n",
" 0.031838 | \n",
" 0.011145 | \n",
" 0.018899 | \n",
" 0.041910 | \n",
" 0.049742 | \n",
" 0.014131 | \n",
" -0.068013 | \n",
" 0.006763 | \n",
" -0.116055 | \n",
" 1.000000 | \n",
" 0.087204 | \n",
" 0.137852 | \n",
" 0.071846 | \n",
"
\n",
" \n",
" Sex | \n",
" 0.088832 | \n",
" 0.026858 | \n",
" 0.012280 | \n",
" 0.048480 | \n",
" 0.045567 | \n",
" 0.229309 | \n",
" 0.095981 | \n",
" -0.027356 | \n",
" -0.129314 | \n",
" 0.080296 | \n",
" -0.582454 | \n",
" 0.087204 | \n",
" 1.000000 | \n",
" -0.008119 | \n",
" 0.215980 | \n",
"
\n",
" \n",
" Native | \n",
" -0.001151 | \n",
" -0.051966 | \n",
" 0.050840 | \n",
" -0.001982 | \n",
" 0.000419 | \n",
" -0.002671 | \n",
" -0.007690 | \n",
" 0.064288 | \n",
" -0.023819 | \n",
" -0.012543 | \n",
" -0.005507 | \n",
" 0.137852 | \n",
" -0.008119 | \n",
" 1.000000 | \n",
" 0.015840 | \n",
"
\n",
" \n",
" Target | \n",
" 0.234037 | \n",
" -0.009463 | \n",
" 0.335154 | \n",
" 0.223329 | \n",
" 0.150526 | \n",
" 0.229689 | \n",
" 0.051604 | \n",
" 0.079317 | \n",
" -0.199307 | \n",
" 0.075468 | \n",
" -0.250918 | \n",
" 0.071846 | \n",
" 0.215980 | \n",
" 0.015840 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Fnlwgt Edu_Num CapitalGain CapitalLoss \\\n",
"Age 1.000000 -0.076646 0.036527 0.077674 0.057775 \n",
"Fnlwgt -0.076646 1.000000 -0.043195 0.000432 -0.010252 \n",
"Edu_Num 0.036527 -0.043195 1.000000 0.122630 0.079923 \n",
"CapitalGain 0.077674 0.000432 0.122630 1.000000 -0.031615 \n",
"CapitalLoss 0.057775 -0.010252 0.079923 -0.031615 1.000000 \n",
"HrPerWk 0.068756 -0.018768 0.148123 0.078409 0.054256 \n",
"WorkClass 0.003787 -0.016656 0.052085 0.033835 0.012216 \n",
"Education -0.010508 -0.028145 0.359153 0.030046 0.016746 \n",
"MaritalStatus -0.266288 0.028153 -0.069304 -0.043393 -0.034187 \n",
"Occupation -0.020947 0.001597 0.109697 0.025505 0.017987 \n",
"Relationship -0.263698 0.008931 -0.094153 -0.057919 -0.061062 \n",
"Race 0.028718 -0.021291 0.031838 0.011145 0.018899 \n",
"Sex 0.088832 0.026858 0.012280 0.048480 0.045567 \n",
"Native -0.001151 -0.051966 0.050840 -0.001982 0.000419 \n",
"Target 0.234037 -0.009463 0.335154 0.223329 0.150526 \n",
"\n",
" HrPerWk WorkClass Education MaritalStatus Occupation \\\n",
"Age 0.068756 0.003787 -0.010508 -0.266288 -0.020947 \n",
"Fnlwgt -0.018768 -0.016656 -0.028145 0.028153 0.001597 \n",
"Edu_Num 0.148123 0.052085 0.359153 -0.069304 0.109697 \n",
"CapitalGain 0.078409 0.033835 0.030046 -0.043393 0.025505 \n",
"CapitalLoss 0.054256 0.012216 0.016746 -0.034187 0.017987 \n",
"HrPerWk 1.000000 0.138962 0.055510 -0.190519 0.080383 \n",
"WorkClass 0.138962 1.000000 0.023513 -0.064731 0.254892 \n",
"Education 0.055510 0.023513 1.000000 -0.038407 -0.021260 \n",
"MaritalStatus -0.190519 -0.064731 -0.038407 1.000000 -0.009654 \n",
"Occupation 0.080383 0.254892 -0.021260 -0.009654 1.000000 \n",
"Relationship -0.248974 -0.090461 -0.010876 0.185451 -0.075607 \n",
"Race 0.041910 0.049742 0.014131 -0.068013 0.006763 \n",
"Sex 0.229309 0.095981 -0.027356 -0.129314 0.080296 \n",
"Native -0.002671 -0.007690 0.064288 -0.023819 -0.012543 \n",
"Target 0.229689 0.051604 0.079317 -0.199307 0.075468 \n",
"\n",
" Relationship Race Sex Native Target \n",
"Age -0.263698 0.028718 0.088832 -0.001151 0.234037 \n",
"Fnlwgt 0.008931 -0.021291 0.026858 -0.051966 -0.009463 \n",
"Edu_Num -0.094153 0.031838 0.012280 0.050840 0.335154 \n",
"CapitalGain -0.057919 0.011145 0.048480 -0.001982 0.223329 \n",
"CapitalLoss -0.061062 0.018899 0.045567 0.000419 0.150526 \n",
"HrPerWk -0.248974 0.041910 0.229309 -0.002671 0.229689 \n",
"WorkClass -0.090461 0.049742 0.095981 -0.007690 0.051604 \n",
"Education -0.010876 0.014131 -0.027356 0.064288 0.079317 \n",
"MaritalStatus 0.185451 -0.068013 -0.129314 -0.023819 -0.199307 \n",
"Occupation -0.075607 0.006763 0.080296 -0.012543 0.075468 \n",
"Relationship 1.000000 -0.116055 -0.582454 -0.005507 -0.250918 \n",
"Race -0.116055 1.000000 0.087204 0.137852 0.071846 \n",
"Sex -0.582454 0.087204 1.000000 -0.008119 0.215980 \n",
"Native -0.005507 0.137852 -0.008119 1.000000 0.015840 \n",
"Target -0.250918 0.071846 0.215980 0.015840 1.000000 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"corr_matrix=df.corr()\n",
"corr_matrix"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import seaborn as sn\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Heatmap of the correlation matrix; annot=True prints each coefficient.\n",
"# FIX: plt.subplots returns a (figure, axes) tuple — the original bound the\n",
"# whole tuple to a name called `fig`. Unpack it and pass the axes explicitly;\n",
"# the trailing ';' suppresses the AxesSubplot repr in the cell output.\n",
"fig, ax = plt.subplots(figsize=(17, 14))\n",
"sn.heatmap(corr_matrix, annot=True, ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 24720\n",
"1 7841\n",
"Name: Target, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Features X = every column except the label; response y = the label itself.\n",
"X = df.drop(columns='Target')\n",
"y = df['Target']"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Splitting the data into train and test\n",
"X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=99)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22792\n",
"22792\n",
"9769\n",
"9769\n",
"0.6999785018887626\n",
"0.30002149811123735\n"
]
}
],
"source": [
"# Sanity-check the split: row counts line up and the ratio is ~70/30.\n",
"# (Removed a dead `y_train.dtypes` statement whose result was discarded —\n",
"# it produced no output and had no effect.)\n",
"print(len(X_train))\n",
"print(len(y_train))\n",
"print(len(X_test))\n",
"print(len(y_test))\n",
"print(len(X_train)/(len(X_train)+len(X_test)))\n",
"print(len(X_test)/(len(X_train)+len(X_test)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now use AutoML\n",
"#### Example using the TPOT library"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tpot/builtins/__init__.py:36: UserWarning: Warning: optional dependency `torch` is not available. - skipping import of NN models.\n",
" warnings.warn(\"Warning: optional dependency `torch` is not available. - skipping import of NN models.\")\n"
]
}
],
"source": [
"import tpot"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Optimization Progress: 0%| | 0/30 [00:00, ?pipeline/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generation 1 - Current best internal CV score: 0.8726748059978281\n",
"\n",
"Generation 2 - Current best internal CV score: 0.8726748059978281\n",
"\n",
"Generation 3 - Current best internal CV score: 0.8726748059978281\n",
"\n",
"Generation 4 - Current best internal CV score: 0.8726748059978281\n",
"\n",
"Generation 5 - Current best internal CV score: 0.8726748059978281\n",
"\n",
"Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)\n"
]
},
{
"data": {
"text/plain": [
"TPOTClassifier(generations=5, population_size=5, verbosity=2)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from tpot import TPOTClassifier\n",
"from tpot import TPOTRegressor\n",
"\n",
"tpot = TPOTClassifier(generations=5, population_size=5, verbosity=2)\n",
"\n",
"tpot.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 operators have been imported by TPOT.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Optimization Progress: 0%| | 0/30 [00:00, ?pipeline/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 1 - Current Pareto front scores:\n",
"\n",
"-1\t0.846393379147429\tExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=20, ExtraTreesClassifier__min_samples_split=18, ExtraTreesClassifier__n_estimators=100)\n",
"\n",
"Generation 2 - Current Pareto front scores:\n",
"\n",
"-1\t0.8489381528958578\tExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=15, ExtraTreesClassifier__min_samples_split=18, ExtraTreesClassifier__n_estimators=100)\n",
"_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..\n",
"\n",
"Generation 3 - Current Pareto front scores:\n",
"\n",
"-1\t0.8675415817248977\tGradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=10, GradientBoostingClassifier__max_features=0.5, GradientBoostingClassifier__min_samples_leaf=8, GradientBoostingClassifier__min_samples_split=8, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=1.0)\n",
"_pre_test decorator: _random_mutation_operator: num_test=0 feature_names mismatch: ['Age', 'Fnlwgt', 'Edu_Num', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'WorkClass', 'Education', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13']\n",
"expected Edu_Num, Race, Sex, Native, WorkClass, Education, Fnlwgt, HrPerWk, CapitalLoss, Relationship, Occupation, Age, CapitalGain, MaritalStatus in input data\n",
"training data did not have the following fields: f9, f8, f11, f4, f5, f3, f6, f13, f2, f0, f1, f7, f10, f12.\n",
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 4 - Current Pareto front scores:\n",
"\n",
"-1\t0.8682435189121499\tGradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=10, GradientBoostingClassifier__max_features=0.5, GradientBoostingClassifier__min_samples_leaf=17, GradientBoostingClassifier__min_samples_split=8, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=1.0)\n",
"_pre_test decorator: _random_mutation_operator: num_test=0 feature_names mismatch: ['Age', 'Fnlwgt', 'Edu_Num', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'WorkClass', 'Education', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13']\n",
"expected Edu_Num, Race, Sex, Native, WorkClass, Education, Fnlwgt, HrPerWk, CapitalLoss, Relationship, Occupation, Age, CapitalGain, MaritalStatus in input data\n",
"training data did not have the following fields: f9, f8, f11, f4, f5, f3, f6, f13, f2, f0, f1, f7, f10, f12.\n",
"_pre_test decorator: _random_mutation_operator: num_test=0 '(slice(None, None, None), 0)' is an invalid key.\n",
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 5 - Current Pareto front scores:\n",
"\n",
"-1\t0.8682435189121499\tGradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=10, GradientBoostingClassifier__max_features=0.5, GradientBoostingClassifier__min_samples_leaf=17, GradientBoostingClassifier__min_samples_split=8, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=1.0)\n"
]
},
{
"data": {
"text/plain": [
"TPOTClassifier(generations=5, population_size=5, verbosity=3)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# Set verbosity=3 to see per-generation Pareto front scores during the search\n",
"from tpot import TPOTClassifier\n",
"from tpot import TPOTRegressor\n",
"\n",
"tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3)\n",
"\n",
"tpot.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 operators have been imported by TPOT.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Optimization Progress: 0%| | 0/30 [00:00, ?pipeline/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generation 1 - Current Pareto front scores:\n",
"\n",
"-1\t0.8431029144382738\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
"_pre_test decorator: _random_mutation_operator: num_test=0 cosine was provided as affinity. Ward can only work with euclidean distances..\n",
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 2 - Current Pareto front scores:\n",
"\n",
"-1\t0.8431029144382738\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 3 - Current Pareto front scores:\n",
"\n",
"-1\t0.8541157276721251\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=7, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 4 - Current Pareto front scores:\n",
"\n",
"-1\t0.8541157276721251\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=7, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
"\n",
"Generation 5 - Current Pareto front scores:\n",
"\n",
"-1\t0.8541157276721251\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=7, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n"
]
},
{
"data": {
"text/plain": [
"TPOTClassifier(generations=5, n_jobs=4, population_size=5, verbosity=3)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# Same search as above, but with n_jobs=4 to evaluate pipelines in parallel (verbosity stays at 3)\n",
"from tpot import TPOTClassifier\n",
"from tpot import TPOTRegressor\n",
"\n",
"tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3, n_jobs=4) #, max_time_mins=3)\n",
"\n",
"tpot.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"32 operators have been imported by TPOT.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Optimization Progress: 0%| | 0/30 [00:00, ?pipeline/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generation 1 - Current Pareto front scores:\n",
"\n",
"-1\t0.8579326621149012\tRandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=True, RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_features=0.9500000000000001, RandomForestClassifier__min_samples_leaf=4, RandomForestClassifier__min_samples_split=4, RandomForestClassifier__n_estimators=100)\n",
"\n",
"-2\t0.8691648409459862\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
"\n",
"Generation 2 - Current Pareto front scores:\n",
"\n",
"-1\t0.8690771312808586\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
"\n",
"-2\t0.8691648409459862\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
"Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
"\n",
"Generation 3 - Current Pareto front scores:\n",
"\n",
"-1\t0.8690771312808586\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
"\n",
"-2\t0.8691648409459862\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
"\n",
"Generation 4 - Current Pareto front scores:\n",
"\n",
"-1\t0.8690771312808586\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
"\n",
"-2\t0.8700862977252755\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=1.0, XGBClassifier__verbosity=0)\n",
"\n",
"Generation 5 - Current Pareto front scores:\n",
"\n",
"-1\t0.8700862977252755\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=1.0, XGBClassifier__verbosity=0)\n"
]
},
{
"data": {
"text/plain": [
"TPOTClassifier(generations=5, n_jobs=4, population_size=5, scoring='accuracy',\n",
" verbosity=3)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# Add scoring='accuracy' explicitly (this is the default) alongside verbosity=3 and n_jobs=4\n",
"from tpot import TPOTClassifier\n",
"from tpot import TPOTRegressor\n",
"\n",
"tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3, n_jobs=4, scoring='accuracy')\n",
"\n",
"tpot.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
    "#Add the parameter random_state and give it a value\n",
"#Copy the code and add this parameter.\n",
"#Q: Are the results different to what is shown above?\n",
    "#Q: Rerun this code. Do the outputs change? If they remain the same, why?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8670283550005118"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tpot.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
    "#export the best pipeline found by TPOT as a standalone Python script\n",
    "#NOTE(review): hardcoded absolute local path - change this to a path valid on your machine\n",
    "tpot.export('/Users/brendan.tierney/Dropbox/4-Datasets/tpot_Adult_pipeline.py')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('xgbclassifier',\n",
" XGBClassifier(base_score=0.5, booster='gbtree',\n",
" colsample_bylevel=1, colsample_bynode=1,\n",
" colsample_bytree=1, gamma=0, gpu_id=-1,\n",
" importance_type='gain',\n",
" interaction_constraints='', learning_rate=0.1,\n",
" max_delta_step=0, max_depth=9,\n",
" min_child_weight=11, missing=nan,\n",
" monotone_constraints='()', n_estimators=100,\n",
" n_jobs=1, num_parallel_tree=1, random_state=0,\n",
" reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n",
" subsample=1.0, tree_method='exact',\n",
" validate_parameters=1, verbosity=0))])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tpot.fitted_pipeline_"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}