{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TU257 - AutoML Demo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Run this notebook and examine what is produced for each cell.\n",
    "\n",
    "#### Add your own comments to enrich the information given to make it more meaningful to you.\n",
    "\n",
    "#### In this notebook we will have a look at two different AutoML libraries.  See the notes/website for links to posts illustrating other AutoML libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>WorkClass</th>\n",
       "      <th>Fnlwgt</th>\n",
       "      <th>Education</th>\n",
       "      <th>Edu_Num</th>\n",
       "      <th>MaritalStatus</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Relationship</th>\n",
       "      <th>Race</th>\n",
       "      <th>Sex</th>\n",
       "      <th>CapitalGain</th>\n",
       "      <th>CapitalLoss</th>\n",
       "      <th>HrPerWk</th>\n",
       "      <th>Native</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>State-gov</td>\n",
       "      <td>77516</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>83311</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>Private</td>\n",
       "      <td>215646</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>Private</td>\n",
       "      <td>234721</td>\n",
       "      <td>11th</td>\n",
       "      <td>7</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>Private</td>\n",
       "      <td>338409</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>Cuba</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>37</td>\n",
       "      <td>Private</td>\n",
       "      <td>284582</td>\n",
       "      <td>Masters</td>\n",
       "      <td>14</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Wife</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>49</td>\n",
       "      <td>Private</td>\n",
       "      <td>160187</td>\n",
       "      <td>9th</td>\n",
       "      <td>5</td>\n",
       "      <td>Married-spouse-absent</td>\n",
       "      <td>Other-service</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>Jamaica</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>52</td>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>209642</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>9</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>45</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>31</td>\n",
       "      <td>Private</td>\n",
       "      <td>45781</td>\n",
       "      <td>Masters</td>\n",
       "      <td>14</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Female</td>\n",
       "      <td>14084</td>\n",
       "      <td>0</td>\n",
       "      <td>50</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>42</td>\n",
       "      <td>Private</td>\n",
       "      <td>159449</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>13</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>5178</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&gt;50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Age          WorkClass  Fnlwgt   Education  Edu_Num  \\\n",
       "0   39          State-gov   77516   Bachelors       13   \n",
       "1   50   Self-emp-not-inc   83311   Bachelors       13   \n",
       "2   38            Private  215646     HS-grad        9   \n",
       "3   53            Private  234721        11th        7   \n",
       "4   28            Private  338409   Bachelors       13   \n",
       "5   37            Private  284582     Masters       14   \n",
       "6   49            Private  160187         9th        5   \n",
       "7   52   Self-emp-not-inc  209642     HS-grad        9   \n",
       "8   31            Private   45781     Masters       14   \n",
       "9   42            Private  159449   Bachelors       13   \n",
       "\n",
       "            MaritalStatus          Occupation    Relationship    Race  \\\n",
       "0           Never-married        Adm-clerical   Not-in-family   White   \n",
       "1      Married-civ-spouse     Exec-managerial         Husband   White   \n",
       "2                Divorced   Handlers-cleaners   Not-in-family   White   \n",
       "3      Married-civ-spouse   Handlers-cleaners         Husband   Black   \n",
       "4      Married-civ-spouse      Prof-specialty            Wife   Black   \n",
       "5      Married-civ-spouse     Exec-managerial            Wife   White   \n",
       "6   Married-spouse-absent       Other-service   Not-in-family   Black   \n",
       "7      Married-civ-spouse     Exec-managerial         Husband   White   \n",
       "8           Never-married      Prof-specialty   Not-in-family   White   \n",
       "9      Married-civ-spouse     Exec-managerial         Husband   White   \n",
       "\n",
       "       Sex  CapitalGain  CapitalLoss  HrPerWk          Native  Target  \n",
       "0     Male         2174            0       40   United-States   <=50K  \n",
       "1     Male            0            0       13   United-States   <=50K  \n",
       "2     Male            0            0       40   United-States   <=50K  \n",
       "3     Male            0            0       40   United-States   <=50K  \n",
       "4   Female            0            0       40            Cuba   <=50K  \n",
       "5   Female            0            0       40   United-States   <=50K  \n",
       "6   Female            0            0       16         Jamaica   <=50K  \n",
       "7     Male            0            0       45   United-States    >50K  \n",
       "8   Female        14084            0       50   United-States    >50K  \n",
       "9     Male         5178            0       40   United-States    >50K  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "#Load in the dataset\n",
    "colnames=['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target'] \n",
    "df = pd.read_csv('/Users/brendan.tierney/Dropbox/4-Datasets/adult.csv', names=colnames, header=None)\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().values.any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows     :  32561\n",
      "Columns  :  15\n",
      "\n",
      "Features : \n",
      " ['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target']\n",
      "\n",
      "Missing values :   0\n",
      "\n",
      "Unique values :  \n",
      " Age                 73\n",
      "WorkClass            9\n",
      "Fnlwgt           21648\n",
      "Education           16\n",
      "Edu_Num             16\n",
      "MaritalStatus        7\n",
      "Occupation          15\n",
      "Relationship         6\n",
      "Race                 5\n",
      "Sex                  2\n",
      "CapitalGain        119\n",
      "CapitalLoss         92\n",
      "HrPerWk             94\n",
      "Native              42\n",
      "Target               2\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print (\"Rows     : \" ,df.shape[0])\n",
    "print (\"Columns  : \" ,df.shape[1])\n",
    "print (\"\\nFeatures : \\n\" ,df.columns.tolist())\n",
    "print (\"\\nMissing values :  \", df.isnull().sum().values.sum())\n",
    "print (\"\\nUnique values :  \\n\",df.nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 32561 entries, 0 to 32560\n",
      "Data columns (total 15 columns):\n",
      " #   Column         Non-Null Count  Dtype \n",
      "---  ------         --------------  ----- \n",
      " 0   Age            32561 non-null  int64 \n",
      " 1   WorkClass      32561 non-null  object\n",
      " 2   Fnlwgt         32561 non-null  int64 \n",
      " 3   Education      32561 non-null  object\n",
      " 4   Edu_Num        32561 non-null  int64 \n",
      " 5   MaritalStatus  32561 non-null  object\n",
      " 6   Occupation     32561 non-null  object\n",
      " 7   Relationship   32561 non-null  object\n",
      " 8   Race           32561 non-null  object\n",
      " 9   Sex            32561 non-null  object\n",
      " 10  CapitalGain    32561 non-null  int64 \n",
      " 11  CapitalLoss    32561 non-null  int64 \n",
      " 12  HrPerWk        32561 non-null  int64 \n",
      " 13  Native         32561 non-null  object\n",
      " 14  Target         32561 non-null  object\n",
      "dtypes: int64(6), object(9)\n",
      "memory usage: 3.7+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fnlwgt</th>\n",
       "      <th>Edu_Num</th>\n",
       "      <th>CapitalGain</th>\n",
       "      <th>CapitalLoss</th>\n",
       "      <th>HrPerWk</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>32561.000000</td>\n",
       "      <td>3.256100e+04</td>\n",
       "      <td>32561.000000</td>\n",
       "      <td>32561.000000</td>\n",
       "      <td>32561.000000</td>\n",
       "      <td>32561.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>38.581647</td>\n",
       "      <td>1.897784e+05</td>\n",
       "      <td>10.080679</td>\n",
       "      <td>1077.648844</td>\n",
       "      <td>87.303830</td>\n",
       "      <td>40.437456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>13.640433</td>\n",
       "      <td>1.055500e+05</td>\n",
       "      <td>2.572720</td>\n",
       "      <td>7385.292085</td>\n",
       "      <td>402.960219</td>\n",
       "      <td>12.347429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>17.000000</td>\n",
       "      <td>1.228500e+04</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>1.178270e+05</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>40.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>37.000000</td>\n",
       "      <td>1.783560e+05</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>40.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>48.000000</td>\n",
       "      <td>2.370510e+05</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>45.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>90.000000</td>\n",
       "      <td>1.484705e+06</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>99999.000000</td>\n",
       "      <td>4356.000000</td>\n",
       "      <td>99.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                Age        Fnlwgt       Edu_Num   CapitalGain   CapitalLoss  \\\n",
       "count  32561.000000  3.256100e+04  32561.000000  32561.000000  32561.000000   \n",
       "mean      38.581647  1.897784e+05     10.080679   1077.648844     87.303830   \n",
       "std       13.640433  1.055500e+05      2.572720   7385.292085    402.960219   \n",
       "min       17.000000  1.228500e+04      1.000000      0.000000      0.000000   \n",
       "25%       28.000000  1.178270e+05      9.000000      0.000000      0.000000   \n",
       "50%       37.000000  1.783560e+05     10.000000      0.000000      0.000000   \n",
       "75%       48.000000  2.370510e+05     12.000000      0.000000      0.000000   \n",
       "max       90.000000  1.484705e+06     16.000000  99999.000000   4356.000000   \n",
       "\n",
       "            HrPerWk  \n",
       "count  32561.000000  \n",
       "mean      40.437456  \n",
       "std       12.347429  \n",
       "min        1.000000  \n",
       "25%       40.000000  \n",
       "50%       40.000000  \n",
       "75%       45.000000  \n",
       "max       99.000000  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Numerical feature of summary/description \n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Age              0\n",
       "WorkClass        0\n",
       "Fnlwgt           0\n",
       "Education        0\n",
       "Edu_Num          0\n",
       "MaritalStatus    0\n",
       "Occupation       0\n",
       "Relationship     0\n",
       "Race             0\n",
       "Sex              0\n",
       "CapitalGain      0\n",
       "CapitalLoss      0\n",
       "HrPerWk          0\n",
       "Native           0\n",
       "Target           0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# checking \"?\" values, how many are there in the whole dataset\n",
    "df_missing = (df=='?').sum()\n",
    "df_missing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       " <=50K    0.75919\n",
       " >50K     0.24081\n",
       "Name: Target, dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Target'].value_counts()/len(df)   #calculate percentages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WorkClass</th>\n",
       "      <th>Education</th>\n",
       "      <th>MaritalStatus</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Relationship</th>\n",
       "      <th>Race</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Native</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>State-gov</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Never-married</td>\n",
       "      <td>Adm-clerical</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Self-emp-not-inc</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Exec-managerial</td>\n",
       "      <td>Husband</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Private</td>\n",
       "      <td>HS-grad</td>\n",
       "      <td>Divorced</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Not-in-family</td>\n",
       "      <td>White</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Private</td>\n",
       "      <td>11th</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Handlers-cleaners</td>\n",
       "      <td>Husband</td>\n",
       "      <td>Black</td>\n",
       "      <td>Male</td>\n",
       "      <td>United-States</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Private</td>\n",
       "      <td>Bachelors</td>\n",
       "      <td>Married-civ-spouse</td>\n",
       "      <td>Prof-specialty</td>\n",
       "      <td>Wife</td>\n",
       "      <td>Black</td>\n",
       "      <td>Female</td>\n",
       "      <td>Cuba</td>\n",
       "      <td>&lt;=50K</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           WorkClass   Education        MaritalStatus          Occupation  \\\n",
       "0          State-gov   Bachelors        Never-married        Adm-clerical   \n",
       "1   Self-emp-not-inc   Bachelors   Married-civ-spouse     Exec-managerial   \n",
       "2            Private     HS-grad             Divorced   Handlers-cleaners   \n",
       "3            Private        11th   Married-civ-spouse   Handlers-cleaners   \n",
       "4            Private   Bachelors   Married-civ-spouse      Prof-specialty   \n",
       "\n",
       "     Relationship    Race      Sex          Native  Target  \n",
       "0   Not-in-family   White     Male   United-States   <=50K  \n",
       "1         Husband   White     Male   United-States   <=50K  \n",
       "2   Not-in-family   White     Male   United-States   <=50K  \n",
       "3         Husband   Black     Male   United-States   <=50K  \n",
       "4            Wife   Black   Female            Cuba   <=50K  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "# encode categorical variables using label Encoder\n",
    "\n",
    "# select all categorical variables\n",
    "df_categorical = df.select_dtypes(include=['object'])\n",
    "df_categorical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>WorkClass</th>\n",
       "      <th>Education</th>\n",
       "      <th>MaritalStatus</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Relationship</th>\n",
       "      <th>Race</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Native</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   WorkClass  Education  MaritalStatus  Occupation  Relationship  Race  Sex  \\\n",
       "0          7          9              4           1             1     4    1   \n",
       "1          6          9              2           4             0     4    1   \n",
       "2          4         11              0           6             1     4    1   \n",
       "3          4          1              2           6             0     2    1   \n",
       "4          4          9              2          10             5     2    0   \n",
       "\n",
       "   Native  Target  \n",
       "0      39       0  \n",
       "1      39       0  \n",
       "2      39       0  \n",
       "3      39       0  \n",
       "4       5       0  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# apply label encoder to df_categorical\n",
    "le = preprocessing.LabelEncoder()\n",
    "df_categorical = df_categorical.apply(le.fit_transform)\n",
    "df_categorical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fnlwgt</th>\n",
       "      <th>Edu_Num</th>\n",
       "      <th>CapitalGain</th>\n",
       "      <th>CapitalLoss</th>\n",
       "      <th>HrPerWk</th>\n",
       "      <th>WorkClass</th>\n",
       "      <th>Education</th>\n",
       "      <th>MaritalStatus</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Relationship</th>\n",
       "      <th>Race</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Native</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>39</td>\n",
       "      <td>77516</td>\n",
       "      <td>13</td>\n",
       "      <td>2174</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>50</td>\n",
       "      <td>83311</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>215646</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>4</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>53</td>\n",
       "      <td>234721</td>\n",
       "      <td>7</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>39</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>28</td>\n",
       "      <td>338409</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Age  Fnlwgt  Edu_Num  CapitalGain  CapitalLoss  HrPerWk  WorkClass  \\\n",
       "0   39   77516       13         2174            0       40          7   \n",
       "1   50   83311       13            0            0       13          6   \n",
       "2   38  215646        9            0            0       40          4   \n",
       "3   53  234721        7            0            0       40          4   \n",
       "4   28  338409       13            0            0       40          4   \n",
       "\n",
       "   Education  MaritalStatus  Occupation  Relationship  Race  Sex  Native  \\\n",
       "0          9              4           1             1     4    1      39   \n",
       "1          9              2           4             0     4    1      39   \n",
       "2         11              0           6             1     4    1      39   \n",
       "3          1              2           6             0     2    1      39   \n",
       "4          9              2          10             5     2    0       5   \n",
       "\n",
       "   Target  \n",
       "0       0  \n",
       "1       0  \n",
       "2       0  \n",
       "3       0  \n",
       "4       0  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Next, Concatenate df_categorical dataframe with original df (dataframe)\n",
    "\n",
    "# first, Drop earlier duplicate columns which had categorical values\n",
    "df = df.drop(df_categorical.columns,axis=1)\n",
    "df = pd.concat([df,df_categorical],axis=1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Fnlwgt</th>\n",
       "      <th>Edu_Num</th>\n",
       "      <th>CapitalGain</th>\n",
       "      <th>CapitalLoss</th>\n",
       "      <th>HrPerWk</th>\n",
       "      <th>WorkClass</th>\n",
       "      <th>Education</th>\n",
       "      <th>MaritalStatus</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Relationship</th>\n",
       "      <th>Race</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Native</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Age</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.076646</td>\n",
       "      <td>0.036527</td>\n",
       "      <td>0.077674</td>\n",
       "      <td>0.057775</td>\n",
       "      <td>0.068756</td>\n",
       "      <td>0.003787</td>\n",
       "      <td>-0.010508</td>\n",
       "      <td>-0.266288</td>\n",
       "      <td>-0.020947</td>\n",
       "      <td>-0.263698</td>\n",
       "      <td>0.028718</td>\n",
       "      <td>0.088832</td>\n",
       "      <td>-0.001151</td>\n",
       "      <td>0.234037</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Fnlwgt</th>\n",
       "      <td>-0.076646</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.043195</td>\n",
       "      <td>0.000432</td>\n",
       "      <td>-0.010252</td>\n",
       "      <td>-0.018768</td>\n",
       "      <td>-0.016656</td>\n",
       "      <td>-0.028145</td>\n",
       "      <td>0.028153</td>\n",
       "      <td>0.001597</td>\n",
       "      <td>0.008931</td>\n",
       "      <td>-0.021291</td>\n",
       "      <td>0.026858</td>\n",
       "      <td>-0.051966</td>\n",
       "      <td>-0.009463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Edu_Num</th>\n",
       "      <td>0.036527</td>\n",
       "      <td>-0.043195</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.122630</td>\n",
       "      <td>0.079923</td>\n",
       "      <td>0.148123</td>\n",
       "      <td>0.052085</td>\n",
       "      <td>0.359153</td>\n",
       "      <td>-0.069304</td>\n",
       "      <td>0.109697</td>\n",
       "      <td>-0.094153</td>\n",
       "      <td>0.031838</td>\n",
       "      <td>0.012280</td>\n",
       "      <td>0.050840</td>\n",
       "      <td>0.335154</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CapitalGain</th>\n",
       "      <td>0.077674</td>\n",
       "      <td>0.000432</td>\n",
       "      <td>0.122630</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.031615</td>\n",
       "      <td>0.078409</td>\n",
       "      <td>0.033835</td>\n",
       "      <td>0.030046</td>\n",
       "      <td>-0.043393</td>\n",
       "      <td>0.025505</td>\n",
       "      <td>-0.057919</td>\n",
       "      <td>0.011145</td>\n",
       "      <td>0.048480</td>\n",
       "      <td>-0.001982</td>\n",
       "      <td>0.223329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CapitalLoss</th>\n",
       "      <td>0.057775</td>\n",
       "      <td>-0.010252</td>\n",
       "      <td>0.079923</td>\n",
       "      <td>-0.031615</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.054256</td>\n",
       "      <td>0.012216</td>\n",
       "      <td>0.016746</td>\n",
       "      <td>-0.034187</td>\n",
       "      <td>0.017987</td>\n",
       "      <td>-0.061062</td>\n",
       "      <td>0.018899</td>\n",
       "      <td>0.045567</td>\n",
       "      <td>0.000419</td>\n",
       "      <td>0.150526</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HrPerWk</th>\n",
       "      <td>0.068756</td>\n",
       "      <td>-0.018768</td>\n",
       "      <td>0.148123</td>\n",
       "      <td>0.078409</td>\n",
       "      <td>0.054256</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.138962</td>\n",
       "      <td>0.055510</td>\n",
       "      <td>-0.190519</td>\n",
       "      <td>0.080383</td>\n",
       "      <td>-0.248974</td>\n",
       "      <td>0.041910</td>\n",
       "      <td>0.229309</td>\n",
       "      <td>-0.002671</td>\n",
       "      <td>0.229689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>WorkClass</th>\n",
       "      <td>0.003787</td>\n",
       "      <td>-0.016656</td>\n",
       "      <td>0.052085</td>\n",
       "      <td>0.033835</td>\n",
       "      <td>0.012216</td>\n",
       "      <td>0.138962</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.023513</td>\n",
       "      <td>-0.064731</td>\n",
       "      <td>0.254892</td>\n",
       "      <td>-0.090461</td>\n",
       "      <td>0.049742</td>\n",
       "      <td>0.095981</td>\n",
       "      <td>-0.007690</td>\n",
       "      <td>0.051604</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Education</th>\n",
       "      <td>-0.010508</td>\n",
       "      <td>-0.028145</td>\n",
       "      <td>0.359153</td>\n",
       "      <td>0.030046</td>\n",
       "      <td>0.016746</td>\n",
       "      <td>0.055510</td>\n",
       "      <td>0.023513</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.038407</td>\n",
       "      <td>-0.021260</td>\n",
       "      <td>-0.010876</td>\n",
       "      <td>0.014131</td>\n",
       "      <td>-0.027356</td>\n",
       "      <td>0.064288</td>\n",
       "      <td>0.079317</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MaritalStatus</th>\n",
       "      <td>-0.266288</td>\n",
       "      <td>0.028153</td>\n",
       "      <td>-0.069304</td>\n",
       "      <td>-0.043393</td>\n",
       "      <td>-0.034187</td>\n",
       "      <td>-0.190519</td>\n",
       "      <td>-0.064731</td>\n",
       "      <td>-0.038407</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.009654</td>\n",
       "      <td>0.185451</td>\n",
       "      <td>-0.068013</td>\n",
       "      <td>-0.129314</td>\n",
       "      <td>-0.023819</td>\n",
       "      <td>-0.199307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Occupation</th>\n",
       "      <td>-0.020947</td>\n",
       "      <td>0.001597</td>\n",
       "      <td>0.109697</td>\n",
       "      <td>0.025505</td>\n",
       "      <td>0.017987</td>\n",
       "      <td>0.080383</td>\n",
       "      <td>0.254892</td>\n",
       "      <td>-0.021260</td>\n",
       "      <td>-0.009654</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.075607</td>\n",
       "      <td>0.006763</td>\n",
       "      <td>0.080296</td>\n",
       "      <td>-0.012543</td>\n",
       "      <td>0.075468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Relationship</th>\n",
       "      <td>-0.263698</td>\n",
       "      <td>0.008931</td>\n",
       "      <td>-0.094153</td>\n",
       "      <td>-0.057919</td>\n",
       "      <td>-0.061062</td>\n",
       "      <td>-0.248974</td>\n",
       "      <td>-0.090461</td>\n",
       "      <td>-0.010876</td>\n",
       "      <td>0.185451</td>\n",
       "      <td>-0.075607</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.116055</td>\n",
       "      <td>-0.582454</td>\n",
       "      <td>-0.005507</td>\n",
       "      <td>-0.250918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Race</th>\n",
       "      <td>0.028718</td>\n",
       "      <td>-0.021291</td>\n",
       "      <td>0.031838</td>\n",
       "      <td>0.011145</td>\n",
       "      <td>0.018899</td>\n",
       "      <td>0.041910</td>\n",
       "      <td>0.049742</td>\n",
       "      <td>0.014131</td>\n",
       "      <td>-0.068013</td>\n",
       "      <td>0.006763</td>\n",
       "      <td>-0.116055</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.087204</td>\n",
       "      <td>0.137852</td>\n",
       "      <td>0.071846</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Sex</th>\n",
       "      <td>0.088832</td>\n",
       "      <td>0.026858</td>\n",
       "      <td>0.012280</td>\n",
       "      <td>0.048480</td>\n",
       "      <td>0.045567</td>\n",
       "      <td>0.229309</td>\n",
       "      <td>0.095981</td>\n",
       "      <td>-0.027356</td>\n",
       "      <td>-0.129314</td>\n",
       "      <td>0.080296</td>\n",
       "      <td>-0.582454</td>\n",
       "      <td>0.087204</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.008119</td>\n",
       "      <td>0.215980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Native</th>\n",
       "      <td>-0.001151</td>\n",
       "      <td>-0.051966</td>\n",
       "      <td>0.050840</td>\n",
       "      <td>-0.001982</td>\n",
       "      <td>0.000419</td>\n",
       "      <td>-0.002671</td>\n",
       "      <td>-0.007690</td>\n",
       "      <td>0.064288</td>\n",
       "      <td>-0.023819</td>\n",
       "      <td>-0.012543</td>\n",
       "      <td>-0.005507</td>\n",
       "      <td>0.137852</td>\n",
       "      <td>-0.008119</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.015840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Target</th>\n",
       "      <td>0.234037</td>\n",
       "      <td>-0.009463</td>\n",
       "      <td>0.335154</td>\n",
       "      <td>0.223329</td>\n",
       "      <td>0.150526</td>\n",
       "      <td>0.229689</td>\n",
       "      <td>0.051604</td>\n",
       "      <td>0.079317</td>\n",
       "      <td>-0.199307</td>\n",
       "      <td>0.075468</td>\n",
       "      <td>-0.250918</td>\n",
       "      <td>0.071846</td>\n",
       "      <td>0.215980</td>\n",
       "      <td>0.015840</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Age    Fnlwgt   Edu_Num  CapitalGain  CapitalLoss  \\\n",
       "Age            1.000000 -0.076646  0.036527     0.077674     0.057775   \n",
       "Fnlwgt        -0.076646  1.000000 -0.043195     0.000432    -0.010252   \n",
       "Edu_Num        0.036527 -0.043195  1.000000     0.122630     0.079923   \n",
       "CapitalGain    0.077674  0.000432  0.122630     1.000000    -0.031615   \n",
       "CapitalLoss    0.057775 -0.010252  0.079923    -0.031615     1.000000   \n",
       "HrPerWk        0.068756 -0.018768  0.148123     0.078409     0.054256   \n",
       "WorkClass      0.003787 -0.016656  0.052085     0.033835     0.012216   \n",
       "Education     -0.010508 -0.028145  0.359153     0.030046     0.016746   \n",
       "MaritalStatus -0.266288  0.028153 -0.069304    -0.043393    -0.034187   \n",
       "Occupation    -0.020947  0.001597  0.109697     0.025505     0.017987   \n",
       "Relationship  -0.263698  0.008931 -0.094153    -0.057919    -0.061062   \n",
       "Race           0.028718 -0.021291  0.031838     0.011145     0.018899   \n",
       "Sex            0.088832  0.026858  0.012280     0.048480     0.045567   \n",
       "Native        -0.001151 -0.051966  0.050840    -0.001982     0.000419   \n",
       "Target         0.234037 -0.009463  0.335154     0.223329     0.150526   \n",
       "\n",
       "                HrPerWk  WorkClass  Education  MaritalStatus  Occupation  \\\n",
       "Age            0.068756   0.003787  -0.010508      -0.266288   -0.020947   \n",
       "Fnlwgt        -0.018768  -0.016656  -0.028145       0.028153    0.001597   \n",
       "Edu_Num        0.148123   0.052085   0.359153      -0.069304    0.109697   \n",
       "CapitalGain    0.078409   0.033835   0.030046      -0.043393    0.025505   \n",
       "CapitalLoss    0.054256   0.012216   0.016746      -0.034187    0.017987   \n",
       "HrPerWk        1.000000   0.138962   0.055510      -0.190519    0.080383   \n",
       "WorkClass      0.138962   1.000000   0.023513      -0.064731    0.254892   \n",
       "Education      0.055510   0.023513   1.000000      -0.038407   -0.021260   \n",
       "MaritalStatus -0.190519  -0.064731  -0.038407       1.000000   -0.009654   \n",
       "Occupation     0.080383   0.254892  -0.021260      -0.009654    1.000000   \n",
       "Relationship  -0.248974  -0.090461  -0.010876       0.185451   -0.075607   \n",
       "Race           0.041910   0.049742   0.014131      -0.068013    0.006763   \n",
       "Sex            0.229309   0.095981  -0.027356      -0.129314    0.080296   \n",
       "Native        -0.002671  -0.007690   0.064288      -0.023819   -0.012543   \n",
       "Target         0.229689   0.051604   0.079317      -0.199307    0.075468   \n",
       "\n",
       "               Relationship      Race       Sex    Native    Target  \n",
       "Age               -0.263698  0.028718  0.088832 -0.001151  0.234037  \n",
       "Fnlwgt             0.008931 -0.021291  0.026858 -0.051966 -0.009463  \n",
       "Edu_Num           -0.094153  0.031838  0.012280  0.050840  0.335154  \n",
       "CapitalGain       -0.057919  0.011145  0.048480 -0.001982  0.223329  \n",
       "CapitalLoss       -0.061062  0.018899  0.045567  0.000419  0.150526  \n",
       "HrPerWk           -0.248974  0.041910  0.229309 -0.002671  0.229689  \n",
       "WorkClass         -0.090461  0.049742  0.095981 -0.007690  0.051604  \n",
       "Education         -0.010876  0.014131 -0.027356  0.064288  0.079317  \n",
       "MaritalStatus      0.185451 -0.068013 -0.129314 -0.023819 -0.199307  \n",
       "Occupation        -0.075607  0.006763  0.080296 -0.012543  0.075468  \n",
       "Relationship       1.000000 -0.116055 -0.582454 -0.005507 -0.250918  \n",
       "Race              -0.116055  1.000000  0.087204  0.137852  0.071846  \n",
       "Sex               -0.582454  0.087204  1.000000 -0.008119  0.215980  \n",
       "Native            -0.005507  0.137852 -0.008119  1.000000  0.015840  \n",
       "Target            -0.250918  0.071846  0.215980  0.015840  1.000000  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corr_matrix=df.corr()\n",
    "corr_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import seaborn as sn\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "fig = plt.subplots(figsize=(17,14))\n",
    "sn.heatmap(corr_matrix, annot=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    24720\n",
       "1     7841\n",
       "Name: Target, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['Target'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Putting independent variables/features to X\n",
    "X = df.drop('Target',axis=1)\n",
    "\n",
    "# Putting response/dependent variable/feature to y\n",
    "y = df['Target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splitting the data into train and test\n",
    "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22792\n",
      "22792\n",
      "9769\n",
      "9769\n",
      "0.6999785018887626\n",
      "0.30002149811123735\n"
     ]
    }
   ],
   "source": [
    "y_train.dtypes\n",
    "print(len(X_train))\n",
    "print(len(y_train))\n",
    "print(len(X_test))\n",
    "print(len(y_test))\n",
    "print(len(X_train)/(len(X_train)+len(X_test)))\n",
    "print(len(X_test)/(len(X_train)+len(X_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now use AutoML\n",
    "#### example using tpot library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tpot/builtins/__init__.py:36: UserWarning: Warning: optional dependency `torch` is not available. - skipping import of NN models.\n",
      "  warnings.warn(\"Warning: optional dependency `torch` is not available. - skipping import of NN models.\")\n"
     ]
    }
   ],
   "source": [
    "import tpot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Generation 1 - Current best internal CV score: 0.8726748059978281\n",
      "\n",
      "Generation 2 - Current best internal CV score: 0.8726748059978281\n",
      "\n",
      "Generation 3 - Current best internal CV score: 0.8726748059978281\n",
      "\n",
      "Generation 4 - Current best internal CV score: 0.8726748059978281\n",
      "\n",
      "Generation 5 - Current best internal CV score: 0.8726748059978281\n",
      "\n",
      "Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TPOTClassifier(generations=5, population_size=5, verbosity=2)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tpot import TPOTClassifier\n",
    "from tpot import TPOTRegressor\n",
    "\n",
    "tpot = TPOTClassifier(generations=5, population_size=5, verbosity=2)\n",
    "\n",
    "tpot.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 operators have been imported by TPOT.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 1 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.846393379147429\tExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=20, ExtraTreesClassifier__min_samples_split=18, ExtraTreesClassifier__n_estimators=100)\n",
      "\n",
      "Generation 2 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8489381528958578\tExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.7500000000000001, ExtraTreesClassifier__min_samples_leaf=15, ExtraTreesClassifier__min_samples_split=18, ExtraTreesClassifier__n_estimators=100)\n",
      "_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..\n",
      "\n",
      "Generation 3 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8675415817248977\tGradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=10, GradientBoostingClassifier__max_features=0.5, GradientBoostingClassifier__min_samples_leaf=8, GradientBoostingClassifier__min_samples_split=8, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=1.0)\n",
      "_pre_test decorator: _random_mutation_operator: num_test=0 feature_names mismatch: ['Age', 'Fnlwgt', 'Edu_Num', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'WorkClass', 'Education', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13']\n",
      "expected Edu_Num, Race, Sex, Native, WorkClass, Education, Fnlwgt, HrPerWk, CapitalLoss, Relationship, Occupation, Age, CapitalGain, MaritalStatus in input data\n",
      "training data did not have the following fields: f9, f8, f11, f4, f5, f3, f6, f13, f2, f0, f1, f7, f10, f12.\n",
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 4 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8682435189121499\tGradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=10, GradientBoostingClassifier__max_features=0.5, GradientBoostingClassifier__min_samples_leaf=17, GradientBoostingClassifier__min_samples_split=8, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=1.0)\n",
      "_pre_test decorator: _random_mutation_operator: num_test=0 feature_names mismatch: ['Age', 'Fnlwgt', 'Edu_Num', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'WorkClass', 'Education', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13']\n",
      "expected Edu_Num, Race, Sex, Native, WorkClass, Education, Fnlwgt, HrPerWk, CapitalLoss, Relationship, Occupation, Age, CapitalGain, MaritalStatus in input data\n",
      "training data did not have the following fields: f9, f8, f11, f4, f5, f3, f6, f13, f2, f0, f1, f7, f10, f12.\n",
      "_pre_test decorator: _random_mutation_operator: num_test=0 '(slice(None, None, None), 0)' is an invalid key.\n",
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 5 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8682435189121499\tGradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=10, GradientBoostingClassifier__max_features=0.5, GradientBoostingClassifier__min_samples_leaf=17, GradientBoostingClassifier__min_samples_split=8, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=1.0)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TPOTClassifier(generations=5, population_size=5, verbosity=3)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Change Verboity = 3\n",
    "from tpot import TPOTClassifier\n",
    "from tpot import TPOTRegressor\n",
    "\n",
    "tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3)\n",
    "\n",
    "tpot.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 operators have been imported by TPOT.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Generation 1 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8431029144382738\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
      "_pre_test decorator: _random_mutation_operator: num_test=0 cosine was provided as affinity. Ward can only work with euclidean distances..\n",
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 2 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8431029144382738\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=4, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 3 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8541157276721251\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=7, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 4 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8541157276721251\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=7, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n",
      "\n",
      "Generation 5 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8541157276721251\tDecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=7, DecisionTreeClassifier__min_samples_leaf=10, DecisionTreeClassifier__min_samples_split=4)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TPOTClassifier(generations=5, n_jobs=4, population_size=5, verbosity=3)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Change Verboity = 4\n",
    "from tpot import TPOTClassifier\n",
    "from tpot import TPOTRegressor\n",
    "\n",
    "tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3, n_jobs=4) #, max_time_mins=3)\n",
    "\n",
    "tpot.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 operators have been imported by TPOT.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Generation 1 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8579326621149012\tRandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=True, RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_features=0.9500000000000001, RandomForestClassifier__min_samples_leaf=4, RandomForestClassifier__min_samples_split=4, RandomForestClassifier__n_estimators=100)\n",
      "\n",
      "-2\t0.8691648409459862\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
      "\n",
      "Generation 2 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8690771312808586\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
      "\n",
      "-2\t0.8691648409459862\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
      "Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.\n",
      "\n",
      "Generation 3 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8690771312808586\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
      "\n",
      "-2\t0.8691648409459862\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
      "\n",
      "Generation 4 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8690771312808586\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.8, XGBClassifier__verbosity=0)\n",
      "\n",
      "-2\t0.8700862977252755\tXGBClassifier(MaxAbsScaler(input_matrix), XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=1.0, XGBClassifier__verbosity=0)\n",
      "\n",
      "Generation 5 - Current Pareto front scores:\n",
      "\n",
      "-1\t0.8700862977252755\tXGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=9, XGBClassifier__min_child_weight=11, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=1.0, XGBClassifier__verbosity=0)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TPOTClassifier(generations=5, n_jobs=4, population_size=5, scoring='accuracy',\n",
       "               verbosity=3)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Change Verboity = 4  & scoring = accuracy (which is default value)\n",
    "from tpot import TPOTClassifier\n",
    "from tpot import TPOTRegressor\n",
    "\n",
    "tpot = TPOTClassifier(generations=5, population_size=5, verbosity=3, n_jobs=4, scoring='accuracy')\n",
    "\n",
    "tpot.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Add the parameter  random_state  and give it a values\n",
    "#Copy the code and add this parameter.\n",
    "#Q: Are the results different to what is shown above?\n",
    "#Q: Rerun this code and see if the outputs change?  If they remain the same, why?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8670283550005118"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpot.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export the model\n",
    "tpot.export('/Users/brendan.tierney/Dropbox/4-Datasets/tpot_Adult_pipeline.py')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('xgbclassifier',\n",
       "                 XGBClassifier(base_score=0.5, booster='gbtree',\n",
       "                               colsample_bylevel=1, colsample_bynode=1,\n",
       "                               colsample_bytree=1, gamma=0, gpu_id=-1,\n",
       "                               importance_type='gain',\n",
       "                               interaction_constraints='', learning_rate=0.1,\n",
       "                               max_delta_step=0, max_depth=9,\n",
       "                               min_child_weight=11, missing=nan,\n",
       "                               monotone_constraints='()', n_estimators=100,\n",
       "                               n_jobs=1, num_parallel_tree=1, random_state=0,\n",
       "                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,\n",
       "                               subsample=1.0, tree_method='exact',\n",
       "                               validate_parameters=1, verbosity=0))])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpot.fitted_pipeline_"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}