{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### TU257 - AutoML Demo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Run this notebook and examine what is produced for each cell.\n", "\n", "#### Add your own comments to enrich the information given to make it more meaningful to you.\n", "\n", "#### In this notebook we will have a look at two different AutoML libraries. See the notes/website for links to posts illustrating other AutoML libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeWorkClassFnlwgtEducationEdu_NumMaritalStatusOccupationRelationshipRaceSexCapitalGainCapitalLossHrPerWkNativeTarget
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
537Private284582Masters14Married-civ-spouseExec-managerialWifeWhiteFemale0040United-States<=50K
649Private1601879th5Married-spouse-absentOther-serviceNot-in-familyBlackFemale0016Jamaica<=50K
752Self-emp-not-inc209642HS-grad9Married-civ-spouseExec-managerialHusbandWhiteMale0045United-States>50K
831Private45781Masters14Never-marriedProf-specialtyNot-in-familyWhiteFemale14084050United-States>50K
942Private159449Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale5178040United-States>50K
\n", "
" ], "text/plain": [ " Age WorkClass Fnlwgt Education Edu_Num \\\n", "0 39 State-gov 77516 Bachelors 13 \n", "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", "2 38 Private 215646 HS-grad 9 \n", "3 53 Private 234721 11th 7 \n", "4 28 Private 338409 Bachelors 13 \n", "5 37 Private 284582 Masters 14 \n", "6 49 Private 160187 9th 5 \n", "7 52 Self-emp-not-inc 209642 HS-grad 9 \n", "8 31 Private 45781 Masters 14 \n", "9 42 Private 159449 Bachelors 13 \n", "\n", " MaritalStatus Occupation Relationship Race \\\n", "0 Never-married Adm-clerical Not-in-family White \n", "1 Married-civ-spouse Exec-managerial Husband White \n", "2 Divorced Handlers-cleaners Not-in-family White \n", "3 Married-civ-spouse Handlers-cleaners Husband Black \n", "4 Married-civ-spouse Prof-specialty Wife Black \n", "5 Married-civ-spouse Exec-managerial Wife White \n", "6 Married-spouse-absent Other-service Not-in-family Black \n", "7 Married-civ-spouse Exec-managerial Husband White \n", "8 Never-married Prof-specialty Not-in-family White \n", "9 Married-civ-spouse Exec-managerial Husband White \n", "\n", " Sex CapitalGain CapitalLoss HrPerWk Native Target \n", "0 Male 2174 0 40 United-States <=50K \n", "1 Male 0 0 13 United-States <=50K \n", "2 Male 0 0 40 United-States <=50K \n", "3 Male 0 0 40 United-States <=50K \n", "4 Female 0 0 40 Cuba <=50K \n", "5 Female 0 0 40 United-States <=50K \n", "6 Female 0 0 16 Jamaica <=50K \n", "7 Male 0 0 45 United-States >50K \n", "8 Female 14084 0 50 United-States >50K \n", "9 Male 5178 0 40 United-States >50K " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "#Load in the dataset\n", "colnames=['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target'] \n", "df = pd.read_csv('/Users/brendan.tierney/Dropbox/4-Datasets/adult.csv', names=colnames, header=None)\n", 
"df.head(10)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().values.any()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows : 32561\n", "Columns : 15\n", "\n", "Features : \n", " ['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target']\n", "\n", "Missing values : 0\n", "\n", "Unique values : \n", " Age 73\n", "WorkClass 9\n", "Fnlwgt 21648\n", "Education 16\n", "Edu_Num 16\n", "MaritalStatus 7\n", "Occupation 15\n", "Relationship 6\n", "Race 5\n", "Sex 2\n", "CapitalGain 119\n", "CapitalLoss 92\n", "HrPerWk 94\n", "Native 42\n", "Target 2\n", "dtype: int64\n" ] } ], "source": [ "print (\"Rows : \" ,df.shape[0])\n", "print (\"Columns : \" ,df.shape[1])\n", "print (\"\\nFeatures : \\n\" ,df.columns.tolist())\n", "print (\"\\nMissing values : \", df.isnull().sum().values.sum())\n", "print (\"\\nUnique values : \\n\",df.nunique())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 32561 entries, 0 to 32560\n", "Data columns (total 15 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Age 32561 non-null int64 \n", " 1 WorkClass 32561 non-null object\n", " 2 Fnlwgt 32561 non-null int64 \n", " 3 Education 32561 non-null object\n", " 4 Edu_Num 32561 non-null int64 \n", " 5 MaritalStatus 32561 non-null object\n", " 6 Occupation 32561 non-null object\n", " 7 Relationship 32561 non-null object\n", " 8 Race 32561 non-null object\n", " 9 Sex 32561 non-null object\n", " 10 CapitalGain 32561 non-null int64 \n", " 11 CapitalLoss 32561 non-null int64 \n", " 12 
HrPerWk 32561 non-null int64 \n", " 13 Native 32561 non-null object\n", " 14 Target 32561 non-null object\n", "dtypes: int64(6), object(9)\n", "memory usage: 3.7+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFnlwgtEdu_NumCapitalGainCapitalLossHrPerWk
count32561.0000003.256100e+0432561.00000032561.00000032561.00000032561.000000
mean38.5816471.897784e+0510.0806791077.64884487.30383040.437456
std13.6404331.055500e+052.5727207385.292085402.96021912.347429
min17.0000001.228500e+041.0000000.0000000.0000001.000000
25%28.0000001.178270e+059.0000000.0000000.00000040.000000
50%37.0000001.783560e+0510.0000000.0000000.00000040.000000
75%48.0000002.370510e+0512.0000000.0000000.00000045.000000
max90.0000001.484705e+0616.00000099999.0000004356.00000099.000000
\n", "
" ], "text/plain": [ " Age Fnlwgt Edu_Num CapitalGain CapitalLoss \\\n", "count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000 \n", "mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830 \n", "std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219 \n", "min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 \n", "25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000 \n", "50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000 \n", "75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000 \n", "max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000 \n", "\n", " HrPerWk \n", "count 32561.000000 \n", "mean 40.437456 \n", "std 12.347429 \n", "min 1.000000 \n", "25% 40.000000 \n", "50% 40.000000 \n", "75% 45.000000 \n", "max 99.000000 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Numerical feature of summary/description \n", "df.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Age 0\n", "WorkClass 0\n", "Fnlwgt 0\n", "Education 0\n", "Edu_Num 0\n", "MaritalStatus 0\n", "Occupation 0\n", "Relationship 0\n", "Race 0\n", "Sex 0\n", "CapitalGain 0\n", "CapitalLoss 0\n", "HrPerWk 0\n", "Native 0\n", "Target 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count the \"?\" placeholder entries in each column of the dataset.\n", "# The string values appear to carry a leading space (the Target labels print as ' <=50K'),\n", "# so an exact df=='?' comparison matches nothing and reports 0 for every column.\n", "# Compare on whitespace-stripped text instead; astype(str) lets numeric columns pass through.\n", "df_missing = df.apply(lambda col: (col.astype(str).str.strip() == '?').sum())\n", "df_missing" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " <=50K 0.75919\n", " >50K 0.24081\n", "Name: Target, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Target'].value_counts()/len(df) #calculate percentages" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
0State-govBachelorsNever-marriedAdm-clericalNot-in-familyWhiteMaleUnited-States<=50K
1Self-emp-not-incBachelorsMarried-civ-spouseExec-managerialHusbandWhiteMaleUnited-States<=50K
2PrivateHS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMaleUnited-States<=50K
3Private11thMarried-civ-spouseHandlers-cleanersHusbandBlackMaleUnited-States<=50K
4PrivateBachelorsMarried-civ-spouseProf-specialtyWifeBlackFemaleCuba<=50K
\n", "
" ], "text/plain": [ " WorkClass Education MaritalStatus Occupation \\\n", "0 State-gov Bachelors Never-married Adm-clerical \n", "1 Self-emp-not-inc Bachelors Married-civ-spouse Exec-managerial \n", "2 Private HS-grad Divorced Handlers-cleaners \n", "3 Private 11th Married-civ-spouse Handlers-cleaners \n", "4 Private Bachelors Married-civ-spouse Prof-specialty \n", "\n", " Relationship Race Sex Native Target \n", "0 Not-in-family White Male United-States <=50K \n", "1 Husband White Male United-States <=50K \n", "2 Not-in-family White Male United-States <=50K \n", "3 Husband Black Male United-States <=50K \n", "4 Wife Black Female Cuba <=50K " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "# encode categorical variables using label Encoder\n", "\n", "# select all categorical variables\n", "df_categorical = df.select_dtypes(include=['object'])\n", "df_categorical.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
07941141390
16924041390
241106141390
34126021390
44921052050
\n", "
" ], "text/plain": [ " WorkClass Education MaritalStatus Occupation Relationship Race Sex \\\n", "0 7 9 4 1 1 4 1 \n", "1 6 9 2 4 0 4 1 \n", "2 4 11 0 6 1 4 1 \n", "3 4 1 2 6 0 2 1 \n", "4 4 9 2 10 5 2 0 \n", "\n", " Native Target \n", "0 39 0 \n", "1 39 0 \n", "2 39 0 \n", "3 39 0 \n", "4 5 0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# apply label encoder to df_categorical\n", "le = preprocessing.LabelEncoder()\n", "df_categorical = df_categorical.apply(le.fit_transform)\n", "df_categorical.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFnlwgtEdu_NumCapitalGainCapitalLossHrPerWkWorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
039775161321740407941141390
150833111300136924041390
2382156469004041106141390
353234721700404126021390
4283384091300404921052050
\n", "
" ], "text/plain": [ " Age Fnlwgt Edu_Num CapitalGain CapitalLoss HrPerWk WorkClass \\\n", "0 39 77516 13 2174 0 40 7 \n", "1 50 83311 13 0 0 13 6 \n", "2 38 215646 9 0 0 40 4 \n", "3 53 234721 7 0 0 40 4 \n", "4 28 338409 13 0 0 40 4 \n", "\n", " Education MaritalStatus Occupation Relationship Race Sex Native \\\n", "0 9 4 1 1 4 1 39 \n", "1 9 2 4 0 4 1 39 \n", "2 11 0 6 1 4 1 39 \n", "3 1 2 6 0 2 1 39 \n", "4 9 2 10 5 2 0 5 \n", "\n", " Target \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Next, Concatenate df_categorical dataframe with original df (dataframe)\n", "\n", "# first, Drop earlier duplicate columns which had categorical values\n", "df = df.drop(df_categorical.columns,axis=1)\n", "df = pd.concat([df,df_categorical],axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFnlwgtEdu_NumCapitalGainCapitalLossHrPerWkWorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
Age1.000000-0.0766460.0365270.0776740.0577750.0687560.003787-0.010508-0.266288-0.020947-0.2636980.0287180.088832-0.0011510.234037
Fnlwgt-0.0766461.000000-0.0431950.000432-0.010252-0.018768-0.016656-0.0281450.0281530.0015970.008931-0.0212910.026858-0.051966-0.009463
Edu_Num0.036527-0.0431951.0000000.1226300.0799230.1481230.0520850.359153-0.0693040.109697-0.0941530.0318380.0122800.0508400.335154
CapitalGain0.0776740.0004320.1226301.000000-0.0316150.0784090.0338350.030046-0.0433930.025505-0.0579190.0111450.048480-0.0019820.223329
CapitalLoss0.057775-0.0102520.079923-0.0316151.0000000.0542560.0122160.016746-0.0341870.017987-0.0610620.0188990.0455670.0004190.150526
HrPerWk0.068756-0.0187680.1481230.0784090.0542561.0000000.1389620.055510-0.1905190.080383-0.2489740.0419100.229309-0.0026710.229689
WorkClass0.003787-0.0166560.0520850.0338350.0122160.1389621.0000000.023513-0.0647310.254892-0.0904610.0497420.095981-0.0076900.051604
Education-0.010508-0.0281450.3591530.0300460.0167460.0555100.0235131.000000-0.038407-0.021260-0.0108760.014131-0.0273560.0642880.079317
MaritalStatus-0.2662880.028153-0.069304-0.043393-0.034187-0.190519-0.064731-0.0384071.000000-0.0096540.185451-0.068013-0.129314-0.023819-0.199307
Occupation-0.0209470.0015970.1096970.0255050.0179870.0803830.254892-0.021260-0.0096541.000000-0.0756070.0067630.080296-0.0125430.075468
Relationship-0.2636980.008931-0.094153-0.057919-0.061062-0.248974-0.090461-0.0108760.185451-0.0756071.000000-0.116055-0.582454-0.005507-0.250918
Race0.028718-0.0212910.0318380.0111450.0188990.0419100.0497420.014131-0.0680130.006763-0.1160551.0000000.0872040.1378520.071846
Sex0.0888320.0268580.0122800.0484800.0455670.2293090.095981-0.027356-0.1293140.080296-0.5824540.0872041.000000-0.0081190.215980
Native-0.001151-0.0519660.050840-0.0019820.000419-0.002671-0.0076900.064288-0.023819-0.012543-0.0055070.137852-0.0081191.0000000.015840
Target0.234037-0.0094630.3351540.2233290.1505260.2296890.0516040.079317-0.1993070.075468-0.2509180.0718460.2159800.0158401.000000
\n", "
" ], "text/plain": [ " Age Fnlwgt Edu_Num CapitalGain CapitalLoss \\\n", "Age 1.000000 -0.076646 0.036527 0.077674 0.057775 \n", "Fnlwgt -0.076646 1.000000 -0.043195 0.000432 -0.010252 \n", "Edu_Num 0.036527 -0.043195 1.000000 0.122630 0.079923 \n", "CapitalGain 0.077674 0.000432 0.122630 1.000000 -0.031615 \n", "CapitalLoss 0.057775 -0.010252 0.079923 -0.031615 1.000000 \n", "HrPerWk 0.068756 -0.018768 0.148123 0.078409 0.054256 \n", "WorkClass 0.003787 -0.016656 0.052085 0.033835 0.012216 \n", "Education -0.010508 -0.028145 0.359153 0.030046 0.016746 \n", "MaritalStatus -0.266288 0.028153 -0.069304 -0.043393 -0.034187 \n", "Occupation -0.020947 0.001597 0.109697 0.025505 0.017987 \n", "Relationship -0.263698 0.008931 -0.094153 -0.057919 -0.061062 \n", "Race 0.028718 -0.021291 0.031838 0.011145 0.018899 \n", "Sex 0.088832 0.026858 0.012280 0.048480 0.045567 \n", "Native -0.001151 -0.051966 0.050840 -0.001982 0.000419 \n", "Target 0.234037 -0.009463 0.335154 0.223329 0.150526 \n", "\n", " HrPerWk WorkClass Education MaritalStatus Occupation \\\n", "Age 0.068756 0.003787 -0.010508 -0.266288 -0.020947 \n", "Fnlwgt -0.018768 -0.016656 -0.028145 0.028153 0.001597 \n", "Edu_Num 0.148123 0.052085 0.359153 -0.069304 0.109697 \n", "CapitalGain 0.078409 0.033835 0.030046 -0.043393 0.025505 \n", "CapitalLoss 0.054256 0.012216 0.016746 -0.034187 0.017987 \n", "HrPerWk 1.000000 0.138962 0.055510 -0.190519 0.080383 \n", "WorkClass 0.138962 1.000000 0.023513 -0.064731 0.254892 \n", "Education 0.055510 0.023513 1.000000 -0.038407 -0.021260 \n", "MaritalStatus -0.190519 -0.064731 -0.038407 1.000000 -0.009654 \n", "Occupation 0.080383 0.254892 -0.021260 -0.009654 1.000000 \n", "Relationship -0.248974 -0.090461 -0.010876 0.185451 -0.075607 \n", "Race 0.041910 0.049742 0.014131 -0.068013 0.006763 \n", "Sex 0.229309 0.095981 -0.027356 -0.129314 0.080296 \n", "Native -0.002671 -0.007690 0.064288 -0.023819 -0.012543 \n", "Target 0.229689 0.051604 0.079317 -0.199307 0.075468 \n", "\n", " 
Relationship Race Sex Native Target \n", "Age -0.263698 0.028718 0.088832 -0.001151 0.234037 \n", "Fnlwgt 0.008931 -0.021291 0.026858 -0.051966 -0.009463 \n", "Edu_Num -0.094153 0.031838 0.012280 0.050840 0.335154 \n", "CapitalGain -0.057919 0.011145 0.048480 -0.001982 0.223329 \n", "CapitalLoss -0.061062 0.018899 0.045567 0.000419 0.150526 \n", "HrPerWk -0.248974 0.041910 0.229309 -0.002671 0.229689 \n", "WorkClass -0.090461 0.049742 0.095981 -0.007690 0.051604 \n", "Education -0.010876 0.014131 -0.027356 0.064288 0.079317 \n", "MaritalStatus 0.185451 -0.068013 -0.129314 -0.023819 -0.199307 \n", "Occupation -0.075607 0.006763 0.080296 -0.012543 0.075468 \n", "Relationship 1.000000 -0.116055 -0.582454 -0.005507 -0.250918 \n", "Race -0.116055 1.000000 0.087204 0.137852 0.071846 \n", "Sex -0.582454 0.087204 1.000000 -0.008119 0.215980 \n", "Native -0.005507 0.137852 -0.008119 1.000000 0.015840 \n", "Target -0.250918 0.071846 0.215980 0.015840 1.000000 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corr_matrix=df.corr()\n", "corr_matrix" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import seaborn as sn\n", "import matplotlib.pyplot as plt\n", "\n", "fig = plt.subplots(figsize=(17,14))\n", "sn.heatmap(corr_matrix, annot=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 24720\n", "1 7841\n", "Name: Target, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Target'].value_counts()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Putting independent variables/features to X\n", "X = df.drop('Target',axis=1)\n", "\n", "# Putting 
response/dependent variable/feature to y\n", "y = df['Target']" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Splitting the data into train and test\n", "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=99)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "22792\n", "22792\n", "9769\n", "9769\n", "0.6999785018887626\n", "0.30002149811123735\n" ] } ], "source": [ "y_train.dtypes\n", "print(len(X_train))\n", "print(len(y_train))\n", "print(len(X_test))\n", "print(len(y_test))\n", "print(len(X_train)/(len(X_train)+len(X_test)))\n", "print(len(X_test)/(len(X_train)+len(X_test)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now use AutoML\n", "#### example using tpot library" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tpot/builtins/__init__.py:36: UserWarning: Warning: optional dependency `torch` is not available. - skipping import of NN models.\n", " warnings.warn(\"Warning: optional dependency `torch` is not available. - skipping import of NN models.\")\n" ] } ], "source": [ "import tpot" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Optimization Progress: 0%| | 0/30 [00:00