{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### TU257-Lab6-Demo1-Models\n", "\n", "In this demo notebook we will work through examples of using KNN, SVM and Nnets.\n", "\n", "We will use the same dataset from last week. Some of the initial cells prepare the data for the models\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from matplotlib import pyplot as plt\n", "from sklearn import datasets\n", "from sklearn.tree import DecisionTreeClassifier \n", "from sklearn import tree\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeWorkClassFnlwgtEducationEdu_NumMaritalStatusOccupationRelationshipRaceSexCapitalGainCapitalLossHrPerWkNativeTarget
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
537Private284582Masters14Married-civ-spouseExec-managerialWifeWhiteFemale0040United-States<=50K
649Private1601879th5Married-spouse-absentOther-serviceNot-in-familyBlackFemale0016Jamaica<=50K
752Self-emp-not-inc209642HS-grad9Married-civ-spouseExec-managerialHusbandWhiteMale0045United-States>50K
831Private45781Masters14Never-marriedProf-specialtyNot-in-familyWhiteFemale14084050United-States>50K
942Private159449Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale5178040United-States>50K
\n", "
" ], "text/plain": [
 " Age WorkClass Fnlwgt Education Edu_Num \\\n",
 "0 39 State-gov 77516 Bachelors 13 \n",
 "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
 "2 38 Private 215646 HS-grad 9 \n",
 "3 53 Private 234721 11th 7 \n",
 "4 28 Private 338409 Bachelors 13 \n",
 "5 37 Private 284582 Masters 14 \n",
 "6 49 Private 160187 9th 5 \n",
 "7 52 Self-emp-not-inc 209642 HS-grad 9 \n",
 "8 31 Private 45781 Masters 14 \n",
 "9 42 Private 159449 Bachelors 13 \n",
 "\n",
 " MaritalStatus Occupation Relationship Race \\\n",
 "0 Never-married Adm-clerical Not-in-family White \n",
 "1 Married-civ-spouse Exec-managerial Husband White \n",
 "2 Divorced Handlers-cleaners Not-in-family White \n",
 "3 Married-civ-spouse Handlers-cleaners Husband Black \n",
 "4 Married-civ-spouse Prof-specialty Wife Black \n",
 "5 Married-civ-spouse Exec-managerial Wife White \n",
 "6 Married-spouse-absent Other-service Not-in-family Black \n",
 "7 Married-civ-spouse Exec-managerial Husband White \n",
 "8 Never-married Prof-specialty Not-in-family White \n",
 "9 Married-civ-spouse Exec-managerial Husband White \n",
 "\n",
 " Sex CapitalGain CapitalLoss HrPerWk Native Target \n",
 "0 Male 2174 0 40 United-States <=50K \n",
 "1 Male 0 0 13 United-States <=50K \n",
 "2 Male 0 0 40 United-States <=50K \n",
 "3 Male 0 0 40 United-States <=50K \n",
 "4 Female 0 0 40 Cuba <=50K \n",
 "5 Female 0 0 40 United-States <=50K \n",
 "6 Female 0 0 16 Jamaica <=50K \n",
 "7 Male 0 0 45 United-States >50K \n",
 "8 Female 14084 0 50 United-States >50K \n",
 "9 Male 5178 0 40 United-States >50K " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
 "import os\n",
 "import pandas as pd\n",
 "\n",
 "# Location of the adult.csv dataset. Set the ADULT_CSV environment variable to point\n",
 "# at your own copy; when it is not set, the original author's path is used as a fallback.\n",
 "DATA_PATH = os.environ.get('ADULT_CSV', '/Users/brendan.tierney/Dropbox/4-Datasets/adult.csv')\n",
 "\n",
 "#Load in the dataset\n",
 "# The file is read with header=None, so the column names are supplied explicitly.\n",
 "colnames=['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target'] \n",
 "df = pd.read_csv(DATA_PATH, names=colnames, header=None)\n",
"df.head(10)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().values.any()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows : 32561\n", "Columns : 15\n", "\n", "Features : \n", " ['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target']\n", "\n", "Missing values : 0\n", "\n", "Unique values : \n", " Age 73\n", "WorkClass 9\n", "Fnlwgt 21648\n", "Education 16\n", "Edu_Num 16\n", "MaritalStatus 7\n", "Occupation 15\n", "Relationship 6\n", "Race 5\n", "Sex 2\n", "CapitalGain 119\n", "CapitalLoss 92\n", "HrPerWk 94\n", "Native 42\n", "Target 2\n", "dtype: int64\n" ] } ], "source": [ "print (\"Rows : \" ,df.shape[0])\n", "print (\"Columns : \" ,df.shape[1])\n", "print (\"\\nFeatures : \\n\" ,df.columns.tolist())\n", "print (\"\\nMissing values : \", df.isnull().sum().values.sum())\n", "print (\"\\nUnique values : \\n\",df.nunique())\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 32561 entries, 0 to 32560\n", "Data columns (total 15 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Age 32561 non-null int64 \n", " 1 WorkClass 32561 non-null object\n", " 2 Fnlwgt 32561 non-null int64 \n", " 3 Education 32561 non-null object\n", " 4 Edu_Num 32561 non-null int64 \n", " 5 MaritalStatus 32561 non-null object\n", " 6 Occupation 32561 non-null object\n", " 7 Relationship 32561 non-null object\n", " 8 Race 32561 non-null object\n", " 9 Sex 32561 non-null object\n", " 10 CapitalGain 32561 non-null int64 \n", " 11 CapitalLoss 32561 non-null int64 \n", " 12 
HrPerWk 32561 non-null int64 \n", " 13 Native 32561 non-null object\n", " 14 Target 32561 non-null object\n", "dtypes: int64(6), object(9)\n", "memory usage: 3.7+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFnlwgtEdu_NumCapitalGainCapitalLossHrPerWk
count32561.0000003.256100e+0432561.00000032561.00000032561.00000032561.000000
mean38.5816471.897784e+0510.0806791077.64884487.30383040.437456
std13.6404331.055500e+052.5727207385.292085402.96021912.347429
min17.0000001.228500e+041.0000000.0000000.0000001.000000
25%28.0000001.178270e+059.0000000.0000000.00000040.000000
50%37.0000001.783560e+0510.0000000.0000000.00000040.000000
75%48.0000002.370510e+0512.0000000.0000000.00000045.000000
max90.0000001.484705e+0616.00000099999.0000004356.00000099.000000
\n", "
" ], "text/plain": [
 " Age Fnlwgt Edu_Num CapitalGain CapitalLoss \\\n",
 "count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000 \n",
 "mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830 \n",
 "std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219 \n",
 "min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 \n",
 "25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000 \n",
 "50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000 \n",
 "75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000 \n",
 "max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000 \n",
 "\n",
 " HrPerWk \n",
 "count 32561.000000 \n",
 "mean 40.437456 \n",
 "std 12.347429 \n",
 "min 1.000000 \n",
 "25% 40.000000 \n",
 "50% 40.000000 \n",
 "75% 45.000000 \n",
 "max 99.000000 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
 "# Numerical feature of summary/description \n",
 "df.describe()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "\n",
 "# Count how many '?' placeholder values each column of the dataset holds.\n",
 "# Surrounding whitespace is stripped before comparing: a field read as ' ?' (for example\n",
 "# from a CSV that uses ', ' as its separator) does not match an exact == '?' test, which\n",
 "# would then report 0 even though the placeholders are present.\n",
 "# Every column is cast to str so the same test can run on numeric columns too.\n",
 "df_missing = df.apply(lambda col: col.astype(str).str.strip().eq('?').sum())\n",
 "df_missing" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
0State-govBachelorsNever-marriedAdm-clericalNot-in-familyWhiteMaleUnited-States<=50K
1Self-emp-not-incBachelorsMarried-civ-spouseExec-managerialHusbandWhiteMaleUnited-States<=50K
2PrivateHS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMaleUnited-States<=50K
3Private11thMarried-civ-spouseHandlers-cleanersHusbandBlackMaleUnited-States<=50K
4PrivateBachelorsMarried-civ-spouseProf-specialtyWifeBlackFemaleCuba<=50K
\n", "
" ], "text/plain": [ " WorkClass Education MaritalStatus Occupation \\\n", "0 State-gov Bachelors Never-married Adm-clerical \n", "1 Self-emp-not-inc Bachelors Married-civ-spouse Exec-managerial \n", "2 Private HS-grad Divorced Handlers-cleaners \n", "3 Private 11th Married-civ-spouse Handlers-cleaners \n", "4 Private Bachelors Married-civ-spouse Prof-specialty \n", "\n", " Relationship Race Sex Native Target \n", "0 Not-in-family White Male United-States <=50K \n", "1 Husband White Male United-States <=50K \n", "2 Not-in-family White Male United-States <=50K \n", "3 Husband Black Male United-States <=50K \n", "4 Wife Black Female Cuba <=50K " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import preprocessing\n", "\n", "# encode categorical variables using label Encoder\n", "\n", "# select all categorical variables\n", "df_categorical = df.select_dtypes(include=['object'])\n", "df_categorical.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
07941141390
16924041390
241106141390
34126021390
44921052050
\n", "
" ], "text/plain": [ " WorkClass Education MaritalStatus Occupation Relationship Race Sex \\\n", "0 7 9 4 1 1 4 1 \n", "1 6 9 2 4 0 4 1 \n", "2 4 11 0 6 1 4 1 \n", "3 4 1 2 6 0 2 1 \n", "4 4 9 2 10 5 2 0 \n", "\n", " Native Target \n", "0 39 0 \n", "1 39 0 \n", "2 39 0 \n", "3 39 0 \n", "4 5 0 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# apply label encoder to df_categorical\n", "le = preprocessing.LabelEncoder()\n", "df_categorical = df_categorical.apply(le.fit_transform)\n", "df_categorical.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFnlwgtEdu_NumCapitalGainCapitalLossHrPerWkWorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
039775161321740407941141390
150833111300136924041390
2382156469004041106141390
353234721700404126021390
4283384091300404921052050
\n", "
" ], "text/plain": [ " Age Fnlwgt Edu_Num CapitalGain CapitalLoss HrPerWk WorkClass \\\n", "0 39 77516 13 2174 0 40 7 \n", "1 50 83311 13 0 0 13 6 \n", "2 38 215646 9 0 0 40 4 \n", "3 53 234721 7 0 0 40 4 \n", "4 28 338409 13 0 0 40 4 \n", "\n", " Education MaritalStatus Occupation Relationship Race Sex Native \\\n", "0 9 4 1 1 4 1 39 \n", "1 9 2 4 0 4 1 39 \n", "2 11 0 6 1 4 1 39 \n", "3 1 2 6 0 2 1 39 \n", "4 9 2 10 5 2 0 5 \n", "\n", " Target \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Next, Concatenate df_categorical dataframe with original df (dataframe)\n", "\n", "# first, Drop earlier duplicate columns which had categorical values\n", "df = df.drop(df_categorical.columns,axis=1)\n", "df = pd.concat([df,df_categorical],axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeFnlwgtEdu_NumCapitalGainCapitalLossHrPerWkWorkClassEducationMaritalStatusOccupationRelationshipRaceSexNativeTarget
Age1.000000-0.0766460.0365270.0776740.0577750.0687560.003787-0.010508-0.266288-0.020947-0.2636980.0287180.088832-0.0011510.234037
Fnlwgt-0.0766461.000000-0.0431950.000432-0.010252-0.018768-0.016656-0.0281450.0281530.0015970.008931-0.0212910.026858-0.051966-0.009463
Edu_Num0.036527-0.0431951.0000000.1226300.0799230.1481230.0520850.359153-0.0693040.109697-0.0941530.0318380.0122800.0508400.335154
CapitalGain0.0776740.0004320.1226301.000000-0.0316150.0784090.0338350.030046-0.0433930.025505-0.0579190.0111450.048480-0.0019820.223329
CapitalLoss0.057775-0.0102520.079923-0.0316151.0000000.0542560.0122160.016746-0.0341870.017987-0.0610620.0188990.0455670.0004190.150526
HrPerWk0.068756-0.0187680.1481230.0784090.0542561.0000000.1389620.055510-0.1905190.080383-0.2489740.0419100.229309-0.0026710.229689
WorkClass0.003787-0.0166560.0520850.0338350.0122160.1389621.0000000.023513-0.0647310.254892-0.0904610.0497420.095981-0.0076900.051604
Education-0.010508-0.0281450.3591530.0300460.0167460.0555100.0235131.000000-0.038407-0.021260-0.0108760.014131-0.0273560.0642880.079317
MaritalStatus-0.2662880.028153-0.069304-0.043393-0.034187-0.190519-0.064731-0.0384071.000000-0.0096540.185451-0.068013-0.129314-0.023819-0.199307
Occupation-0.0209470.0015970.1096970.0255050.0179870.0803830.254892-0.021260-0.0096541.000000-0.0756070.0067630.080296-0.0125430.075468
Relationship-0.2636980.008931-0.094153-0.057919-0.061062-0.248974-0.090461-0.0108760.185451-0.0756071.000000-0.116055-0.582454-0.005507-0.250918
Race0.028718-0.0212910.0318380.0111450.0188990.0419100.0497420.014131-0.0680130.006763-0.1160551.0000000.0872040.1378520.071846
Sex0.0888320.0268580.0122800.0484800.0455670.2293090.095981-0.027356-0.1293140.080296-0.5824540.0872041.000000-0.0081190.215980
Native-0.001151-0.0519660.050840-0.0019820.000419-0.002671-0.0076900.064288-0.023819-0.012543-0.0055070.137852-0.0081191.0000000.015840
Target0.234037-0.0094630.3351540.2233290.1505260.2296890.0516040.079317-0.1993070.075468-0.2509180.0718460.2159800.0158401.000000
\n", "
" ], "text/plain": [ " Age Fnlwgt Edu_Num CapitalGain CapitalLoss \\\n", "Age 1.000000 -0.076646 0.036527 0.077674 0.057775 \n", "Fnlwgt -0.076646 1.000000 -0.043195 0.000432 -0.010252 \n", "Edu_Num 0.036527 -0.043195 1.000000 0.122630 0.079923 \n", "CapitalGain 0.077674 0.000432 0.122630 1.000000 -0.031615 \n", "CapitalLoss 0.057775 -0.010252 0.079923 -0.031615 1.000000 \n", "HrPerWk 0.068756 -0.018768 0.148123 0.078409 0.054256 \n", "WorkClass 0.003787 -0.016656 0.052085 0.033835 0.012216 \n", "Education -0.010508 -0.028145 0.359153 0.030046 0.016746 \n", "MaritalStatus -0.266288 0.028153 -0.069304 -0.043393 -0.034187 \n", "Occupation -0.020947 0.001597 0.109697 0.025505 0.017987 \n", "Relationship -0.263698 0.008931 -0.094153 -0.057919 -0.061062 \n", "Race 0.028718 -0.021291 0.031838 0.011145 0.018899 \n", "Sex 0.088832 0.026858 0.012280 0.048480 0.045567 \n", "Native -0.001151 -0.051966 0.050840 -0.001982 0.000419 \n", "Target 0.234037 -0.009463 0.335154 0.223329 0.150526 \n", "\n", " HrPerWk WorkClass Education MaritalStatus Occupation \\\n", "Age 0.068756 0.003787 -0.010508 -0.266288 -0.020947 \n", "Fnlwgt -0.018768 -0.016656 -0.028145 0.028153 0.001597 \n", "Edu_Num 0.148123 0.052085 0.359153 -0.069304 0.109697 \n", "CapitalGain 0.078409 0.033835 0.030046 -0.043393 0.025505 \n", "CapitalLoss 0.054256 0.012216 0.016746 -0.034187 0.017987 \n", "HrPerWk 1.000000 0.138962 0.055510 -0.190519 0.080383 \n", "WorkClass 0.138962 1.000000 0.023513 -0.064731 0.254892 \n", "Education 0.055510 0.023513 1.000000 -0.038407 -0.021260 \n", "MaritalStatus -0.190519 -0.064731 -0.038407 1.000000 -0.009654 \n", "Occupation 0.080383 0.254892 -0.021260 -0.009654 1.000000 \n", "Relationship -0.248974 -0.090461 -0.010876 0.185451 -0.075607 \n", "Race 0.041910 0.049742 0.014131 -0.068013 0.006763 \n", "Sex 0.229309 0.095981 -0.027356 -0.129314 0.080296 \n", "Native -0.002671 -0.007690 0.064288 -0.023819 -0.012543 \n", "Target 0.229689 0.051604 0.079317 -0.199307 0.075468 \n", "\n", " 
Relationship Race Sex Native Target \n", "Age -0.263698 0.028718 0.088832 -0.001151 0.234037 \n", "Fnlwgt 0.008931 -0.021291 0.026858 -0.051966 -0.009463 \n", "Edu_Num -0.094153 0.031838 0.012280 0.050840 0.335154 \n", "CapitalGain -0.057919 0.011145 0.048480 -0.001982 0.223329 \n", "CapitalLoss -0.061062 0.018899 0.045567 0.000419 0.150526 \n", "HrPerWk -0.248974 0.041910 0.229309 -0.002671 0.229689 \n", "WorkClass -0.090461 0.049742 0.095981 -0.007690 0.051604 \n", "Education -0.010876 0.014131 -0.027356 0.064288 0.079317 \n", "MaritalStatus 0.185451 -0.068013 -0.129314 -0.023819 -0.199307 \n", "Occupation -0.075607 0.006763 0.080296 -0.012543 0.075468 \n", "Relationship 1.000000 -0.116055 -0.582454 -0.005507 -0.250918 \n", "Race -0.116055 1.000000 0.087204 0.137852 0.071846 \n", "Sex -0.582454 0.087204 1.000000 -0.008119 0.215980 \n", "Native -0.005507 0.137852 -0.008119 1.000000 0.015840 \n", "Target -0.250918 0.071846 0.215980 0.015840 1.000000 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corr_matrix=df.corr()\n", "corr_matrix" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sn\n", "import matplotlib.pyplot as plt\n", "\n", "fig = plt.subplots(figsize=(17,14))\n", "sn.heatmap(corr_matrix, annot=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 24720\n", "1 7841\n", "Name: Target, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['Target'].value_counts()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Putting independent variables/features to X\n", "X = df.drop('Target',axis=1)\n", "\n", "# Putting response/dependent variable/feature to y\n", "y = df['Target']\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Splitting the data into train and test\n", "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=99)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "################\n", "# KNN Model\n", "################" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "KNeighborsClassifier(n_neighbors=3)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Importing library\n", "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "#set k=3\n", "knn_model = KNeighborsClassifier(n_neighbors=3)\n", "#fit the model\n", "knn_model.fit(X_train,y_train)\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.82 0.88 0.85 7436\n", " 1 0.50 0.38 0.43 2333\n", "\n", " accuracy 0.76 9769\n", " macro avg 0.66 0.63 0.64 9769\n", 
"weighted avg 0.74 0.76 0.75 9769\n", "\n" ] } ],
 "source": [
 "# Importing classification report and confusion matrix from sklearn metrics\n",
 "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n",
 "\n",
 "# making predictions\n",
 "y_pred_default = knn_model.predict(X_test)\n",
 "\n",
 "# Printing classifier report after prediction\n",
 "print(classification_report(y_test,y_pred_default))" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[6529 907]\n", " [1438 895]]\n", "0.759954959565974\n" ] } ],
 "source": [
 "print(confusion_matrix(y_test,y_pred_default))\n",
 "print(accuracy_score(y_test,y_pred_default))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "# Actual vs predicted counts for the KNN model, with row/column totals.\n",
 "# y_test keeps the shuffled row labels produced by train_test_split, while the predictions\n",
 "# are a plain array. Wrapping both in pd.Series makes crosstab align them on their index\n",
 "# labels and keep only the labels they share, so most test rows are dropped and the pairs\n",
 "# that remain are mismatched. Passing position-aligned arrays keeps every test row.\n",
 "print(pd.crosstab(y_test.to_numpy(), y_pred_default, rownames=['Actual'], colnames=['Predicted'], margins=True))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "#Exercise: Change the value of K to see what impact it has on the model accuracy -> try a few values" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "################\n",
 "# SVM Model\n",
 "################" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [],
 "source": [
 "#import SVM model\n",
 "from sklearn import svm" ] },
 { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SVC()" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
 "#use SVC=support vector for classification\n",
 "svm_model = svm.SVC()\n",
 "#fit the model\n",
 "svm_model.fit(X_train,y_train)" ] },
 { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.79 1.00 0.88 7436\n", " 1 0.98 0.15 0.26 2333\n", "\n", " accuracy 0.80 9769\n", " macro avg 0.88 0.57 0.57 9769\n", "weighted avg 0.83 0.80 0.73 9769\n", "\n" ] } ],
 "source": [
 "# Importing classification report and confusion matrix from sklearn metrics\n",
 "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n",
 "\n",
 "# making predictions\n",
 "y_pred_default = svm_model.predict(X_test)\n",
 "\n",
 "# Printing classifier report after prediction\n",
 "print(classification_report(y_test,y_pred_default))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "# Actual vs predicted counts for the SVM model, with row/column totals.\n",
 "# Position-aligned arrays are passed (not pd.Series objects) so that crosstab does not\n",
 "# align y_test's shuffled row labels against a fresh 0..n-1 index and drop most test rows.\n",
 "print(pd.crosstab(y_test.to_numpy(), y_pred_default, rownames=['Actual'], colnames=['Predicted'], margins=True))" ] },
 { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7959873067867745\n" ] } ],
 "source": [
 "print(accuracy_score(y_test,y_pred_default))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "##########################\n",
 "# Neural Network Model\n",
 "##########################" ] },
 { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [],
 "source": [
 "#import the neural network module (aliased as nn)\n",
 "from sklearn import neural_network as nn" ] },
 { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MLPClassifier()" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
 "#use MLPClassifier = multi-layer perceptron for classification - trained with back propagation\n",
 "# defaults used here: one hidden layer of 100 units (hidden_layer_sizes=(100,)) and max_iter=200\n",
 "# NOTE: no random_state is passed, so the fitted weights (and the scores below) can differ between runs\n",
 "nn_model = nn.MLPClassifier()\n",
 "#fit the model\n",
 "nn_model.fit(X_train,y_train)" ] },
 { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.85 0.79 0.82 7436\n", " 1 0.46 0.57 0.51 2333\n", "\n", " accuracy 0.73 9769\n", " macro avg 0.65 0.68 0.66 9769\n", "weighted avg 0.76 0.73 0.74 9769\n", "\n" ] } ],
 "source": [
 "# Importing classification report and confusion matrix from sklearn metrics\n",
 "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n",
 "\n",
 "# making predictions\n",
 "y_pred_default = nn_model.predict(X_test)\n",
 "\n",
 "# Printing classifier report after prediction\n",
 "print(classification_report(y_test,y_pred_default))" ] },
 { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.7348756269833145\n" ] } ],
 "source": [
 "print(accuracy_score(y_test,y_pred_default))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ],
 "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }