{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TU257-Lab4-Demo2-Decion-Trees  and more\n",
    "\n",
    "This notebook contains two more examples of Classification. This time it is using Decision Trees.\n",
    "\n",
    "There are some Exercises later in this notebook. These ask you to develop a RandomForest and \n",
    "XGBoost models.  These are done in a similar manner to creating a Decision Tree model.\n",
    "\n",
    "Final exercise is to compare the results from all the models to see which one would you recommend using.\n",
    "This recommendation should be based on the data and the results generated by testing the model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "from sklearn import datasets\n",
    "from sklearn.tree import DecisionTreeClassifier \n",
    "from sklearn import tree\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the iris dataset\n",
    "from sklearn.datasets import load_iris\n",
    "iris = load_iris()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iris"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = iris.data\n",
    "y = iris.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the classifier with default hyper-parameters\n",
    "clf = DecisionTreeClassifier(random_state=1234)\n",
    "model = clf.fit(X, y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_representation = tree.export_text(clf)\n",
    "print(text_representation)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_name=['0','1']\n",
    "fig = plt.figure(figsize=(25,20))\n",
    "_ = tree.plot_tree(clf, \n",
    "                   feature_names=iris.feature_names,  \n",
    "                   class_names=iris.target_names,\n",
    "                   filled=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig.savefig(\"decistion_tree.png\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "colnames=['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target'] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "#Load in the dataset\n",
    "df = pd.read_csv('C:\\\\Users\\\\Rafael\\\\Documents\\\\DataScience\\\\Data Analitics\\\\week5\\\\adult.csv', names=colnames, header=None)\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.isnull().values.any()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print (\"Rows     : \" ,df.shape[0])\n",
    "print (\"Columns  : \" ,df.shape[1])\n",
    "print (\"\\nFeatures : \\n\" ,df.columns.tolist())\n",
    "print (\"\\nMissing values :  \", df.isnull().sum().values.sum())\n",
    "print (\"\\nUnique values :  \\n\",df.nunique())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numerical feature of summary/description \n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# checking \"?\" values, how many are there in the whole dataset\n",
    "df_missing = (df=='?').sum()\n",
    "df_missing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "# encode categorical variables using label Encoder\n",
    "\n",
    "# select all categorical variables\n",
    "df_categorical = df.select_dtypes(include=['object'])\n",
    "df_categorical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# apply label encoder to df_categorical\n",
    "le = preprocessing.LabelEncoder()\n",
    "df_categorical = df_categorical.apply(le.fit_transform)\n",
    "df_categorical.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Next, Concatenate df_categorical dataframe with original df (dataframe)\n",
    "\n",
    "# first, Drop earlier duplicate columns which had categorical values\n",
    "df = df.drop(df_categorical.columns,axis=1)\n",
    "df = pd.concat([df,df_categorical],axis=1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "corr_matrix=df.corr()\n",
    "corr_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "#import matplotlib.pyplot as plt\n",
    "\n",
    "fig = plt.subplots(figsize=(17,14))\n",
    "sn.heatmap(corr_matrix, annot=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['Target'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Putting independent variables/features to X\n",
    "X = df.drop('Target',axis=1)\n",
    "\n",
    "# Putting response/dependent variable/feature to y\n",
    "y = df['Target']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splitting the data into train and test\n",
    "X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=99)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importing decision tree classifier from sklearn library\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# Fitting the decision tree with default hyperparameters, apart from\n",
    "# max_depth which is 5 so that we can plot and read the tree.\n",
    "dt_default = DecisionTreeClassifier(max_depth=5)\n",
    "dt_default.fit(X_train,y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_representation = tree.export_text(dt_default)\n",
    "print(text_representation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_names=['0','1']\n",
    "feature_names=['Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num', 'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native']\n",
    "fig = plt.figure(figsize=(20,18))\n",
    "_ = tree.plot_tree(dt_default, filled=True, proportion=True, fontsize=10, feature_names=feature_names, class_names=target_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.figure(figsize=(20,18))\n",
    "_ = tree.plot_tree(dt_default, filled=True, proportion=True, fontsize=10, max_depth=2, feature_names=feature_names, class_names=target_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Importing classification report and confusion matrix from sklearn metrics\n",
    "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n",
    "\n",
    "# making predictions\n",
    "y_pred_default = dt_default.predict(X_test)\n",
    "\n",
    "# Printing classifier report after prediction\n",
    "print(classification_report(y_test,y_pred_default))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Exercise: See previous example for creating a Confusion matrix.  Create a confusion matrix for th results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(confusion_matrix(y_test,y_pred_default))\n",
    "print(accuracy_score(y_test,y_pred_default))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pd.crosstab(pd.Series(y_test), pd.Series(y_pred_default), rownames=['Actual'], colnames=['Predicted'], margins=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Exercise: Create a model using X_train, y_train, X_test, y_test, for the following algorithms\n",
    "#            - RandomForest\n",
    "#            - XGBoost\n",
    "#\n",
    "# See shell code below, modify, and complete the code for each algorithm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Shell code for RandomForest\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_default = RandomForestClassifier(max_depth=5)\n",
    "rf_default.fit(X_train,y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# making predictions\n",
    "y_pred_rf_default = rf_default.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Shell code for XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import library -> you might need to install this ->  pip3 install xgboost\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "xgb_default=XGBClassifier(n_estimators=100)\n",
    "\n",
    "xgb_default.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = xgb_default.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Exercise: Compare the results from each algorithm for this dataset (Naive Bayes, Decision Tree, RandomForest, XGBoost)\n",
    "#\n",
    "#Exercise: Which algorithm gives the best outcome?  \n",
    "#             - Which algorithm would you recommend using?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}