### TU257-Lab4-Demo2-Decision-Trees and more

This notebook contains two more examples of Classification. This time it is using Decision Trees.

There are some exercises later in this notebook. These ask you to develop RandomForest and
XGBoost models. These are created in a similar manner to a Decision Tree model.

The final exercise is to compare the results from all the models and decide which one you would recommend using.
This recommendation should be based on the data and the results generated by testing the models.


In [None]:
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree


In [None]:
# Load the Iris dataset that ships with scikit-learn. Later cells read
# iris.data, iris.target, iris.feature_names and iris.target_names from it.
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
# Display the loaded dataset object to inspect what it contains.
iris

In [None]:
# Feature matrix and class labels for the iris demo.
X, y = iris.data, iris.target

In [None]:
# Train a decision tree on the full iris data. Only the seed is fixed;
# every other hyper-parameter keeps its library default.
clf = DecisionTreeClassifier(random_state=1234)
clf.fit(X, y)
# fit() returns the estimator itself, so `model` is the same object as `clf`.
model = clf


In [None]:
# Render the fitted iris tree as indented text rules and print them.
text_representation = tree.export_text(decision_tree=clf)
print(text_representation)


In [None]:
# Draw the fitted iris tree, labelling nodes with the dataset's own feature
# and class names. `fig` is kept so that the next cell can save the figure.
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(clf,
                   feature_names=iris.feature_names,
                   # target_names is a numpy array; a plain list is passed
                   # because not every scikit-learn release accepts an array here.
                   class_names=list(iris.target_names),
                   filled=True)

In [None]:
# Save the tree figure held in `fig` to the current working directory.
# (File name corrected from the misspelled "decistion_tree.png".)
fig.savefig("decision_tree.png")

In [None]:
# Column labels to assign when reading adult.csv, listed in file order.
colnames = [
    'Age', 'WorkClass', 'Fnlwgt', 'Education', 'Edu_Num',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Sex',
    'CapitalGain', 'CapitalLoss', 'HrPerWk', 'Native', 'Target',
]

In [None]:
import os
from pathlib import Path

import pandas as pd

# Location of the adult dataset. Set the ADULT_CSV_PATH environment variable
# to point at your own copy; when it is not set, the original author's local
# path is used so existing setups keep working.
ADULT_CSV_PATH = Path(os.environ.get(
    "ADULT_CSV_PATH",
    'C:\\Users\\Rafael\\Documents\\DataScience\\Data Analitics\\week5\\adult.csv',
))

# Load the dataset. The file is read without a header row, so the column
# labels come from `colnames`.
df = pd.read_csv(ADULT_CSV_PATH, names=colnames, header=None)
df.head(10)

In [None]:
# True if at least one cell anywhere in df is null (NaN/None), else False.
df.isnull().values.any()

In [None]:
# Structural overview of df: dimensions, column names, total null count and
# the number of distinct values per column.
n_rows, n_cols = df.shape
total_missing = df.isnull().sum().sum()

print("Rows : ", n_rows)
print("Columns : ", n_cols)
print("\nFeatures : \n", df.columns.tolist())
print("\nMissing values : ", total_missing)
print("\nUnique values : \n", df.nunique())


In [None]:
# Print pandas' summary of df (columns, non-null counts, dtypes).
df.info()

In [None]:
# Summary statistics of df; with default arguments, describe() reports on the
# numerical columns.
df.describe()

In [None]:

# Count the "?" placeholder values per column across the whole dataset.
# Each value is compared after converting to text and stripping surrounding
# whitespace, so entries such as " ?" (a space after the CSV comma) are
# counted too; an exact == '?' comparison would silently report 0 for them.
df_missing = df.apply(lambda column: column.astype(str).str.strip().eq('?')).sum()
df_missing

In [None]:
from sklearn import preprocessing

# The categorical variables will be label-encoded in the next step, so first
# pull the object-dtype columns out into their own frame.
df_categorical = df.select_dtypes(include='object')
df_categorical.head()

In [None]:
# Label-encode every column of df_categorical.
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders` (keyed by column name), so the integer codes can be mapped
# back to the original labels later via label_encoders[col].classes_ or
# label_encoders[col].inverse_transform(...). Re-using a single encoder would
# keep only the mapping of the last column fitted.
label_encoders = {}
encoded_columns = {}
le = preprocessing.LabelEncoder()  # stays defined even if there are no columns
for column_name in df_categorical.columns:
    le = preprocessing.LabelEncoder()
    encoded_columns[column_name] = le.fit_transform(df_categorical[column_name])
    label_encoders[column_name] = le
# After the loop `le` is the encoder fitted on the last column, as before.
df_categorical = pd.DataFrame(encoded_columns, index=df_categorical.index)
df_categorical.head()

In [None]:
# Swap the original categorical columns for their encoded versions: remove
# them from df, then append df_categorical column-wise. The encoded columns
# therefore end up after the remaining columns.
df = pd.concat([df.drop(columns=df_categorical.columns), df_categorical], axis=1)
df.head()

In [None]:
# Pairwise correlation between the columns of df (pandas default method),
# displayed as a table and reused for the heatmap in the next cell.
corr_matrix=df.corr()
corr_matrix

In [None]:
import seaborn as sn

# plt.subplots returns a (figure, axes) pair; unpack it so that `fig` really is
# the Figure, and draw the annotated correlation heatmap on that axes explicitly.
fig, ax = plt.subplots(figsize=(17,14))
sn.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Number of rows per value of the Target column.
df['Target'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Response/dependent variable: the Target column
y = df['Target']

# Independent variables/features: every column except Target
X = df.drop(columns='Target')


In [None]:
# Hold out 30% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=99,
)


In [None]:
# Importing decision tree classifier from sklearn library
from sklearn.tree import DecisionTreeClassifier

# Fit the decision tree with default hyperparameters, apart from:
#  - max_depth=5, so that the tree is small enough to plot and read;
#  - random_state, fixed (same seed as the iris tree earlier in the notebook)
#    so that re-running the notebook reproduces the same tree.
dt_default = DecisionTreeClassifier(max_depth=5, random_state=1234)
dt_default.fit(X_train,y_train)


In [None]:
# Text rendering of the depth-limited tree's decision rules.
text_representation = tree.export_text(decision_tree=dt_default)
print(text_representation)

In [None]:
# Class labels shown in the tree nodes (Target was label-encoded to integers).
target_names=['0','1']
# Take the feature labels from the training data itself. The encoding step
# moved the categorical columns to the end of the frame, so a hand-written
# list in the original CSV order would attach the wrong name to each split.
feature_names=list(X.columns)
fig = plt.figure(figsize=(20,18))
_ = tree.plot_tree(dt_default, filled=True, proportion=True, fontsize=10, feature_names=feature_names, class_names=target_names)

In [None]:
# Same tree as before, drawn only down to depth 2 for a readable top-level view.
fig = plt.figure(figsize=(20,18))
_ = tree.plot_tree(
    dt_default,
    max_depth=2,
    feature_names=feature_names,
    class_names=target_names,
    filled=True,
    proportion=True,
    fontsize=10,
)

In [None]:
# Metric helpers used in this cell and in the evaluation cells that follow
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Score the held-out rows with the depth-limited decision tree
y_pred_default = dt_default.predict(X_test)

# Build the text report from the test labels and predictions, then print it
report_default = classification_report(y_test, y_pred_default)
print(report_default)

In [None]:
#Exercise: See the previous example for creating a confusion matrix. Create a confusion matrix for the results

In [None]:
# Print the confusion matrix, then the accuracy, of the decision tree
# predictions against the test labels.
for metric in (confusion_matrix, accuracy_score):
    print(metric(y_test, y_pred_default))

In [None]:
# Actual vs predicted label counts, with row/column totals.
# y_test keeps the shuffled row labels from the train/test split, whereas the
# predictions are a plain array. crosstab pairs Series by index label, so the
# predictions are given y_test's index to pair each one with its own actual
# value. A fresh 0..n-1 index would mismatch the pairs and drop rows.
print(pd.crosstab(y_test, pd.Series(y_pred_default, index=y_test.index), rownames=['Actual'], colnames=['Predicted'], margins=True))

In [None]:
#Exercise: Create a model using X_train, y_train, X_test, y_test, for the following algorithms
# - RandomForest
# - XGBoost
#
# See shell code below, modify, and complete the code for each algorithm


In [None]:
#Shell code for RandomForest
from sklearn.ensemble import RandomForestClassifier

# Random forest with the same depth limit as the decision tree. The forest is
# built using randomness, so the seed is fixed to make the fitted model (and
# the metrics compared in the final exercise) reproducible across runs.
rf_default = RandomForestClassifier(max_depth=5, random_state=1234)
rf_default.fit(X_train,y_train)


In [None]:
# Predict the test-set labels with the fitted random forest; kept under its
# own name so it can be compared with the other models' predictions.
y_pred_rf_default = rf_default.predict(X_test)


In [None]:
#Shell code for XGBoost

In [None]:
#Import library -> you might need to install this -> pip3 install xgboost
from xgboost import XGBClassifier

# XGBoost classifier with n_estimators=100; all other settings are left at
# the library defaults.
xgb_default=XGBClassifier(n_estimators=100)

# Train on the same training split as the other models.
xgb_default.fit(X_train, y_train)


In [None]:
# Predict the test-set labels with the fitted XGBoost model. The result gets a
# model-specific name, matching y_pred_default and y_pred_rf_default, so the
# three sets of predictions can be compared side by side.
y_pred_xgb_default = xgb_default.predict(X_test)
# Alias kept so that code referring to the earlier generic name still works.
y_pred = y_pred_xgb_default

In [None]:
#Exercise: Compare the results from each algorithm for this dataset (Naive Bayes, Decision Tree, RandomForest, XGBoost)
#
#Exercise: Which algorithm gives the best outcome? 
# - Which algorithm would you recommend using?