Commit
·
f6bc88b
1
Parent(s):
596d10d
Create titatanic_basic
Browse files- titatanic_basic +121 -0
titatanic_basic
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.metrics import accuracy_score, recall_score, precision_score
|
2 |
+
from sklearn.model_selection import train_test_split
|
3 |
+
from sklearn.ensemble import RandomForestClassifier
|
4 |
+
from sklearn.impute import SimpleImputer
|
5 |
+
import pandas as pd_df
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
# Tolerance, in percent, for missing values in a categorical column.
# clean_input_data compares each column's null percentage against this value:
# below it, the rows holding nulls are dropped; otherwise the column is
# reported for manual review.
v_error_tol = 1
|
9 |
+
|
10 |
+
|
11 |
+
# Read a CSV file into a DataFrame
|
12 |
+
def load_analysis_data(v_in_file):
    """Load a CSV source into a pandas DataFrame.

    Args:
        v_in_file: Anything ``pandas.read_csv`` accepts as its first
            argument; it is forwarded unchanged.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    loaded_frame = pd_df.read_csv(v_in_file)
    return loaded_frame
|
14 |
+
|
15 |
+
|
16 |
+
def _null_stats(v_column):
    """Return ``(null_count, null_percentage)`` for a pandas Series.

    The percentage is the number of nulls relative to the number of
    non-null entries, times 100. A column with no nulls reports 0.0 and a
    column with only nulls reports infinity, so no division by zero occurs.
    """
    null_count = int(v_column.isnull().sum())
    if null_count == 0:
        return 0, 0.0
    valid_count = int(v_column.notnull().sum())
    if valid_count == 0:
        return null_count, float('inf')
    return null_count, (null_count / valid_count) * 100


def clean_input_data(v_in_df_data, v_data_type):
    """Encode the categorical columns and impute the remaining gaps.

    Steps:
      1. For ``v_data_type == 'Testset'`` a placeholder ``Survived`` column
         filled with 0 is added, so both data sets share one layout.
      2. ``Sex`` and ``Embarked`` are checked for nulls. If the share of
         nulls is below ``v_error_tol`` percent, the affected rows are
         dropped; otherwise the data set is rejected.
      3. ``Sex`` becomes ``gender`` (1 for ``'male'``, 0 otherwise) and
         ``Embarked`` becomes ``Embarked_val`` (S=1, C=2, Q=3, other=0).
      4. The numeric feature columns are selected and every remaining
         missing value is replaced with its column median.

    Args:
        v_in_df_data: Raw DataFrame; it must contain ``Sex`` and
            ``Embarked``. The caller's frame is not modified.
        v_data_type: ``'Testset'`` triggers step 1; any other value skips it.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the columns
        ``PassengerId, Survived, Pclass, gender, Age, SibSp, Parch, Fare,
        Embarked_val``, built from the imputer's output array.

    Raises:
        ValueError: If ``Sex`` or ``Embarked`` has nulls at or above the
            ``v_error_tol`` percentage. Continuing would leave the encoded
            column entirely empty, which the median imputer discards, so
            the frame could not be rebuilt with the expected columns.
    """
    clean_columns = ['PassengerId', 'Survived', 'Pclass', 'gender',
                     'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_val']

    # Work on a private copy so the caller's frame is never mutated and
    # later column assignments cannot hit a view of another frame.
    v_df = v_in_df_data.copy()

    if v_data_type == 'Testset':
        # Placeholder target so the test set has the same columns as train.
        v_df['Survived'] = 0

    # Validate both columns before dropping anything, so each percentage is
    # measured against the data as it was received.
    for v_col in ('Sex', 'Embarked'):
        null_count, null_pct = _null_stats(v_df[v_col])
        if null_count and null_pct >= v_error_tol:
            raise ValueError('Please review Data set for column ' + v_col)

    # Both columns passed the tolerance check: discard the few null rows.
    v_df = v_df.dropna(subset=['Sex', 'Embarked']).copy()

    # Transform categorical values to numerical ones.
    v_df['gender'] = (v_df['Sex'] == 'male').astype(int)
    v_df['Embarked_val'] = (
        v_df['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).fillna(0).astype(int)
    )

    v_in_df_data_clean = v_df.reindex(columns=clean_columns)

    # Fill the median value into the remaining NA cells (e.g. Age, Fare).
    median_impute = SimpleImputer(strategy="median")
    X_val = median_impute.fit_transform(v_in_df_data_clean)
    # The imputer returns a plain array, so re-attach the column labels.
    return pd_df.DataFrame(X_val, columns=v_in_df_data_clean.columns)
|
64 |
+
|
65 |
+
|
66 |
+
def split_data(in_df_clean_base, in_file_process_flag):
    """Produce feature/target sets for training or for final scoring.

    Args:
        in_df_clean_base: Cleaned DataFrame containing the feature columns
            plus ``Survived``.
        in_file_process_flag: ``'Trainset'`` performs an 80/20 split
            stratified on ``Pclass``; any other value treats the whole
            frame as data to score.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

        * ``'Trainset'``: ``X_train``/``X_test`` are DataFrames restricted
          to the feature columns; ``y_train``/``y_test`` are the matching
          ``Survived`` values.
        * otherwise: ``X_train`` and ``y_train`` are the placeholder
          ``[0]``, ``X_test`` holds the feature columns of the whole frame
          and ``y_test`` is a DataFrame of ``PassengerId`` and ``Survived``.
    """
    feature_names = ['PassengerId', 'Pclass', 'gender', 'Age', 'Fare', 'Embarked_val']

    if in_file_process_flag != 'Trainset':
        # Nothing to train on: return placeholders for the training slots.
        scoring_features = pd_df.DataFrame(in_df_clean_base, columns=feature_names)
        scoring_targets = pd_df.DataFrame(in_df_clean_base,
                                          columns=['PassengerId', 'Survived'])
        return [0], scoring_features, [0], scoring_targets

    train_rows, test_rows, y_train, y_test = train_test_split(
        in_df_clean_base,
        in_df_clean_base['Survived'],
        test_size=0.2,
        stratify=in_df_clean_base['Pclass'],
    )
    # The split carries every column; keep only the model's features.
    X_train = pd_df.DataFrame(train_rows, columns=feature_names)
    X_test = pd_df.DataFrame(test_rows, columns=feature_names)
    return X_train, X_test, y_train, y_test
|
80 |
+
|
81 |
+
|
82 |
+
def build_model(in_x_train, in_y_train, in_X_test, in_n_estimators, in_max_leaf_nodes):
    """Fit a random forest classifier and predict on a test set.

    Args:
        in_x_train: Training features.
        in_y_train: Training target values.
        in_X_test: Features to predict once the model is fitted.
        in_n_estimators: Passed to the forest as ``n_estimators``.
        in_max_leaf_nodes: Passed to the forest as ``max_leaf_nodes``.

    Returns:
        ``(model, predictions)``: the fitted ``RandomForestClassifier`` and
        its predictions for ``in_X_test``.
    """
    forest = RandomForestClassifier(
        n_estimators=in_n_estimators,
        max_leaf_nodes=in_max_leaf_nodes,
        n_jobs=-1,
    )
    forest.fit(in_x_train, in_y_train)
    predictions = forest.predict(in_X_test)
    return forest, predictions
|
88 |
+
|
89 |
+
|
90 |
+
def model_metrics(in_y_test, in_y_final_rf):
    """Print accuracy, recall and precision for a set of predictions.

    Args:
        in_y_test: True target values.
        in_y_final_rf: Predicted target values.

    Each metric is printed on its own line as ``<metric name>-><value>``;
    nothing is returned.
    """
    labelled_scorers = (
        ('accuracy_score->', accuracy_score),
        ('recall_score->', recall_score),
        ('precision_score->', precision_score),
    )
    for label, scorer in labelled_scorers:
        print(label + str(scorer(in_y_test, in_y_final_rf)))
|
95 |
+
|
96 |
+
|
97 |
+
######## Train: read the training file, clean it, split it and fit the model
V_file = 'train - Titanic.csv'
V_file_process_flag = 'Trainset'
V_n_estimators = 500
V_max_leaf_nodes = 16
titanic_base = load_analysis_data(V_file)
titanic_clean_base = clean_input_data(titanic_base, V_file_process_flag)
V_X_train, V_X_test, V_y_train, V_y_test = split_data(titanic_clean_base, V_file_process_flag)
# Build the model and report its metrics on the held-out 20% split.
rnd_clf_model, y_final_test_rf = build_model(V_X_train, V_y_train, V_X_test, V_n_estimators, V_max_leaf_nodes)
model_metrics(V_y_test, y_final_test_rf)

######## Score: run the trained model over the unlabeled test file
V_file = 'test - Titanic.csv'
V_file_process_flag = 'Testset'
titanic_base = load_analysis_data(V_file)
titanic_clean_base = clean_input_data(titanic_base, V_file_process_flag)
# For 'Testset' only V_X_test is meaningful; the training slots are placeholders.
V_X_train, V_X_test, V_y_train, V_y_test = split_data(titanic_clean_base, V_file_process_flag)
y_final_value = rnd_clf_model.predict(V_X_test)

# The cleaned frame comes out of the median imputer as floats, so the ids and
# the predicted labels would be written as e.g. "892.0,0.0". Cast both back to
# integers so the submission file contains plain ids and 0/1 labels.
submit = pd_df.DataFrame({
    "PassengerId": V_X_test.PassengerId.astype(int),
    'Survived': np.asarray(y_final_value).astype(int),
})
submit.to_csv("Titanic_final_submission.csv", index=False)
|