vijaykumar0704 committed on
Commit
f6bc88b
·
1 Parent(s): 596d10d

Create titatanic_basic

Browse files
Files changed (1) hide show
  1. titatanic_basic +121 -0
titatanic_basic ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics import accuracy_score, recall_score, precision_score
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.impute import SimpleImputer
5
+ import pandas as pd_df
6
+ import numpy as np
7
+
8
+ v_error_tol = 1
9
+
10
+
11
+ # defining reading CSV file
12
def load_analysis_data(v_in_file):
    """Load the CSV file at *v_in_file* and return it as a DataFrame."""
    df_loaded = pd_df.read_csv(v_in_file)
    return df_loaded
14
+
15
+
16
def clean_input_data(v_in_df_data, v_data_type):
    """Clean a raw Titanic DataFrame and return only the modelling columns.

    Encodes 'Sex' into a numeric 'gender' column (1=male, 0=female) and
    'Embarked' into 'Embarked_val' (S=1, C=2, Q=3, anything else 0), drops
    rows with nulls in those columns when the null percentage is within
    ``v_error_tol``, then median-imputes any remaining NaNs (e.g. 'Age').

    Parameters
    ----------
    v_in_df_data : pandas.DataFrame
        Raw Titanic data; must contain 'Sex' and 'Embarked' columns.
    v_data_type : str
        'Testset' adds a placeholder all-zero 'Survived' column so the test
        frame has the same column set as the train frame; any other value
        assumes 'Survived' already exists.

    Returns
    -------
    pandas.DataFrame
        Columns: PassengerId, Survived, Pclass, gender, Age, SibSp, Parch,
        Fare, Embarked_val — all numeric after imputation.
    """
    if v_data_type == 'Testset':
        # Placeholder target for the test file. NOTE(review): the original
        # expression `0 if x == 'male' else 0` always evaluated to 0 (dead
        # conditional); made the constant explicit — behavior unchanged.
        v_in_df_data['Survived'] = 0

    # --- Column 'Sex': null-count analysis ---
    v_null_count_sex = v_in_df_data['Sex'].isnull().sum()
    # value_counts excludes NaN rows, so this total is the non-null count.
    v_total_count_sex = sum(v_in_df_data.value_counts(v_in_df_data['Sex']))
    v_null_pct_sex = (v_null_count_sex / v_total_count_sex) * 100

    # --- Column 'Embarked': null-count analysis ---
    v_null_count_embarked = v_in_df_data['Embarked'].isnull().sum()
    v_total_count_embarked = sum(v_in_df_data.value_counts(v_in_df_data['Embarked']))
    v_null_pct_embarked = (v_null_count_embarked / v_total_count_embarked) * 100

    # Transform categorical 'Sex' to numeric 'gender' when nulls are tolerable.
    if v_null_count_sex == 0 or v_null_pct_sex < v_error_tol:
        # Inner check is implied by the outer one whenever v_error_tol > 0
        # (null count 0 makes the percentage 0); kept for safety.
        if v_null_pct_sex < v_error_tol:
            v_in_df_data = v_in_df_data.dropna(subset=["Sex"])
            v_in_df_data['gender'] = v_in_df_data["Sex"].apply(lambda x: 1 if x == 'male' else 0)
    else:
        print('Please review Data set for column Sex')

    # Transform categorical 'Embarked' to numeric 'Embarked_val'.
    if v_null_count_embarked == 0 or v_null_pct_embarked < v_error_tol:
        if v_null_pct_embarked < v_error_tol:
            v_in_df_data = v_in_df_data.dropna(subset=["Embarked"])
            conditions = [
                v_in_df_data["Embarked"] == 'S',
                v_in_df_data["Embarked"] == 'C',
                v_in_df_data["Embarked"] == 'Q',
            ]
            # np.select defaults to 0 for any value outside S/C/Q.
            v_in_df_data["Embarked_val"] = np.select(conditions, [1, 2, 3])
    else:
        print('Please review Data set for column Embarked')

    v_in_df_data_clean = pd_df.DataFrame(v_in_df_data, columns=['PassengerId', 'Survived', 'Pclass', 'gender',
                                                                'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_val'])

    # Fill the remaining NaNs (e.g. 'Age') with each column's median.
    median_impute = SimpleImputer(strategy="median")
    x_val = median_impute.fit_transform(v_in_df_data_clean)
    v_in_df_data_clean = pd_df.DataFrame(x_val, columns=v_in_df_data_clean.columns)
    return v_in_df_data_clean
64
+
65
+
66
def split_data(in_df_clean_base, in_file_process_flag, random_state=None):
    """Split a cleaned Titanic DataFrame into feature/target sets.

    Parameters
    ----------
    in_df_clean_base : pandas.DataFrame
        Output of ``clean_input_data`` (must contain 'Survived' and the
        feature columns below).
    in_file_process_flag : str
        'Trainset' performs a stratified 80/20 train/test split on 'Pclass';
        any other value treats the whole frame as the test set and returns
        ``[0]`` placeholders for the train parts.
    random_state : int | None, optional
        Seed forwarded to ``train_test_split`` for reproducible splits.
        Default ``None`` keeps the original non-deterministic behavior.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test).
    """
    feature_cols = ['PassengerId', 'Pclass', 'gender', 'Age', 'Fare', 'Embarked_val']
    if in_file_process_flag == 'Trainset':
        X_train, X_test, y_train, y_test = train_test_split(
            in_df_clean_base, in_df_clean_base['Survived'],
            test_size=0.2, stratify=in_df_clean_base['Pclass'],
            random_state=random_state)
        # Restrict features to the modelling columns (drops 'Survived',
        # 'SibSp' and 'Parch' from the X frames).
        X_train = pd_df.DataFrame(X_train, columns=feature_cols)
        X_test = pd_df.DataFrame(X_test, columns=feature_cols)
    else:
        X_test = pd_df.DataFrame(in_df_clean_base, columns=feature_cols)
        y_test = pd_df.DataFrame(in_df_clean_base, columns=['PassengerId', 'Survived'])
        # Placeholders so the 4-tuple shape matches the 'Trainset' branch.
        X_train = [0]
        y_train = [0]
    return X_train, X_test, y_train, y_test
80
+
81
+
82
def build_model(in_x_train, in_y_train, in_X_test, in_n_estimators, in_max_leaf_nodes, random_state=None):
    """Fit a RandomForestClassifier and predict on the test features.

    Parameters
    ----------
    in_x_train, in_y_train
        Training features and target.
    in_X_test
        Features to predict on with the fitted model.
    in_n_estimators : int
        Number of trees in the forest.
    in_max_leaf_nodes : int
        Maximum leaf nodes per tree (caps tree complexity).
    random_state : int | None, optional
        Seed for reproducible forests. Default ``None`` keeps the original
        non-deterministic behavior.

    Returns
    -------
    tuple
        (fitted classifier, predictions for ``in_X_test``).
    """
    # n_jobs=-1 uses all available cores for fitting/prediction.
    rnd_clf = RandomForestClassifier(n_estimators=in_n_estimators,
                                     max_leaf_nodes=in_max_leaf_nodes,
                                     n_jobs=-1, random_state=random_state)
    rnd_clf.fit(in_x_train, in_y_train)
    y_final_rf = rnd_clf.predict(in_X_test)
    return rnd_clf, y_final_rf
88
+
89
+
90
def model_metrics(in_y_test, in_y_final_rf):
    """Print accuracy, recall and precision for the given predictions."""
    scorers = (
        ('accuracy_score', accuracy_score),
        ('recall_score', recall_score),
        ('precision_score', precision_score),
    )
    for label, scorer in scorers:
        print(label + '->' + str(scorer(in_y_test, in_y_final_rf)))
95
+
96
+
97
# --- Training phase: load, clean, split, fit, and report hold-out metrics. ---
########reading file for train data
V_file = 'train - Titanic.csv'
V_file_process_flag = 'Trainset'
V_n_estimators = 500   # number of trees in the random forest
V_max_leaf_nodes = 16  # caps per-tree complexity
titanic_base = load_analysis_data(V_file)
titanic_clean_base = clean_input_data(titanic_base, V_file_process_flag)
V_X_train, V_X_test, V_y_train, V_y_test = split_data(titanic_clean_base, V_file_process_flag)
# build model and test accuracy of the model
rnd_clf_model, y_final_test_rf = build_model(V_X_train, V_y_train, V_X_test, V_n_estimators, V_max_leaf_nodes)
model_metrics(V_y_test, y_final_test_rf)

## testing of model
# --- Inference phase: reuse the already-fitted model on the test file. ---
V_file = 'test - Titanic.csv'
V_file_process_flag = 'Testset'
V_n_estimators = 500   # NOTE: unused in this phase (model already fitted)
V_max_leaf_nodes = 16  # NOTE: unused in this phase
titanic_base = load_analysis_data(V_file)
titanic_clean_base = clean_input_data(titanic_base, V_file_process_flag)
# 'Testset' flag: whole frame becomes V_X_test; train parts are placeholders.
V_X_train, V_X_test, V_y_train, V_y_test = split_data(titanic_clean_base, V_file_process_flag)
y_final_value = rnd_clf_model.predict(V_X_test)
#print(y_final_value.to_csv)

# Kaggle-style submission file: PassengerId + predicted Survived.
submit = pd_df.DataFrame({"PassengerId": V_X_test.PassengerId, 'Survived': y_final_value})
submit.to_csv("Titanic_final_submission.csv", index=False)