vijaykumar0704 committed on
Commit
d9d1ebe
·
1 Parent(s): f6bc88b

Create housingprice.py

Browse files
Files changed (1) hide show
  1. housingprice.py +126 -0
housingprice.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics import accuracy_score, recall_score, precision_score
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.impute import SimpleImputer
5
+ import pandas as pd_df
6
+ import numpy as np
7
+ import xgboost as xgb
8
+ import sklearn.metrics as mets
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+ from sklearn.ensemble import GradientBoostingRegressor
12
+ from sklearn.ensemble import RandomForestRegressor
13
+ from sklearn.linear_model import LinearRegression
14
+ from sklearn.ensemble import VotingRegressor
15
+
16
+
17
def load_analysis_data(v_in_file):
    """Read a CSV file into a pandas DataFrame.

    Args:
        v_in_file: Path (or any other input accepted by ``pandas.read_csv``)
            of the CSV file to load.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd_df.read_csv(v_in_file)
20
+
21
+
22
def split_data(in_df_clean_base, in_file_process_flag, random_state=None):
    """Split a cleaned DataFrame into train/test features and targets.

    Args:
        in_df_clean_base: DataFrame to split. For the ``'Trainset'`` flag it
            must contain the ``SalePrice`` and ``YearRemodAdd`` columns.
        in_file_process_flag: ``'Trainset'`` performs an 80/20 split; any
            other value treats the whole frame as test data.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` (the default) keeps the
            original unseeded behaviour.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. In the ``'Trainset'``
        case the feature frames still contain ``SalePrice``; the caller is
        expected to select the feature columns afterwards. Otherwise
        ``X_train`` and ``y_train`` are the placeholder list ``[0]``.
    """
    if in_file_process_flag == 'Trainset':
        # Stratifying on YearRemodAdd keeps the remodel-year distribution
        # similar in both parts; train_test_split needs at least two rows
        # per stratum for this to succeed.
        X_train, X_test, y_train, y_test = train_test_split(
            in_df_clean_base,
            in_df_clean_base['SalePrice'],
            test_size=0.2,
            stratify=in_df_clean_base['YearRemodAdd'],
            random_state=random_state,
        )
    else:
        # NOTE(review): these column names do not match the house-price
        # columns used in the 'Trainset' branch and look like they were
        # carried over from another dataset -- confirm before relying on
        # this branch. Columns missing from the input come back as NaN.
        X_test = pd_df.DataFrame(
            in_df_clean_base,
            columns=['PassengerId', 'Pclass', 'gender', 'Age', 'Fare', 'Embarked_val'],
        )
        y_test = pd_df.DataFrame(in_df_clean_base, columns=['PassengerId', 'Survived'])
        # Placeholders so the return shape is the same in both branches.
        X_train = [0]
        y_train = [0]
    return X_train, X_test, y_train, y_test
36
+
37
+
38
# Data analysis: load the training data and keep only the columns that are
# strongly correlated with SalePrice.
v_train_file = 'train_HousePrice.csv'
v_house_base = load_analysis_data(v_train_file)

# Correlate numeric/boolean columns only. Recent pandas versions raise when
# corr() meets text columns instead of silently skipping them.
v_house_corr = v_house_base.select_dtypes(include=[np.number, 'bool']).corr()

# Columns whose absolute correlation with SalePrice exceeds 0.5
# (SalePrice itself is included because it correlates perfectly with itself).
highest_corr_features = v_house_corr.index[abs(v_house_corr["SalePrice"]) > 0.5]
Final_columns = highest_corr_features.values

# Id is appended so rows can be identified in the output files.
Final_column = np.append(Final_columns, ['Id'])
v_house_reduce = pd_df.DataFrame(v_house_base, columns=Final_column)

# Same column list without the target: these are the model input columns.
Final_column_val = np.delete(Final_column, np.where(Final_column == 'SalePrice'))
50
+
51
+ # print(v_house_reduce["SalePrice"].skew())
52
+
53
+ # std_scale = StandardScaler()
54
+ # X_s = std_scale.fit_transform(v_house_reduce.columns)
55
+ # v_house_std = pd_df.DataFrame(X_s) # Put the np array back into a pandas DataFrame for later
56
+ # print(v_house_std.info())
57
+
58
# 80/20 split of the reduced training frame.
X_train, X_test, y_train, y_test = split_data(v_house_reduce, 'Trainset')

# split_data returns frames that still contain SalePrice; keep only the
# model input columns.
X_train = pd_df.DataFrame(X_train, columns=Final_column_val)
X_test = pd_df.DataFrame(X_test, columns=Final_column_val)

# eval_metric was previously passed to fit(). Without an eval_set it is never
# evaluated, and current xgboost releases no longer accept it as a fit()
# argument, so it is omitted here.
xgb_boost = xgb.XGBRegressor(eta=0.2, max_depth=5, subsample=0.8)
xgb_model = xgb_boost.fit(X_train, y_train)

reg1 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2)
reg2 = RandomForestRegressor(n_estimators=100, max_depth=4)
reg3 = LinearRegression()

# The base regressors are fitted individually because their own predictions
# are used later, alongside the ensemble's.
reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)

# Ensemble of the three regressors.
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
ereg.fit(X_train, y_train)
76
+
77
# Predict on the held-out split with every fitted model.
xt = X_test

pred1 = reg1.predict(xt)
pred2 = reg2.predict(xt)
pred3 = reg3.predict(xt)
pred4 = ereg.predict(xt)
pred5 = xgb_model.predict(xt)

# One marker series per model; the voting ensemble is drawn last and larger
# so it stays visible on top of the others.
plt.plot(pred1, 'gd', label='GradientBoostingRegressor')
plt.plot(pred2, 'b^', label='RandomForestRegressor')
plt.plot(pred3, 'ys', label='LinearRegression')
plt.plot(pred5, 'xg', label='Xgboost')
plt.plot(pred4, 'r*', ms=10, label='VotingRegressor')

# The x axis is only the sample position, so ticks and tick labels are hidden.
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.ylabel('predicted')
# The plotted predictions come from the held-out split, not the training rows.
plt.xlabel('test samples')
plt.legend(loc="best")
plt.title('Regressor predictions and their average')
97
+
98
+ #plt.show()
99
+
100
# Load the competition test file. The v_train_file name is reused for the
# path, as in the original script.
v_train_file = 'testHousePrice.csv'
v_house_base_test = load_analysis_data(v_train_file)

# Keep the same input columns the models were trained on.
v_house_base_test_red = pd_df.DataFrame(v_house_base_test, columns=Final_column_val)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
v_house_base_test_red.info()

# Fill missing values with medians learned from the training features, so the
# test data is preprocessed with statistics from the data the models saw.
median_impute = SimpleImputer(strategy="median")
median_impute.fit(X_train)
X_val = median_impute.transform(v_house_base_test_red)
v_in_df_data_clean = pd_df.DataFrame(X_val, columns=v_house_base_test_red.columns)
v_in_df_data_clean.info()

# Final predictions from the voting ensemble and from XGBoost.
pred6 = ereg.predict(v_in_df_data_clean)
pred7 = xgb_model.predict(v_in_df_data_clean)
117
+
118
# Save results to the output files.

# Hold-out report: predictions next to the actual prices, plus the R^2 score
# of each model repeated on every row. r2_score expects (y_true, y_pred); the
# score is not symmetric, so the order matters.
# The "PassengerId" header is kept unchanged for compatibility with existing
# consumers of this file; it holds the house Id.
submit_train = pd_df.DataFrame({
    "PassengerId": X_test.Id,
    'calculated_voting': pred4,
    'Actual': y_test,
    'xgboost': pred5,
    'Score_voting': mets.r2_score(y_test, pred4),
    'Score_voting_xgb': mets.r2_score(y_test, pred5),
})

# Submission file: ensemble and XGBoost predictions for the test set.
submit = pd_df.DataFrame({"Id": v_house_base_test_red.Id, 'SalePrice': pred6, 'SalePricexgb': pred7})
submit.to_csv("Submission_houseprice_28.csv", index=False)
submit_train.to_csv("Submission_houseprice_test1.csv", index=False)