Commit
·
d9d1ebe
1
Parent(s):
f6bc88b
Create housingprice.py
Browse files- housingprice.py +126 -0
housingprice.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.metrics import accuracy_score, recall_score, precision_score
|
2 |
+
from sklearn.model_selection import train_test_split
|
3 |
+
from sklearn.ensemble import RandomForestClassifier
|
4 |
+
from sklearn.impute import SimpleImputer
|
5 |
+
import pandas as pd_df
|
6 |
+
import numpy as np
|
7 |
+
import xgboost as xgb
|
8 |
+
import sklearn.metrics as mets
|
9 |
+
import matplotlib.pyplot as plt
|
10 |
+
import seaborn as sns
|
11 |
+
from sklearn.ensemble import GradientBoostingRegressor
|
12 |
+
from sklearn.ensemble import RandomForestRegressor
|
13 |
+
from sklearn.linear_model import LinearRegression
|
14 |
+
from sklearn.ensemble import VotingRegressor
|
15 |
+
|
16 |
+
|
17 |
+
# Load a CSV file into a pandas DataFrame
|
18 |
+
def load_analysis_data(v_in_file):
    """Read a CSV file into a pandas DataFrame.

    Args:
        v_in_file: Path of the CSV file to load (or any other source that
            ``pandas.read_csv`` accepts, such as an open file-like object).

    Returns:
        The parsed file contents as a ``pandas.DataFrame``.
    """
    return pd_df.read_csv(v_in_file)
|
20 |
+
|
21 |
+
|
22 |
+
def split_data(in_df_clean_base, in_file_process_flag):
    """Split a house-price frame into train and test parts.

    Args:
        in_df_clean_base: Data to split. In 'Trainset' mode it must contain
            the 'SalePrice' and 'YearRemodAdd' columns.
        in_file_process_flag: 'Trainset' performs a random 80/20 split;
            any other value treats the whole input as test data.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

        * 'Trainset': the four outputs of ``train_test_split``. X is the
          complete input frame (target column included), y is the
          'SalePrice' column.
        * otherwise: ``X_train`` and ``y_train`` are the placeholder ``[0]``,
          ``X_test`` is the input without 'SalePrice', and ``y_test`` is a
          frame with the columns 'Id' and 'SalePrice' (NaN where the input
          lacks them).
    """
    if in_file_process_flag == 'Trainset':
        # The full frame is passed as X, so the caller has to pick the
        # feature columns afterwards to keep 'SalePrice' out of the model.
        X_train, X_test, y_train, y_test = train_test_split(
            in_df_clean_base,
            in_df_clean_base['SalePrice'],
            test_size=0.2,
            stratify=in_df_clean_base['YearRemodAdd'],
        )
    else:
        # This branch previously selected Titanic columns ('PassengerId',
        # 'Pclass', 'Survived', ...) which do not exist in house-price data
        # and therefore produced all-NaN frames.
        frame = pd_df.DataFrame(in_df_clean_base)
        X_test = frame.drop(columns=['SalePrice'], errors='ignore')
        y_test = frame.reindex(columns=['Id', 'SalePrice'])
        X_train = [0]
        y_train = [0]
    return X_train, X_test, y_train, y_test
|
36 |
+
|
37 |
+
|
38 |
+
# data analysis
|
39 |
+
# Load the training data and keep only the columns whose absolute correlation
# with 'SalePrice' exceeds 0.5, plus the 'Id' column.
v_train_file = 'train_HousePrice.csv'
v_house_base = load_analysis_data(v_train_file)
# Correlate numeric/bool columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0 (older versions dropped them silently).
v_house_corr = v_house_base.select_dtypes(include=['number', 'bool']).corr()

highest_corr_features = v_house_corr.index[abs(v_house_corr["SalePrice"]) > 0.5]
Final_columns = highest_corr_features.values
Final_column = np.append(Final_columns, ['Id'])
v_house_reduce = pd_df.DataFrame(v_house_base, columns=Final_column)
# Feature columns = selected columns without the target itself.
Final_column_val = np.delete(Final_column, np.where(Final_column == 'SalePrice'))
|
50 |
+
|
51 |
+
# print(v_house_reduce["SalePrice"].skew())
|
52 |
+
|
53 |
+
# std_scale = StandardScaler()
|
54 |
+
# X_s = std_scale.fit_transform(v_house_reduce.columns)
|
55 |
+
# v_house_std = pd_df.DataFrame(X_s) # Put the np array back into a pandas DataFrame for later
|
56 |
+
# print(v_house_std.info())
|
57 |
+
|
58 |
+
# Hold out 20% of the training file, then restrict both parts to the
# selected feature columns (this drops 'SalePrice' from X).
X_train, X_test, y_train, y_test = split_data(v_house_reduce, 'Trainset')

X_train = pd_df.DataFrame(X_train, columns=Final_column_val)
X_test = pd_df.DataFrame(X_test, columns=Final_column_val)
# NOTE(review): Final_column_val still contains 'Id', so the row identifier is
# fed to every model as a feature - confirm whether that is intended.

xgb_boost = xgb.XGBRegressor(eta=0.2, max_depth=5, subsample=0.8)
# eval_metric is no longer passed to fit(): it is deprecated/removed there in
# recent xgboost releases and has no effect without an eval_set anyway.
xgb_model = xgb_boost.fit(X_train, y_train)

reg1 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2)
reg2 = RandomForestRegressor(n_estimators=100, max_depth=4)
reg3 = LinearRegression()

# The individual fits are kept because each model's own predictions are
# used later alongside the ensemble's.
reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)

ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
ereg.fit(X_train, y_train)
|
76 |
+
|
77 |
+
# Predict the hold-out split with every fitted model and plot the results
# side by side.
xt = X_test

pred1 = reg1.predict(xt)
pred2 = reg2.predict(xt)
pred3 = reg3.predict(xt)
pred4 = ereg.predict(xt)
pred5 = xgb_model.predict(xt)

plt.plot(pred1, 'gd', label='GradientBoostingRegressor')
plt.plot(pred2, 'b^', label='RandomForestRegressor')
plt.plot(pred3, 'ys', label='LinearRegression')
plt.plot(pred5, 'xg', label='Xgboost')
plt.plot(pred4, 'r*', ms=10, label='VotingRegressor')

# The x axis is just the sample position, so its ticks carry no information.
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.ylabel('predicted')
# The plotted points are predictions for X_test, not for the training rows.
plt.xlabel('hold-out samples')
plt.legend(loc="best")
plt.title('Regressor predictions and their average')
|
97 |
+
|
98 |
+
#plt.show()
|
99 |
+
|
100 |
+
# Load the unlabeled test file and reduce it to the model's feature columns.
# NOTE: v_train_file is re-used here and now points at the test file.
v_train_file = 'testHousePrice.csv'
v_house_base_test = load_analysis_data(v_train_file)

v_house_base_test_red = pd_df.DataFrame(v_house_base_test, columns=Final_column_val)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
v_house_base_test_red.info()

# Fill missing test values with medians learned from the training features,
# so no statistic of the test set leaks into the preprocessing.
median_impute = SimpleImputer(strategy="median")
median_impute.fit(X_train)
X_val = median_impute.transform(v_house_base_test_red)
v_in_df_data_clean = pd_df.DataFrame(X_val, columns=v_house_base_test_red.columns)
v_in_df_data_clean.info()
|
113 |
+
|
114 |
+
|
115 |
+
# Predict the imputed test file with the voting ensemble and with xgboost.
pred6 = ereg.predict(v_in_df_data_clean)
pred7 = xgb_model.predict(v_in_df_data_clean)

## Save Result in Output File
# Hold-out diagnostics. r2_score expects (y_true, y_pred); R^2 is not
# symmetric, so swapping the arguments gives a different, wrong score.
# NOTE(review): the "PassengerId" column actually holds the house 'Id'; the
# header is left unchanged so the output file keeps its existing schema.
submit_train = pd_df.DataFrame({
    "PassengerId": X_test.Id,
    'calculated_voting': pred4,
    'Actual': y_test,
    'xgboost': pred5,
    'Score_voting': mets.r2_score(y_test, pred4),
    'Score_voting_xgb': mets.r2_score(y_test, pred5),
})

submit = pd_df.DataFrame({"Id": v_house_base_test_red.Id, 'SalePrice': pred6, 'SalePricexgb': pred7})
submit.to_csv("Submission_houseprice_28.csv", index=False)
submit_train.to_csv("Submission_houseprice_test1.csv", index=False)
|