|
from sklearn.metrics import accuracy_score, recall_score, precision_score |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.ensemble import RandomForestClassifier |
|
from sklearn.impute import SimpleImputer |
|
import pandas as pd_df |
|
import numpy as np |
|
import xgboost as xgb |
|
import sklearn.metrics as mets |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.ensemble import GradientBoostingRegressor |
|
from sklearn.ensemble import RandomForestRegressor |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.ensemble import VotingRegressor |
|
|
|
|
|
|
|
def load_analysis_data(v_in_file): |
|
return pd_df.read_csv(v_in_file) |
|
|
|
|
|
def split_data(in_df_clean_base, in_file_process_flag): |
|
if in_file_process_flag == 'Trainset': |
|
X_train, X_test, y_train, y_test = train_test_split(in_df_clean_base, in_df_clean_base['SalePrice'], |
|
test_size=0.2, stratify=in_df_clean_base['YearRemodAdd']) |
|
|
|
|
|
else: |
|
X_test = pd_df.DataFrame(in_df_clean_base, |
|
columns=['PassengerId', 'Pclass', 'gender', 'Age', 'Fare', 'Embarked_val']) |
|
y_test = pd_df.DataFrame(in_df_clean_base, |
|
columns=['PassengerId', 'Survived']) |
|
X_train = [0] |
|
y_train = [0] |
|
return X_train, X_test, y_train, y_test |
|
|
|
|
|
|
|
v_train_file = 'train_HousePrice.csv' |
|
v_house_base = load_analysis_data(v_train_file) |
|
v_house_corr = v_house_base.corr() |
|
|
|
|
|
highest_corr_features = v_house_corr.index[abs(v_house_corr["SalePrice"]) > 0.5] |
|
Final_columns = highest_corr_features.values |
|
Final_column = np.append(Final_columns, ['Id']) |
|
|
|
v_house_reduce = pd_df.DataFrame(v_house_base, columns=Final_column) |
|
Final_column_val = np.delete(Final_column, np.where(Final_column == 'SalePrice')) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
X_train, X_test, y_train, y_test = split_data(v_house_reduce, 'Trainset') |
|
|
|
X_train = pd_df.DataFrame(X_train, columns=Final_column_val) |
|
X_test = pd_df.DataFrame(X_test, columns=Final_column_val) |
|
|
|
xgb_boost = xgb.XGBRegressor(eta=0.2, max_depth=5,subsample=0.8) |
|
xgb_model = xgb_boost.fit(X_train, y_train, eval_metric=mets.r2_score) |
|
|
|
reg1 = GradientBoostingRegressor(n_estimators=100, learning_rate=0.2) |
|
reg2 = RandomForestRegressor(n_estimators=100, max_depth=4) |
|
reg3 = LinearRegression() |
|
|
|
reg1.fit(X_train, y_train) |
|
reg2.fit(X_train, y_train) |
|
reg3.fit(X_train, y_train) |
|
|
|
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)]) |
|
ereg.fit(X_train, y_train) |
|
|
|
xt = X_test |
|
|
|
pred1 = reg1.predict(xt) |
|
pred2 = reg2.predict(xt) |
|
pred3 = reg3.predict(xt) |
|
pred4 = ereg.predict(xt) |
|
pred5 = xgb_model.predict(xt) |
|
|
|
plt.plot(pred1, 'gd', label='GradientBoostingRegressor') |
|
plt.plot(pred2, 'b^', label='RandomForestRegressor') |
|
plt.plot(pred3, 'ys', label='LinearRegression') |
|
plt.plot(pred5, 'xg', label='Xgboost') |
|
plt.plot(pred4, 'r*', ms=10, label='VotingRegressor') |
|
|
|
plt.tick_params(axis='x', which='both', bottom=False, top=False, |
|
labelbottom=False) |
|
plt.ylabel('predicted') |
|
plt.xlabel('training samples') |
|
plt.legend(loc="best") |
|
plt.title('Regressor predictions and their average') |
|
|
|
|
|
|
|
v_train_file = 'testHousePrice.csv' |
|
v_house_base_test = load_analysis_data(v_train_file) |
|
|
|
|
|
v_house_base_test_red = pd_df.DataFrame(v_house_base_test, columns=Final_column_val) |
|
print(v_house_base_test_red.info()) |
|
|
|
|
|
median_impute = SimpleImputer(strategy="median") |
|
median_impute.fit(v_house_base_test_red) |
|
X_val = median_impute.transform(v_house_base_test_red) |
|
v_in_df_data_clean = pd_df.DataFrame(X_val, columns=v_house_base_test_red.columns) |
|
print(v_in_df_data_clean.info()) |
|
|
|
|
|
pred6 = ereg.predict(v_in_df_data_clean) |
|
pred7 = xgb_model.predict(v_in_df_data_clean) |
|
|
|
|
|
submit_train = pd_df.DataFrame({"PassengerId": X_test.Id, 'calculated_voting': pred4, 'Actual': y_test, |
|
'xgboost': pred5, 'Score_voting': mets.r2_score(pred4, y_test), |
|
'Score_voting_xgb': mets.r2_score(pred5, y_test)}) |
|
|
|
submit = pd_df.DataFrame({"Id": v_house_base_test_red.Id, 'SalePrice': pred6, 'SalePricexgb': pred7}) |
|
|
|
submit.to_csv("Submission_houseprice_28.csv", index=False) |
|
submit_train.to_csv("Submission_houseprice_test1.csv", index=False) |