File size: 8,525 Bytes
cb6a094
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import streamlit as st
import shap
import xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pickle as pk
import os

# ml related
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
import warnings
from scipy import stats
from scipy.stats import norm, skew

warnings.filterwarnings('ignore')

train_path = "train.csv"
test_path = "test.csv"
train_encode_path = "train_encode.csv"
real = "RealEstate.csv"
head = "head.jpg"

@st.cache_data
def load_train_data(train_path):
    """Read the raw training dataset from CSV (result cached by Streamlit)."""
    frame = pd.read_csv(train_path)
    return frame

@st.cache_data
def load_train_encode_data(train_encode_path):
    """Read the label-encoded training dataset from CSV (result cached by Streamlit)."""
    frame = pd.read_csv(train_encode_path)
    return frame

@st.cache_data
def load_test_data(test_path):
    """Read the test dataset from CSV (result cached by Streamlit)."""
    frame = pd.read_csv(test_path)
    return frame

@st.cache_data
def load_data(real):
    """Read the real-estate dataset from CSV (result cached by Streamlit)."""
    frame = pd.read_csv(real)
    return frame


def save_data(value, res):
    """Append one prediction record to the local CSV log ``db.csv``.

    Creates the file with a header row on first use, then appends
    ``value`` (the 14 feature values, in header column order) followed
    by the predicted price ``res``.

    Parameters
    ----------
    value : list
        Feature values in the order listed in the header row.
    res : int
        Predicted sale price to record in the ``Result`` column.
    """
    file = 'db.csv'
    if not os.path.exists(file):
        with open(file, 'w') as f:
            f.write("OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,fstFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,GarageYrBlt,MasVnrArea,Fireplaces,BsmtFinSF1,Result\n")
    with open(file, 'a') as f:
        # Bug fix: build the row from the `value` argument. The original
        # interpolated module-level globals (OverallQual, GrLivArea, ...),
        # which silently ignored `value` and raised NameError if the
        # function was called from any other context.
        row = ",".join(str(v) for v in value)
        f.write(f"{row},{res}\n")

# Sidebar banner image shown on every page of the app.
st.sidebar.image(head, caption="project on Artificial intelligence",use_column_width=True)




st.title("House Pricing Analysis")

# Page selector: `choices` drives the if-branches below, one per page.
menu=["House Prediction","Predicted House","About","Visual"]
choices=st.sidebar.selectbox("Menu Bar",menu)

if choices=='House Prediction':
    st.subheader("House Prediction")
    # --- Feature inputs; order below must match the model's training columns ---
    OverallQual=st.selectbox("Select the overall quality(10 being 'Very Excellent' and 1 being 'very poor')",(10,9,8,7,6,5,4,3,2,1))
    GrLivArea= st.number_input("Enter Ground Floor Living Area (in Sqft)",value=0,min_value=0,format='%d')
    GarageArea=st.number_input("Enter area of Garage (in Sqft)",value=0.0,format='%f',step=1.0)
    GarageCars=st.number_input("Number of Cars to be accomodated in garage",min_value=1.0,max_value=10.0,step=1.0,format='%f')
    TotalBsmtSF=st.number_input("Enter area of Basement (in Sqft)",value=0.0,format='%f',step=1.0)
    fstFlrSF=st.number_input("Enter area of First Floor (in Sqft)",value=0,format='%d')
    FullBath=st.number_input("Enter number of Bathrooms",min_value=1,max_value=10,format='%d')
    TotRmsAbvGrd=st.number_input("Enter number of Rooms",min_value=1,max_value=10,format='%d')
    years=tuple([i for i in range(1872,2011)])
    YearBuilt=st.selectbox("Select the overall quality(10 being 'Very Excellent' and 1 being 'very poor')",years)
    remyears=tuple([i for i in range(1950,2011)])
    YearRemodAdd=st.selectbox("Select Remodel date (same as construction date if no remodeling or additions)",remyears)
    garyears=tuple([i for i in range(1872,2011)])
    garyears=tuple(map(float,garyears))
    GarageYrBlt=st.selectbox("Select year in which Garage was built)",garyears)
    MasVnrArea=st.number_input("Masonry veneer area (in Sqft)",value=0.0,format='%f',step=1.0)
    Fireplaces=st.number_input("Select number of FirePlaces",min_value=1,max_value=10,format='%d')
    BsmtFinSF1=st.number_input("Enter Basement Finished Area(in Sqft)",value=0,format='%d')
    submit = st.button('Predict')
    if submit:
        # Assemble the 14 features as a single-row frame for the model.
        value=[OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,fstFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,GarageYrBlt,MasVnrArea,Fireplaces,BsmtFinSF1]
        df=pd.DataFrame(value).transpose()
        # Context managers close the pickle file handles promptly
        # (the originals were left open).
        with open('rfrmodel.pkl','rb') as model_file:
            model=pk.load(model_file)
        with open('scale.pkl','rb') as scale_file:
            scaler=pk.load(scale_file)
        # Bug fix: scaler.transform returns a NEW array; the original code
        # discarded it and predicted on the unscaled frame.
        scaled=scaler.transform(df)
        # Index [0] extracts the scalar; int() on a 1-element ndarray is
        # deprecated in NumPy.
        ans=int(model.predict(scaled)[0])
        # Report success only after prediction actually completed.
        st.success("Prediction Done")
        st.subheader(f"The price is {ans} ($) ")
        save_data(value,ans)

if choices=='Predicted House':
    st.subheader("Predicted House")
    st.info("expand to see data clearly")
    # The log file only exists once at least one prediction has been saved.
    db_file = "db.csv"
    if not os.path.exists(db_file):
        st.error("please try some prediction, then the data will be available here")
    else:
        saved = pd.read_csv(db_file)
        st.write(saved)
if choices=='About':
    st.subheader("About Us")
    # Static project description, rendered as markdown below.
    info='''
        A house value is simply more than location and square footage. Like the features that make up a person, an educated party would want to know all aspects that give a house its value.

        We are going to take advantage of all of the feature variables available to use and use it to analyze and predict house prices.

        We are going to break everything into logical steps that allow us to ensure the cleanest, most realistic data for our model to make accurate predictions from.

        - Load Data and Packages
        - Analyzing the Test Variable (Sale Price)
        - Multivariable Analysis
        - Impute Missing Data and Clean Data
        - Feature Transformation/Engineering
        - Modeling and Predictions
    '''
    st.markdown(info,unsafe_allow_html=True)

if choices=='Visual':
    st.subheader("Data Visualization")

    # Raw and encoded training data, plus the (unused here) test set.
    train_data = load_train_data(train_path)
    train_encode_data = load_train_encode_data(train_encode_path)
    test_data = load_test_data(test_path)

    # Train/validation split of the encoded data for the SHAP model below.
    X = train_encode_data.drop(['SalePrice'], axis=1)
    y = train_encode_data['SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    if st.checkbox("view dataset colum description"):
        st.subheader('displaying the column wise stats for the dataset')
        st.write(train_data.columns)
        st.write(train_data.describe())

    st.subheader('Correlation b/w dataset columns')
    # numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise
    # (older pandas silently dropped them, so the result is unchanged).
    corrmatrix = train_data.corr(numeric_only=True)
    fig, ax = plt.subplots(figsize=(20, 9))
    sns.heatmap(corrmatrix, vmax=.8, annot=True, ax=ax)
    # Pass the figure explicitly; bare st.pyplot() (global pyplot use) is removed.
    st.pyplot(fig)

    st.subheader("most correlated features")
    # Features whose absolute correlation with SalePrice exceeds 0.5.
    # (Removed the unused `top_corr` duplicate of corrmatrix.)
    top_corr_feat = corrmatrix.index[abs(corrmatrix['SalePrice']) > .5]
    fig = plt.figure(figsize=(10, 10))
    sns.heatmap(train_data[top_corr_feat].corr(numeric_only=True), annot=True, cmap="RdYlGn")
    st.pyplot(fig)

    st.subheader("Comparing Overall Quality vs Sale Price")
    fig = plt.figure()
    sns.barplot(x=train_data.OverallQual, y=train_data.SalePrice, alpha=0.8)
    st.pyplot(fig)

    st.subheader("Pairplot visualization to describe correlation easily")
    sns.set()
    cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
    # `size` was renamed `height` in seaborn 0.9 and later removed.
    grid = sns.pairplot(train_data[cols], height=2.5)
    st.pyplot(grid.fig)

    st.subheader("Analyis of Sale Price column in dataset")
    fig = plt.figure()
    # NOTE(review): distplot is deprecated (removed in seaborn >= 0.14);
    # kept for output parity — histplot/displot is the successor but does
    # not support fit=norm directly.
    sns.distplot(train_data['SalePrice'], fit=norm)
    # Fitted normal parameters of the SalePrice distribution.
    (mu, sigma) = norm.fit(train_data['SalePrice'])
    st.write('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

    # Raw string avoids the invalid-escape warning on \m and \s.
    plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')
    st.pyplot(fig)

    # Q-Q plot of SalePrice against the normal distribution.
    fig = plt.figure(figsize=(10, 10))
    stats.probplot(train_data['SalePrice'], plot=plt)
    st.pyplot(fig)

    # --- Explain the model's predictions using SHAP values ---
    xgboostmodel = xgboost.XGBRegressor().fit(X_train, y_train)
    explainer = shap.Explainer(xgboostmodel)
    shap_values = explainer(X_train)

    # Waterfall plot for the first prediction; show=False + st.pyplot so the
    # figure is actually rendered in the app (the original drew it to a
    # matplotlib figure that Streamlit never displayed).
    shap.plots.waterfall(shap_values[0], show=False)
    st.pyplot(plt.gcf())

    st.header('SHAP Feature importances')
    plt.title('Feature importances based on SHAP values')
    shap.summary_plot(shap_values, X_train, show=False)
    st.pyplot(plt.gcf(), bbox_inches='tight')
    st.write('---')

    plt.title('Feature importances based on SHAP values (Bar plot)')
    shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
    st.pyplot(plt.gcf(), bbox_inches='tight')