File size: 3,030 Bytes
80df7fa
 
 
 
 
 
 
 
 
 
b3fa682
7dc04fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3fa682
7dc04fd
 
 
 
 
b3fa682
80df7fa
 
 
 
b3fa682
 
 
 
 
4cd4c80
b3fa682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# create new features
def create_new_features(df):
    """Derive age and basement features from a house-sales frame.

    Args:
        df: DataFrame with a datetime-typed ``date`` column plus
            ``yr_built``, ``yr_renovated`` and ``sqft_basement`` columns.

    Returns:
        A new DataFrame without ``date`` and with ``house_age``,
        ``years_since_renovation`` and ``has_basement`` appended.
        The input frame is not modified.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Keep the sale year in a local Series instead of writing a temporary
    # column into the caller's frame.
    year_sold = df['date'].dt.year

    # drop() returns a new frame, so the assignments below never reach `df`.
    # A pre-existing `year_sold` column is discarded as well, which matches
    # the output this function has always produced.
    features = df.drop(columns=['date', 'year_sold'], errors='ignore')

    features['house_age'] = year_sold - features['yr_built']
    # No special case for yr_renovated == 0: the result is then simply the
    # sale year itself.
    features['years_since_renovation'] = year_sold - features['yr_renovated']
    # 1 when the basement area is positive, 0 otherwise (including NaN).
    features['has_basement'] = (features['sqft_basement'] > 0).astype('int64')
    return features

def normalize(df):
    """Min-max scale the numerical feature columns of ``df`` in place.

    Per-feature bounds are read from ``./min_dict.json`` and
    ``./max_dict.json`` (resolved against the current working directory)
    on every call. Each listed column is replaced by
    ``(x - min) / (max - min)``.

    Args:
        df: DataFrame containing every column named in
            ``numerical_features`` below.

    Returns:
        The same DataFrame object, with those columns overwritten.

    Raises:
        FileNotFoundError: if either JSON file is missing.
        KeyError: if a feature is absent from ``df`` or from either file.
        ZeroDivisionError: if a feature's min and max bounds are equal.
    """
    # Function-scope import: nothing at module level brings `json` into
    # scope, so without this the json.load calls raise NameError.
    import json

    # Bounds that used to be hard-coded here, kept for reference only; the
    # JSON files are what is actually applied.
    # min_dict = {'bedrooms': 0, 'bathrooms': 0, 'sqft_living': 370, 'sqft_lot': 638,
    #             'floors': 1, 'waterfront': 0, 'view': 0, 'condition': 1,
    #             'sqft_above': 370, 'sqft_basement': 0, 'yr_built': 1900,
    #             'yr_renovated': 0, 'house_age': 0, 'years_since_renovation': 0}
    # max_dict = {'bedrooms': 9, 'bathrooms': 8, 'sqft_living': 13540, 'sqft_lot': 1074218,
    #             'floors': 3, 'waterfront': 1, 'view': 4, 'condition': 5,
    #             'sqft_above': 9410, 'sqft_basement': 4820, 'yr_built': 2014,
    #             'yr_renovated': 2014, 'house_age': 114, 'years_since_renovation': 2014}

    with open("./min_dict.json", "r", encoding="utf-8") as f:
        min_dict = json.load(f)

    with open("./max_dict.json", "r", encoding="utf-8") as f:
        max_dict = json.load(f)

    numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
                          'view', 'condition', 'sqft_above', 'sqft_basement',
                          'yr_built', 'yr_renovated', 'house_age', 'years_since_renovation']

    for col in numerical_features:
        lower = min_dict[col]
        span = max_dict[col] - lower
        if span == 0:
            # Vectorised division would silently yield inf/NaN; keep failing
            # loudly as the element-wise Python division did.
            raise ZeroDivisionError(
                f"min and max bounds are equal for feature {col!r}"
            )
        # Whole-column arithmetic instead of a per-element lambda.
        df[col] = (df[col] - lower) / span
    return df

def init_new_pred():
    """Return a blank single-row feature record for a new prediction.

    Every model input column (numeric features, ``has_basement`` and the
    one-hot ``city_*`` flags) starts at 0, and ``date`` is set to a fixed
    timestamp.

    Returns:
        dict mapping each column name to 0, plus a ``date`` entry holding
        ``pd.to_datetime('2014-07-10')``.
    """
    import pandas as pd

    base_fields = [
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
        'yr_built', 'yr_renovated', 'house_age', 'years_since_renovation',
        'has_basement',
    ]
    city_fields = [
        'city_Algona', 'city_Auburn', 'city_Beaux Arts Village',
        'city_Bellevue', 'city_Black Diamond', 'city_Bothell', 'city_Burien',
        'city_Carnation', 'city_Clyde Hill', 'city_Covington',
        'city_Des Moines', 'city_Duvall', 'city_Enumclaw', 'city_Fall City',
        'city_Federal Way', 'city_Inglewood-Finn Hill', 'city_Issaquah',
        'city_Kenmore', 'city_Kent', 'city_Kirkland', 'city_Lake Forest Park',
        'city_Maple Valley', 'city_Medina', 'city_Mercer Island', 'city_Milton',
        'city_Newcastle', 'city_Normandy Park', 'city_North Bend',
        'city_Pacific', 'city_Preston', 'city_Ravensdale', 'city_Redmond',
        'city_Renton', 'city_Sammamish', 'city_SeaTac', 'city_Seattle',
        'city_Shoreline', 'city_Skykomish', 'city_Snoqualmie',
        'city_Snoqualmie Pass', 'city_Tukwila', 'city_Vashon',
        'city_Woodinville', 'city_Yarrow Point',
    ]

    # Insertion order is base fields, then cities, then the date entry.
    record = dict.fromkeys(base_fields + city_fields, 0)
    record['date'] = pd.to_datetime('2014-07-10')  # do not change
    return record