alpertml committed on
Commit
fa10c3d
·
verified ·
1 Parent(s): cc96811

Upload 16 files

Browse files
app.py CHANGED
@@ -1,10 +1,10 @@
1
  # external libraries
2
  import streamlit as st
3
  import pandas as pd
4
- import numpy as np
5
  import os
6
  import datetime
7
 
 
8
  from config import Config
9
 
10
  config = vars(Config)
@@ -62,7 +62,7 @@ def main():
62
  'planning_method_latest'])
63
 
64
  st.write(f'Average demand by category "{category}"')
65
- st.bar_chart(st.session_state["predictions_df"].loc[st.session_state["predictions_df"]['product_id'] == pid,:].groupby(category)['demand'].mean())
66
 
67
  input_save = st.checkbox(config['SAVE_CHECKBOX_TEXT'])
68
  confirm_params = {
@@ -92,38 +92,16 @@ def save(params):
92
 
93
  if params['input_save']:
94
  today = datetime.datetime.today().strftime("%d-%m-%Y")
95
- st.session_state["predictions_df"].to_excel(f'{dir}/predictions_{today}.xlsx', index=False)
96
 
97
 
98
  # forecasting
99
  def predict(input_date):
100
 
101
- data = {
102
- 'product_id': [f'P{1}' for i in range(5)],
103
- 'date':['2022-01-01','2022-02-01','2022-03-01','2022-04-01','2022-05-01'],
104
- 'demand': np.random.randint(1, 100, size=5),
105
- 'product_application': ['A','A','A','B','B']
106
- }
107
-
108
- data2 = {
109
- 'product_id': [f'P{2}' for i in range(5)],
110
- 'date':['2022-01-01','2022-02-01','2022-03-01','2022-04-01','2022-05-01'],
111
- 'demand': np.random.randint(1, 100, size=5),
112
- 'product_application': ['A','A','A','B','B']
113
- }
114
-
115
- df1 = pd.DataFrame(data)
116
- df2 = pd.DataFrame(data2)
117
-
118
- # Concatenate the two DataFrames vertically
119
- combined_df = pd.concat([df1, df2], ignore_index=True)
120
-
121
-
122
  forecast_start_date = input_date[0].strftime("%Y-%m-%d")
123
  forecast_end_date = input_date[1].strftime("%Y-%m-%d")
124
- print(forecast_start_date, forecast_end_date)
125
 
126
- st.session_state["predictions_df"] = combined_df
127
 
128
  if __name__ == "__main__":
129
  main()
 
1
  # external libraries
2
  import streamlit as st
3
  import pandas as pd
 
4
  import os
5
  import datetime
6
 
7
+ import pipeline
8
  from config import Config
9
 
10
  config = vars(Config)
 
62
  'planning_method_latest'])
63
 
64
  st.write(f'Average demand by category "{category}"')
65
+ st.bar_chart(st.session_state["predictions_df"].groupby(category)['demand'].mean())
66
 
67
  input_save = st.checkbox(config['SAVE_CHECKBOX_TEXT'])
68
  confirm_params = {
 
92
 
93
  if params['input_save']:
94
  today = datetime.datetime.today().strftime("%d-%m-%Y")
95
+ st.session_state["predictions_df"][['product_id','date','demand']].to_excel(f'{dir}/predictions_{today}.xlsx', index=False)
96
 
97
 
98
  # forecasting
99
  def predict(input_date):
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  forecast_start_date = input_date[0].strftime("%Y-%m-%d")
102
  forecast_end_date = input_date[1].strftime("%Y-%m-%d")
 
103
 
104
+ st.session_state["predictions_df"] = pipeline.run(forecast_start_date, forecast_end_date)
105
 
106
  if __name__ == "__main__":
107
  main()
config.py CHANGED
@@ -1,9 +1,12 @@
 
 
1
  class Config():
2
 
3
  def __init__(self):
4
  pass
5
 
6
  target = 'demand'
 
7
 
8
  not_include_features = [
9
  target,
@@ -18,7 +21,27 @@ class Config():
18
  'planning_method_latest'
19
  ]
20
 
21
- # production
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  MAIN_TITLE = 'Infineon Product Demand Forecasting System'
23
  SUB_TITLE = 'Data Analytics in Applications'
24
  ICON_PATH = 'images/infineon-icon-1.png'
 
1
+ from sklearn.metrics import r2_score
2
+
3
  class Config():
4
 
5
  def __init__(self):
6
  pass
7
 
8
  target = 'demand'
9
+ split_local_test = False
10
 
11
  not_include_features = [
12
  target,
 
21
  'planning_method_latest'
22
  ]
23
 
24
+ scorer = r2_score
25
+ model_type = 'CATBOOST'
26
+ fold = 5
27
+ fold_models_directory = 'models/date_models_test'
28
+ fold_input_directory = 'maps/date_models_test'
29
+
30
+ catboost_params = {
31
+ 'learning_rate': 0.03,
32
+ 'objective':'RMSE',
33
+ 'depth': 5,
34
+ 'early_stopping_rounds':200,
35
+ 'iterations': 2000,
36
+ 'use_best_model': True,
37
+ # 'eval_metric': CatBoostEvalMetricSMAPE(),
38
+ 'eval_metric': 'R2',
39
+ 'random_state': 42,
40
+ 'allow_writing_files': False,
41
+ 'thread_count':-1
42
+ }
43
+
44
+ # deployment
45
  MAIN_TITLE = 'Infineon Product Demand Forecasting System'
46
  SUB_TITLE = 'Data Analytics in Applications'
47
  ICON_PATH = 'images/infineon-icon-1.png'
pipeline.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.data.load_data import get_data, generate_test_data
2
+ from src.features.build_features import prepare_data
3
+ from src.data.preprocess import get_Xy
4
+ from src.utils.helper_functions import load_models, get_predictions, load_parquet
5
+ from config import Config
6
+ import numpy as np
7
+
8
+
9
+ import pandas as pd
10
+
11
+ config = vars(Config)
12
+
13
def run(forecast_start_date, forecast_end_date):
    """Forecast the target for every stored product over a monthly window.

    One row per (product, month start) is generated between the two dates,
    the columns stored in ``fixed_columns.parquet`` are merged in on
    ``product_id``, features are built, and the predictions of all models
    found in ``config['fold_models_directory']`` are averaged.

    Args:
        forecast_start_date: Start of the window, passed through to
            ``generate_test_data``.
        forecast_end_date: End of the window, passed through to
            ``generate_test_data``.

    Returns:
        The generated frame (fixed columns, ``product_id``, ``date``) with an
        extra ``config['target']`` column holding the averaged predictions.
    """
    print('Script Executing...')

    input_dir = config["fold_input_directory"]
    target = config['target']

    generated_test = generate_test_data(
        forecast_start_date,
        forecast_end_date,
        product_ids=load_parquet(f'{input_dir}/unique_products.parquet').values)

    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # merge the fixed columns
    generated_test = pd.merge(
        load_parquet(f'{input_dir}/fixed_columns.parquet'),
        generated_test, on=['product_id'], how='right')

    # Features are built on an explicit copy so the frame returned to the
    # caller keeps only the fixed columns, the date and the prediction.
    dataframe = prepare_data(
        dataframe=generated_test.copy(),
        add_datetime_features=True
    )

    # Every row gets a null target, so get_Xy routes all of them to X_test.
    dataframe[target] = np.nan

    _, X_test, _ = get_Xy(
        dataframe=dataframe,
        not_include=config['not_include_features'],
        cat_features=config['cat_features'],
        target=target,  # keep in sync with the column created above
        cat_encoding='category'
    )

    models = load_models(config['fold_models_directory'])

    generated_test[target] = get_predictions(models, X_test)

    print('Script Done!')

    return generated_test
51
+
52
+ # if __name__ == '__main__':
53
+ # run()
src/data/__pycache__/load_data.cpython-311.pyc ADDED
Binary file (4.05 kB). View file
 
src/data/__pycache__/preprocess.cpython-311.pyc ADDED
Binary file (1.38 kB). View file
 
src/data/load_data.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from src.utils.helper_functions import save_parquet
4
+ import os
5
+ from config import Config
6
+
7
+ config = vars(Config)
8
+
9
def get_data(
    data_dir = '../data/raw/input/',
    file_name = 'demand_data_IFX.csv',
    date_columns = ['reporting_month_start'],
    split_local_test = False,
    target = 'demand',
    fixed_columns = [
        'product_id',
        'product_application',
        'product_marketing_name',
        'product_main_family',
        'planning_method_latest',
    ],
    prediction_interval = ('2023-11-01', '2024-07-01')
):
    """Load the raw CSV and build the train frame plus a generated test grid.

    Args:
        data_dir: Directory containing the CSV file.
        file_name: Name of the CSV file inside ``data_dir``.
        date_columns: Forwarded to ``pandas.read_csv(parse_dates=...)``.
        split_local_test: When true, hold out a local test period via
            ``split_train_test`` and attach its ground truth (missing
            product/month pairs are filled with 0). Otherwise the whole file
            is training data and the test target is all NaN.
        target: Name of the target column.
        fixed_columns: Per-product columns copied onto the generated test
            rows; must include ``product_id``.
        prediction_interval: ``(start, end)`` of the generated test grid when
            ``split_local_test`` is false.

    Returns:
        Tuple ``(generated_train, generated_test, y_test)``;
        ``generated_test`` no longer contains the target column.

    Side effects:
        Writes the de-duplicated fixed columns to
        ``<config["fold_input_directory"]>/fixed_columns.parquet``.
    """
    print('Loading data...')

    # Private list copy: accepts any sequence and never hands out the shared
    # default list object.
    fixed_columns = list(fixed_columns)

    dataframe = pd.read_csv(os.path.join(data_dir, file_name), parse_dates=date_columns)
    dataframe['date'] = pd.to_datetime(dataframe['reporting_month_start'].dt.date)
    dataframe.sort_values(by='date', inplace=True)

    if split_local_test:
        train, test = split_train_test(
            dataframe = dataframe
        )

        test_min_date, test_max_date = test.date.min(), test.date.max()
    else:
        train, test = dataframe, None

        test_min_date, test_max_date = prediction_interval

    generated_test = generate_test_data(
        start_date=test_min_date,
        end_date=test_max_date,
        product_ids=train.product_id.unique()
    )

    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # Computed once and reused for both the merge and the parquet export.
    fixed_frame = train[fixed_columns].drop_duplicates(subset=fixed_columns)

    # merge the fixed columns
    generated_test = pd.merge(fixed_frame, generated_test, on=['product_id'], how='right')

    save_parquet(
        dataframe=fixed_frame,
        path=f'{config["fold_input_directory"]}/fixed_columns.parquet'
    )

    # merge the ground-truth
    if split_local_test:
        generated_test = pd.merge(test[[target,'date','product_id']], generated_test, on=['product_id','date'], how='right')
        generated_test[target] = generated_test[target].fillna(0)
    else:
        generated_test[target] = np.nan

    # generate fixed train
    generated_train = train[[target,'date'] + fixed_columns]

    y_test = generated_test[target]
    generated_test.drop(target, axis=1, inplace=True)

    return generated_train, generated_test, y_test
73
+
74
def split_train_test(dataframe, split_date='2022-11-01', test_end_date='2023-07-01'):
    """Split a frame chronologically on its ``date`` column.

    Args:
        dataframe: Frame with a datetime ``date`` column.
        split_date: First date belonging to the test set; rows strictly
            before it form the training set.
        test_end_date: Last date (inclusive) belonging to the test set; later
            rows are dropped from both parts.

    Returns:
        Tuple ``(train, test)``.
    """
    split_at = pd.to_datetime(split_date)
    test_end = pd.to_datetime(test_end_date)
    dates = dataframe['date']

    train = dataframe[dates < split_at]
    test = dataframe[(dates >= split_at) & (dates <= test_end)]

    return train, test
81
+
82
def generate_test_data(start_date, end_date, product_ids):
    """Build the cross product of product ids and monthly start dates.

    Args:
        start_date: Start of the range given to ``pandas.date_range``.
        end_date: End of the range given to ``pandas.date_range``.
        product_ids: Product identifiers as any array-like (list, 1-D array,
            (n, 1) array, Series, ...). The input is flattened, so the result
            does not depend on its shape or on a Series/column name.

    Returns:
        DataFrame with columns ``product_id`` and ``date``, one row per
        (product, month start) pair.
    """
    # Generate a range of monthly start dates
    month_starts = pd.date_range(start=start_date, end=end_date, freq='MS')
    dates = pd.DataFrame({'date': month_starts})

    # Flatten first: the DataFrame is built from plain values instead of
    # relying on the input's own shape or labels.
    flat_ids = np.asarray(product_ids).ravel()
    products = pd.DataFrame({'product_id': flat_ids})

    return products.merge(dates, how='cross')
src/data/preprocess.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def get_Xy(
    dataframe,
    not_include,
    cat_features,
    target='demand',
    cat_encoding='category'
):
    """Split a frame into labelled features, unlabelled features and labels.

    Args:
        dataframe: Input frame; it is copied, never modified.
        not_include: Column names excluded from the feature set.
        cat_features: Columns cast to pandas ``category`` dtype when
            ``cat_encoding == 'category'``. May be empty or ``None``.
        target: Name of the target column.
        cat_encoding: Only ``'category'`` triggers a conversion.

    Returns:
        Tuple ``(X, X_test, y)``: rows with a non-null target go to ``X`` and
        ``y``; rows with a null target go to ``X_test``.
    """
    print('Preprocessing...')

    tmp_df = dataframe.copy()

    features = [col for col in tmp_df.columns if col not in not_include]

    has_cat_features = cat_features is not None and len(cat_features) > 0
    if cat_encoding == 'category' and has_cat_features:
        tmp_df[cat_features] = tmp_df[cat_features].astype('category')

    # Build the null mask once and reuse it for all three selections.
    missing_target = tmp_df[target].isnull()

    X = tmp_df.loc[~missing_target, features]
    y = tmp_df.loc[~missing_target, target]
    X_test = tmp_df.loc[missing_target, features]

    return X, X_test, y
src/features/__pycache__/build_features.cpython-311.pyc ADDED
Binary file (1.12 kB). View file
 
src/features/build_features.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def prepare_data(
    dataframe,
    add_datetime_features=True
):
    """Run the feature-building steps on ``dataframe``.

    Args:
        dataframe: Frame to enrich.
        add_datetime_features: When true, the frame is passed through
            ``datetime_features``; otherwise it is returned untouched.

    Returns:
        The resulting frame.
    """
    print('Building features...')

    if not add_datetime_features:
        return dataframe

    return datetime_features(dataframe)
12
+
13
def datetime_features(dataframe, date='date', suffix=''):
    """Add month, year, quarter and ISO week columns derived from a date column.

    The new columns are written onto ``dataframe`` itself and are named
    ``f'{suffix}_month'``, ``f'{suffix}_year'``, ``f'{suffix}_quarter'`` and
    ``f'{suffix}_weekofyear'`` (so ``suffix`` is placed in front of the name).

    Args:
        dataframe: Frame holding a datetime column.
        date: Name of the datetime column.
        suffix: Text placed before the underscore in each new column name.

    Returns:
        The same ``dataframe`` object, with the four columns added.
    """
    stamps = dataframe[date].dt

    derived = (
        ('month', stamps.month),
        ('year', stamps.year),
        ('quarter', stamps.quarter),
        ('weekofyear', stamps.isocalendar().week),
    )
    for part, values in derived:
        dataframe[f'{suffix}_{part}'] = values

    return dataframe
src/models/__pycache__/evaluate_model.cpython-311.pyc ADDED
Binary file (3.57 kB). View file
 
src/models/__pycache__/train_model.cpython-311.pyc ADDED
Binary file (836 Bytes). View file
 
src/models/evaluate_model.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.models.train_model import train_model
2
+ from src.utils.helper_functions import post_process
3
+ import numpy as np
4
+
5
class MonthlyKFold:
    """Time-ordered splitter that validates on each of the last months.

    Months are identified by ``12 * X["_year"] + X["_month"]``. For each of
    the last ``n_splits`` distinct months, the fold trains on every row from
    strictly earlier months and validates on the rows of that month.
    """

    def __init__(self, n_splits=3):
        # Number of trailing months used as validation folds.
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_positions, test_positions)`` for each fold.

        Args:
            X: Frame with ``_year`` and ``_month`` columns.
            y: Unused.
            groups: Unused.

        Yields:
            Pairs of integer position arrays suitable for ``.iloc``. If ``X``
            holds no month earlier than a fold's month, that fold's training
            positions are empty.
        """
        month_index = (12 * X["_year"] + X["_month"]).to_numpy()
        timesteps = np.unique(month_index).tolist()  # sorted ascending

        for t in timesteps[-self.n_splits:]:
            # Positions come straight from the boolean masks; no copy of X
            # is needed to obtain them.
            idx_train = np.flatnonzero(month_index < t)
            idx_test = np.flatnonzero(month_index == t)

            yield idx_train, idx_test

    def get_n_splits(self, X, y=None, groups=None):
        """Return the configured number of splits (arguments are unused)."""
        return self.n_splits
22
+
23
+
24
def evaluate(
    X, y,
    model_params,
    cat_features,
    scorer,
    FOLD=5,
    model_type='CATBOOST'
):
    """Train and score one model per ``MonthlyKFold`` fold.

    Args:
        X: Feature frame; must contain the ``_year`` and ``_month`` columns
            that ``MonthlyKFold`` reads.
        y: Target values aligned positionally with ``X``.
        model_params: Parameters forwarded to ``train_model``.
        cat_features: Categorical feature names forwarded to ``train_model``.
        scorer: Callable ``scorer(y_true, y_pred)`` returning a number.
        FOLD: Number of folds passed to ``MonthlyKFold``.
        model_type: Model identifier forwarded to ``train_model``.

    Returns:
        Tuple ``(models, scores)`` with one entry per fold, in fold order.
    """
    print('Evaluating...')

    tscv = MonthlyKFold(FOLD)

    scores = []
    models = []

    for i, (train_index, valid_index) in enumerate(tscv.split(X)):

        print(f'FOLD:{i+1}')

        X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
        X_valid, y_valid = X.iloc[valid_index, :], y.iloc[valid_index]

        model = train_model(
            train=(X_train, y_train),
            model_params=model_params,
            model_type=model_type,
            cat_features=cat_features,
            valid=(X_valid, y_valid))

        # Score on the post-processed predictions.
        score = scorer(y_valid, post_process(model.predict(X_valid)))
        print(f'Score:{score:.5f}')

        models.append(model)
        scores.append(score)

    print(f"Scores:{scores}")
    print(f'Mean Score:{np.mean(scores):.5f} +- {np.std(scores):.3f}')

    return models, scores
68
+
src/models/train_model.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from catboost import CatBoostRegressor
2
+
3
def train_model(
    train,
    model_params,
    model_type,
    cat_features,
    valid=None,
):
    """Fit a regressor of the requested type.

    Args:
        train: Tuple ``(X_train, y_train)``.
        model_params: Keyword arguments for the model constructor.
        model_type: Model identifier; only ``'CATBOOST'`` is supported.
        cat_features: Categorical feature names passed to the constructor.
        valid: Optional tuple ``(X_valid, y_valid)`` used as evaluation set.
            When omitted, the model is fitted without one.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    if model_type != 'CATBOOST':
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'CATBOOST'."
        )

    X_train, y_train = train

    model = CatBoostRegressor(**model_params,
                              cat_features=cat_features)

    # Always bound, so fitting without a validation set no longer fails on an
    # undefined name.
    eval_set = None
    if valid:
        X_valid, y_valid = valid
        eval_set = [(X_valid, y_valid)]

    model.fit(X_train, y_train,
              eval_set=eval_set,
              verbose=200
              )

    return model
src/utils/__pycache__/helper_functions.cpython-311.pyc ADDED
Binary file (3.69 kB). View file
 
src/utils/helper_functions.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ import numpy as np
4
+ from datetime import datetime
5
+ import pandas as pd
6
+
7
def save_models(models, model_type, directory):
    """Pickle each model to ``<directory>/<model_type>_FOLD_<k>.pkl``.

    Args:
        models: Iterable of picklable models; ``k`` starts at 1 and follows
            iteration order.
        model_type: Prefix used in the file names.
        directory: Target directory; created (with parents) if missing.
    """
    print('Saving models...')

    if directory:
        os.makedirs(directory, exist_ok=True)

    for fold_number, model in enumerate(models, start=1):

        with open(f'{directory}/{model_type}_FOLD_{fold_number}.pkl', 'wb') as file:
            pickle.dump(model, file)
15
+
16
+
17
def load_models(directory):
    """Unpickle every ``.pkl`` file found directly in ``directory``.

    Files are loaded in sorted file-name order so the result is the same on
    every run and platform (the sort is lexicographic, so ``..._10.pkl``
    comes before ``..._2.pkl``).

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        List of loaded objects; empty when no ``.pkl`` file is present.
    """
    print('Loading models...')

    # List all files in the directory
    pkl_names = sorted(
        name for name in os.listdir(directory) if name.endswith('.pkl')
    )

    models = []
    for name in pkl_names:
        # NOTE: pickle.load executes code embedded in the file; only point
        # this at directories whose contents are trusted.
        with open(os.path.join(directory, name), 'rb') as handle:
            models.append(pickle.load(handle))

    return models
33
+
34
def get_predictions(models, X_test):
    """Average the post-processed predictions of several models.

    Args:
        models: Iterable of objects exposing ``predict(X_test)``.
        X_test: Features forwarded unchanged to each model.

    Returns:
        Element-wise mean over the models of ``post_process(predict(X_test))``.

    Raises:
        ValueError: If ``models`` is empty; averaging nothing would otherwise
            return NaN instead of failing visibly.
    """
    print('Forecasting test data...')

    models = list(models)
    if not models:
        raise ValueError('No models supplied: cannot average an empty ensemble.')

    preds = [post_process(model.predict(X_test)) for model in models]

    return np.mean(preds, axis=0)
43
+
44
def post_process(predictions):
    """Clip predictions from below at zero.

    Args:
        predictions: Any array-like of numbers (NumPy array, list, ...).

    Returns:
        The clipped values; the input is not modified.
    """
    # np.clip accepts plain sequences as well as arrays.
    return np.clip(predictions, 0, None)
49
+
50
def save_results(dataframe, file_name):
    """Write ``dataframe`` to a dated Excel file under ``demand_predictions/``.

    The file is named ``<file_name>_<YYYY-MM-DD>.xlsx`` using today's date,
    and the index is not written.

    Args:
        dataframe: Frame to export.
        file_name: File name stem (no date, no extension).
    """
    print('Saving results...')

    today_date = datetime.now().strftime("%Y-%m-%d")

    # Create the output folder on first use so the export does not fail on a
    # missing directory.
    os.makedirs('demand_predictions', exist_ok=True)

    dataframe.to_excel(f'demand_predictions/{file_name}_{today_date}.xlsx', index=False)
57
+
58
def save_parquet(dataframe, path):
    """Write ``dataframe`` to ``path`` as parquet, without the index.

    When ``path`` is a string or path-like object, its parent directory is
    created if missing; any other target is passed to pandas unchanged.
    """
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    dataframe.to_parquet(path, index=False)
60
+
61
def load_parquet(path):
    """Read the parquet file at ``path`` and return it as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame
63
+
64
+
65
+
66
+
67
+
68
+
src/visualization/visualize.py ADDED
File without changes