"""Month-based expanding-window cross-validation and fold-wise model evaluation."""

import numpy as np

from src.models.train_model import train_model
from src.utils.helper_functions import post_process


class MonthlyKFold:
    """Expanding-window splitter that holds out one calendar month per fold.

    Rows are ordered in time by the key ``12 * X["_year"] + X["_month"]``.
    Each of the last ``n_splits`` distinct months is used once as the test
    fold, with every row from strictly earlier months as the training fold.
    The earliest month in the data is never held out, because there would be
    nothing to train on; fewer than ``n_splits`` folds are therefore produced
    when the data does not span enough months.

    Args:
        n_splits: Maximum number of most-recent months to hold out.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """

    def __init__(self, n_splits=3):
        if n_splits < 1:
            # A non-positive count would make the ``[-n_splits:]`` slice below
            # select the wrong months (``[-0:]`` is the whole list).
            raise ValueError(f"n_splits must be at least 1, got {n_splits!r}")
        self.n_splits = n_splits

    def _test_months(self, X):
        """Return the per-row month keys and the sorted month keys to hold out."""
        dates = 12 * X["_year"] + X["_month"]
        months = sorted(dates.unique().tolist())
        # Drop the earliest month first: it has no earlier rows to train on.
        return dates, months[1:][-self.n_splits:]

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` pairs of positional row indices.

        Args:
            X: Object indexable by the column names ``"_year"`` and
                ``"_month"`` (e.g. a pandas DataFrame).
            y: Ignored; accepted for scikit-learn style call compatibility.
            groups: Ignored; accepted for scikit-learn style call compatibility.

        Yields:
            Two integer indexes suitable for ``.iloc``: the rows dated before
            the held-out month, and the rows of the held-out month itself.
            Folds are produced in chronological order.
        """
        dates, test_months = self._test_months(X)
        month_keys = dates.values
        # Positional 0..n-1 index, independent of whatever index X carries.
        positions = dates.reset_index(drop=True).index
        for month in test_months:
            yield positions[month_keys < month], positions[month_keys == month]

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of folds ``split`` will produce.

        Args:
            X: The data that will be split, or ``None``. When ``None`` the
                configured ``n_splits`` is returned because the actual number
                of available months is unknown.
            y: Ignored.
            groups: Ignored.
        """
        if X is None:
            return self.n_splits
        _, test_months = self._test_months(X)
        return len(test_months)


def evaluate(
    X, y, model_params, cat_features, scorer, FOLD=5, model_type='CATBOOST'
):
    """Train and score one model per monthly fold, printing progress.

    For every fold produced by ``MonthlyKFold(FOLD)`` a model is fitted with
    ``train_model`` on the training rows (the validation rows are passed along
    as ``valid``), its predictions on the validation rows are passed through
    ``post_process`` and scored with ``scorer``.

    Args:
        X: Feature table supporting ``.iloc`` and containing ``"_year"`` and
            ``"_month"`` columns, which drive the splitting.
            NOTE(review): all columns of X, including ``_year``/``_month``,
            are handed to ``train_model`` -- confirm that is intended.
        y: Targets aligned positionally with ``X``; must support ``.iloc``.
        model_params: Forwarded unchanged to ``train_model``.
        cat_features: Forwarded unchanged to ``train_model``.
        scorer: Callable invoked as ``scorer(y_true, y_pred)``; its result is
            formatted as a float.
        FOLD: Maximum number of most-recent months to validate on.
        model_type: Forwarded unchanged to ``train_model``.

    Returns:
        A ``(models, scores)`` tuple of two lists with one entry per fold, in
        chronological fold order.
    """
    print('Evaluating...')
    tscv = MonthlyKFold(FOLD)
    scores = []
    models = []
    for fold, (train_index, valid_index) in enumerate(tscv.split(X), start=1):
        print(f'FOLD:{fold}')
        X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
        X_valid, y_valid = X.iloc[valid_index, :], y.iloc[valid_index]
        model = train_model(
            train=(X_train, y_train),
            model_params=model_params,
            model_type=model_type,
            cat_features=cat_features,
            valid=(X_valid, y_valid),
        )
        score = scorer(y_valid, post_process(model.predict(X_valid)))
        print(f'Score:{score:.5f}')
        models.append(model)
        scores.append(score)
    print(f"Scores:{scores}")
    print(f'Mean Score:{np.mean(scores):.5f} +- {np.std(scores):.3f}')
    return models, scores