import pandas as pd
import numpy as np
from src.utils.helper_functions import save_parquet
import os
from config import Config

config = vars(Config)

# Immutable defaults for ``get_data``; the function turns them into fresh
# lists on every call so no list object is shared between calls.
_DEFAULT_DATE_COLUMNS = ('reporting_month_start',)
_DEFAULT_FIXED_COLUMNS = (
    'product_id',
    'product_application',
    'product_marketing_name',
    'product_main_family',
    'planning_method_latest',
)

# Default boundaries (both inclusive) of the local hold-out window used by
# ``split_train_test``.
_LOCAL_TEST_START = '2022-11-01'
_LOCAL_TEST_END = '2023-07-01'


def get_data(
    data_dir='../data/raw/input/',
    file_name='demand_data_IFX.csv',
    date_columns=None,
    split_local_test=False,
    target='demand',
    fixed_columns=None,
    prediction_interval=('2023-11-01', '2024-07-01'),
):
    """Load the raw CSV and build the train frame and a test skeleton.

    The test skeleton contains one row per (product_id, month start) pair
    for every product id seen in the training data, enriched with the
    per-product fixed columns. The de-duplicated fixed columns are also
    written to ``<config["fold_input_directory"]>/fixed_columns.parquet``.

    Args:
        data_dir: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``data_dir``.
        date_columns: Passed to ``pd.read_csv(parse_dates=...)``. ``None``
            means ``['reporting_month_start']``. The ``date`` column is
            always derived from ``reporting_month_start``, so that column
            has to be parsed as a datetime.
        split_local_test: If true, hold out a local test window via
            ``split_train_test`` and attach its ground truth (missing
            product/month pairs are filled with 0). If false, the whole
            file is used for training and the test skeleton spans
            ``prediction_interval`` with an all-NaN target.
        target: Name of the target column.
        fixed_columns: Per-product columns copied onto the test skeleton;
            must include ``product_id``. ``None`` means the default
            product attribute columns.
        prediction_interval: ``(start, end)`` of the generated test
            skeleton when ``split_local_test`` is false.

    Returns:
        A tuple ``(dataframe, generated_train, generated_test, y_test)``:
        the full loaded frame sorted by ``date``, the training rows reduced
        to ``[target, 'date'] + fixed_columns``, the test skeleton without
        the target column, and the target values aligned with it.

    Raises:
        ValueError: If ``split_local_test`` is true and the local test
            window contains no rows.
    """
    if date_columns is None:
        date_columns = list(_DEFAULT_DATE_COLUMNS)
    if fixed_columns is None:
        fixed_columns = _DEFAULT_FIXED_COLUMNS
    # A real list is needed below for ``[target, 'date'] + fixed_columns``
    # and for list-style column selection.
    fixed_columns = list(fixed_columns)

    print('Loading data...')
    dataframe = pd.read_csv(
        os.path.join(data_dir, file_name),
        parse_dates=date_columns,
    )
    # Strip the time-of-day component so that ``date`` is a pure calendar date.
    dataframe['date'] = pd.to_datetime(
        dataframe['reporting_month_start'].dt.date
    )
    dataframe.sort_values(by='date', inplace=True)

    if split_local_test:
        train, test = split_train_test(dataframe=dataframe)
        if test.empty:
            # Without this guard pd.date_range would fail further down with
            # a much less helpful "NaT" message.
            raise ValueError(
                'Local test split is empty: no rows fall inside the '
                'hold-out window.'
            )
        test_min_date, test_max_date = test.date.min(), test.date.max()
    else:
        train, test = dataframe, None
        test_min_date, test_max_date = prediction_interval

    generated_test = generate_test_data(
        start_date=test_min_date,
        end_date=test_max_date,
        product_ids=train.product_id.unique(),
    )
    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # Merge the fixed columns. The de-duplicated table is computed once and
    # reused for both the merge and the parquet export.
    # NOTE(review): rows are unique per full combination of fixed columns,
    # not per product_id; a product with several combinations yields several
    # test rows per month - confirm this is intended.
    fixed_table = train[fixed_columns].drop_duplicates(subset=fixed_columns)
    generated_test = pd.merge(
        fixed_table,
        generated_test,
        on=['product_id'],
        how='right',
    )
    save_parquet(
        dataframe=fixed_table,
        path=f'{config["fold_input_directory"]}/fixed_columns.parquet',
    )

    # Merge the ground truth.
    if split_local_test:
        generated_test = pd.merge(
            test[[target, 'date', 'product_id']],
            generated_test,
            on=['product_id', 'date'],
            how='right',
        )
        # Product/month pairs absent from the hold-out data get a target of 0.
        generated_test[target] = generated_test[target].fillna(0)
    else:
        generated_test[target] = np.nan

    # Training frame restricted to the target, the date and the fixed columns.
    generated_train = train[[target, 'date'] + fixed_columns]

    y_test = generated_test[target]
    generated_test.drop(target, axis=1, inplace=True)

    return dataframe, generated_train, generated_test, y_test


def split_train_test(
    dataframe,
    test_start=_LOCAL_TEST_START,
    test_end=_LOCAL_TEST_END,
):
    """Split ``dataframe`` on its ``date`` column into train and test parts.

    Args:
        dataframe: Frame with a datetime ``date`` column.
        test_start: First date (inclusive) of the test window; every row
            strictly before it goes to the training part.
        test_end: Last date (inclusive) of the test window. Rows after it
            belong to neither part.

    Returns:
        A tuple ``(train, test)`` of row subsets of ``dataframe``.
    """
    test_start = pd.to_datetime(test_start)
    test_end = pd.to_datetime(test_end)
    dates = dataframe['date']

    train = dataframe[dates < test_start]
    test = dataframe[(dates >= test_start) & (dates <= test_end)]
    return train, test


def generate_test_data(start_date, end_date, product_ids):
    """Build the cross product of product ids and monthly start dates.

    Args:
        start_date: Lower bound of the date range.
        end_date: Upper bound of the date range.
        product_ids: Product identifiers, one output group per entry.

    Returns:
        A DataFrame with columns ``product_id`` and ``date`` holding every
        combination of a product id and a month-start date between
        ``start_date`` and ``end_date``.
    """
    # Generate a range of monthly start dates.
    monthly_starts = pd.date_range(start=start_date, end=end_date, freq='MS')
    monthly_starts = pd.DataFrame(monthly_starts, columns=['date'])

    product_ids = pd.DataFrame(product_ids, columns=['product_id'])

    return product_ids.merge(monthly_starts, how='cross')