alpertml's picture
Upload 16 files
fa10c3d verified
raw
history blame
3.07 kB
import pandas as pd
import numpy as np
from src.utils.helper_functions import save_parquet
import os
from config import Config
# Expose the attributes defined on the Config class as a mapping so settings
# can be looked up by key (get_data reads config["fold_input_directory"]).
# NOTE: vars() on a class returns its own __dict__ as a read-only view, so
# attributes inherited from a base class of Config are not included.
config = vars(Config)
def get_data(
    data_dir='../data/raw/input/',
    file_name='demand_data_IFX.csv',
    date_columns=None,
    split_local_test=False,
    target='demand',
    fixed_columns=None,
    prediction_interval=('2023-11-01', '2024-07-01'),
):
    """Load the raw demand CSV and build the train frame and test grid.

    The test set is a full product x month grid (one row per product seen in
    the training data and per month start in the test window), enriched with
    the per-product fixed columns.

    Side effect: the de-duplicated fixed-column table is handed to
    ``save_parquet`` with the path
    ``<config["fold_input_directory"]>/fixed_columns.parquet``.

    Args:
        data_dir: Directory containing the input CSV.
        file_name: Name of the CSV file inside ``data_dir``.
        date_columns: Columns passed to ``pd.read_csv(parse_dates=...)``.
            Defaults to ``['reporting_month_start']``.
        split_local_test: When true, the loaded data is split with
            ``split_train_test`` and the test window is taken from the
            min/max ``date`` of the held-out part; otherwise all rows are
            used for training and ``prediction_interval`` defines the window.
        target: Name of the target column.
        fixed_columns: Per-product descriptive columns carried into both
            outputs. Must contain ``'product_id'`` (it is the merge key).
            Defaults to the five product_* / planning_method_latest columns.
        prediction_interval: ``(start, end)`` of the test window used when
            ``split_local_test`` is false.

    Returns:
        Tuple ``(generated_train, generated_test, y_test)``:
        ``generated_train`` is the training rows restricted to
        ``[target, 'date'] + fixed_columns``; ``generated_test`` is the
        product x month grid with fixed columns and without the target;
        ``y_test`` is the target series aligned with ``generated_test``
        (ground truth with missing values filled with 0 when
        ``split_local_test`` is true, all-NaN otherwise).
    """
    # None sentinels instead of mutable list defaults in the signature.
    if date_columns is None:
        date_columns = ['reporting_month_start']
    if fixed_columns is None:
        fixed_columns = [
            'product_id',
            'product_application',
            'product_marketing_name',
            'product_main_family',
            'planning_method_latest',
        ]

    print('Loading data...')
    dataframe = pd.read_csv(
        os.path.join(data_dir, file_name), parse_dates=date_columns
    )
    # NOTE(review): the source column is hard-coded, so `date_columns` must
    # include 'reporting_month_start' for the `.dt` accessor to work.
    # Going through `.dt.date` drops any time-of-day component.
    dataframe['date'] = pd.to_datetime(dataframe['reporting_month_start'].dt.date)
    dataframe = dataframe.sort_values(by='date')

    if split_local_test:
        train, test = split_train_test(dataframe=dataframe)
        test_min_date, test_max_date = test.date.min(), test.date.max()
    else:
        train, test = dataframe, None
        test_min_date, test_max_date = prediction_interval

    # Only products present in the training data get test rows.
    generated_test = generate_test_data(
        start_date=test_min_date,
        end_date=test_max_date,
        product_ids=train.product_id.unique(),
    )
    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # Build the per-product fixed-column table once; it is used both for the
    # merge below and for the parquet export.
    fixed_frame = train[fixed_columns].drop_duplicates(subset=fixed_columns)

    # Merge the fixed columns; the right join keeps every grid row.
    generated_test = pd.merge(
        fixed_frame, generated_test, on=['product_id'], how='right'
    )
    save_parquet(
        dataframe=fixed_frame,
        path=f'{config["fold_input_directory"]}/fixed_columns.parquet',
    )

    # Merge the ground truth.
    if split_local_test:
        generated_test = pd.merge(
            test[[target, 'date', 'product_id']],
            generated_test,
            on=['product_id', 'date'],
            how='right',
        )
        # Grid cells without an observed row are treated as zero demand.
        generated_test[target] = generated_test[target].fillna(0)
    else:
        generated_test[target] = np.nan

    # Training frame limited to the target, the date and the fixed columns.
    generated_train = train[[target, 'date'] + fixed_columns]

    y_test = generated_test[target]
    generated_test = generated_test.drop(columns=[target])
    return generated_train, generated_test, y_test
def split_train_test(dataframe, test_start='2022-11-01', test_end='2023-07-01'):
    """Split a frame into train and local-test parts by its 'date' column.

    Args:
        dataframe: Frame with a datetime-like 'date' column.
        test_start: First date (inclusive) of the test window; every row
            strictly before it goes to the training part.
        test_end: Last date (inclusive) of the test window.

    Returns:
        Tuple ``(train, test)``. Rows dated after ``test_end`` appear in
        neither part.
    """
    window_start = pd.to_datetime(test_start)
    window_end = pd.to_datetime(test_end)
    dates = dataframe['date']

    train = dataframe[dates < window_start]
    test = dataframe[(dates >= window_start) & (dates <= window_end)]
    return train, test
def generate_test_data(start_date, end_date, product_ids):
    """Return every (product_id, month start) pair inside the given window.

    Args:
        start_date: Lower bound of the window, forwarded to ``pd.date_range``.
        end_date: Upper bound of the window, forwarded to ``pd.date_range``.
        product_ids: Product identifiers to pair with each month.

    Returns:
        DataFrame with columns ``product_id`` and ``date``, holding one row
        per product and per month-start date between the two bounds.
    """
    # freq='MS' yields the first day of each month within the bounds.
    month_index = pd.date_range(start=start_date, end=end_date, freq='MS')
    months_frame = pd.DataFrame(month_index, columns=['date'])
    products_frame = pd.DataFrame(product_ids, columns=['product_id'])

    # Cartesian product: each product is repeated for every month.
    return pd.merge(products_frame, months_frame, how='cross')