Spaces:
Sleeping
Sleeping
import pandas as pd | |
import numpy as np | |
from src.utils.helper_functions import save_parquet | |
import os | |
from config import Config | |
config = vars(Config) | |
def get_data(
    data_dir='../data/raw/input/',
    file_name='demand_data_IFX.csv',
    date_columns=['reporting_month_start'],
    split_local_test=False,
    target='demand',
    fixed_columns=[
        'product_id',
        'product_application',
        'product_marketing_name',
        'product_main_family',
        'planning_method_latest',
    ],
    prediction_interval=('2023-11-01', '2024-07-01')
):
    """Read the demand CSV and build the train frame, test frame and test target.

    Args:
        data_dir: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``data_dir``.
        date_columns: Columns handed to ``pd.read_csv(parse_dates=...)``.
            The body reads ``'reporting_month_start'`` directly, so that
            column has to be parsed as a datetime.
        split_local_test: If true, the loaded data is split with
            ``split_train_test`` and the test horizon is the min/max date of
            the held-out part; otherwise all rows are training data and the
            horizon is ``prediction_interval``.
        target: Name of the target column.
        fixed_columns: Columns whose distinct combinations are merged onto
            the generated test frame (joined on ``'product_id'``).
        prediction_interval: ``(start, end)`` of the test horizon used when
            ``split_local_test`` is false.

    Returns:
        A tuple ``(generated_train, generated_test, y_test)``:
        ``generated_train`` holds the target, ``'date'`` and the fixed columns
        of the training rows; ``generated_test`` is the frame produced by
        ``generate_test_data`` with the fixed columns merged on and the target
        removed; ``y_test`` is the target column of the test frame (ground
        truth with gaps filled by 0 when ``split_local_test`` is true, all NaN
        otherwise).

    Side effects:
        Prints a progress message and passes the de-duplicated fixed columns
        to ``save_parquet`` with a path under
        ``config["fold_input_directory"]``.
    """
    print('Loading data...')
    csv_path = os.path.join(data_dir, file_name)
    raw = pd.read_csv(csv_path, parse_dates=date_columns)
    # Rebuild 'date' from the calendar date only, discarding any time of day.
    raw['date'] = pd.to_datetime(raw['reporting_month_start'].dt.date)
    raw = raw.sort_values(by='date')

    # Pick the training rows and the horizon the test frame has to cover.
    if split_local_test:
        train, test = split_train_test(dataframe=raw)
        horizon_start = test['date'].min()
        horizon_end = test['date'].max()
    else:
        train, test = raw, None
        horizon_start, horizon_end = prediction_interval

    generated_test = generate_test_data(
        start_date=horizon_start,
        end_date=horizon_end,
        product_ids=train['product_id'].unique(),
    )
    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # Attach the fixed columns; the right join keeps every generated row.
    static_features = train[fixed_columns].drop_duplicates(subset=fixed_columns)
    generated_test = static_features.merge(
        generated_test, on=['product_id'], how='right'
    )
    save_parquet(
        dataframe=static_features,
        path=f'{config["fold_input_directory"]}/fixed_columns.parquet'
    )

    # Attach the ground truth when a local hold-out exists.
    if split_local_test:
        truth = test[[target, 'date', 'product_id']]
        generated_test = truth.merge(
            generated_test, on=['product_id', 'date'], how='right'
        )
        # Generated rows without a matching observation get a target of 0.
        generated_test[target] = generated_test[target].fillna(0)
    else:
        generated_test[target] = np.nan

    generated_train = train[[target, 'date'] + fixed_columns]
    # pop() returns the target column and removes it from the test frame.
    y_test = generated_test.pop(target)
    return generated_train, generated_test, y_test
def split_train_test(dataframe, test_start='2022-11-01', test_end='2023-07-01'):
    """Split ``dataframe`` into train and test parts by its ``'date'`` column.

    Args:
        dataframe: Frame with a datetime ``'date'`` column.
        test_start: First date (inclusive) of the test window; anything
            accepted by ``pd.to_datetime``. Rows strictly before it form the
            training set.
        test_end: Last date (inclusive) of the test window; anything accepted
            by ``pd.to_datetime``.

    Returns:
        A tuple ``(train, test)``. Rows dated after ``test_end`` appear in
        neither part.
    """
    window_start = pd.to_datetime(test_start)
    window_end = pd.to_datetime(test_end)
    dates = dataframe['date']

    train = dataframe[dates < window_start]
    test = dataframe[(dates >= window_start) & (dates <= window_end)]
    return train, test
def generate_test_data(start_date, end_date, product_ids):
    """Build every (product_id, month start) combination for a date range.

    Args:
        start_date: Start of the range passed to ``pd.date_range``.
        end_date: End of the range passed to ``pd.date_range``.
        product_ids: Product identifiers, one output group per identifier.

    Returns:
        A DataFrame with columns ``'product_id'`` and ``'date'`` holding the
        cross product of ``product_ids`` and the month-start dates between
        ``start_date`` and ``end_date``.
    """
    # 'MS' yields the first day of each month inside the range.
    month_frame = pd.DataFrame(
        {'date': pd.date_range(start=start_date, end=end_date, freq='MS')}
    )
    product_frame = pd.DataFrame(product_ids, columns=['product_id'])
    # Cross join: every product is paired with every month start.
    return product_frame.merge(month_frame, how='cross')