File size: 3,078 Bytes
fa10c3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43e3ffb
fa10c3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pandas as pd
import numpy as np
from src.utils.helper_functions import save_parquet
import os
from config import Config

# Namespace mapping of the Config class, so settings are read by key
# (e.g. config["fold_input_directory"]).
config = vars(Config)

def get_data(
        data_dir='../data/raw/input/',
        file_name='demand_data_IFX.csv',
        date_columns=('reporting_month_start',),
        split_local_test=False,
        target='demand',
        fixed_columns=(
            'product_id',
            'product_application',
            'product_marketing_name',
            'product_main_family',
            'planning_method_latest',
        ),
        prediction_interval=('2023-11-01', '2024-07-01'),
    ):
    """Load the raw CSV and build the train frame and a test grid.

    The CSV is read, a normalized ``date`` column is derived from
    ``reporting_month_start`` and the rows are sorted by it. A test grid
    with one row per (product_id, month start) is then generated and
    enriched with the per-product fixed columns. The de-duplicated fixed
    columns are also written to
    ``<config["fold_input_directory"]>/fixed_columns.parquet``.

    Args:
        data_dir: Directory containing the input CSV.
        file_name: Name of the CSV file inside ``data_dir``.
        date_columns: Columns handed to ``pd.read_csv(parse_dates=...)``.
            The ``date`` column is always derived from
            ``reporting_month_start``, so that column must be among them.
        split_local_test: When true, the data is split with
            ``split_train_test`` and the test grid spans the dates of the
            held-out part; the ground truth is merged in and missing values
            are filled with 0. When false, all rows are training data and
            the grid spans ``prediction_interval`` with a NaN target.
        target: Name of the target column.
        fixed_columns: Columns copied from the training data onto every
            test row; must include ``product_id`` (the merge key).
        prediction_interval: ``(start, end)`` of the test grid used when
            ``split_local_test`` is false.

    Returns:
        A tuple ``(dataframe, generated_train, generated_test, y_test)``:
        the full sorted input frame, the training frame restricted to
        target, date and fixed columns, the test grid without the target
        column, and the target values aligned with the test grid.
    """
    # The defaults are immutable tuples (no shared mutable default state);
    # pandas needs real lists for parse_dates and for column selection.
    if isinstance(date_columns, tuple):
        date_columns = list(date_columns)
    fixed_columns = list(fixed_columns)

    print('Loading data...')

    dataframe = pd.read_csv(os.path.join(data_dir, file_name), parse_dates=date_columns)
    # Drop any time-of-day component so dates compare equal to month starts.
    dataframe['date'] = pd.to_datetime(dataframe['reporting_month_start'].dt.date)
    dataframe = dataframe.sort_values(by='date')

    if split_local_test:
        train, test = split_train_test(
            dataframe=dataframe
        )
        test_min_date, test_max_date = test.date.min(), test.date.max()
    else:
        train, test = dataframe, None
        test_min_date, test_max_date = prediction_interval

    generated_test = generate_test_data(
        start_date=test_min_date,
        end_date=test_max_date,
        product_ids=train.product_id.unique()
    )
    generated_test['date'] = pd.to_datetime(generated_test['date'])

    # De-duplicate the fixed columns once; reused for the merge and the export.
    fixed_lookup = train[fixed_columns].drop_duplicates(subset=fixed_columns)

    # Merge the fixed columns onto the test grid.
    # NOTE(review): a product with more than one distinct combination of
    # fixed-column values produces several rows per (product_id, date) here;
    # confirm the fixed columns are constant per product.
    generated_test = pd.merge(fixed_lookup, generated_test, on=['product_id'], how='right')

    save_parquet(
        dataframe=fixed_lookup,
        path=f'{config["fold_input_directory"]}/fixed_columns.parquet'
    )

    # Merge the ground truth.
    if split_local_test:
        generated_test = pd.merge(
            test[[target, 'date', 'product_id']],
            generated_test,
            on=['product_id', 'date'],
            how='right',
        )
        # Grid cells without an observed row are treated as zero.
        generated_test[target] = generated_test[target].fillna(0)
    else:
        generated_test[target] = np.nan

    # Training frame restricted to target, date and the fixed columns.
    generated_train = train[[target, 'date'] + fixed_columns]

    y_test = generated_test[target]
    generated_test = generated_test.drop(columns=target)

    return dataframe, generated_train, generated_test, y_test

def split_train_test(dataframe, split_date='2022-11-01', test_end_date='2023-07-01'):
    """Split a frame into train and test parts by its ``date`` column.

    Args:
        dataframe: Frame with a datetime ``date`` column.
        split_date: First date of the test window; rows strictly before it
            form the training set.
        test_end_date: Last date (inclusive) of the test window. Rows after
            it belong to neither part.

    Returns:
        A tuple ``(train, test)`` of row subsets of ``dataframe``.
    """
    split_date = pd.to_datetime(split_date)
    test_end_date = pd.to_datetime(test_end_date)

    dates = dataframe['date']
    train = dataframe[dates < split_date]
    test = dataframe[(dates >= split_date) & (dates <= test_end_date)]

    return train, test

def generate_test_data(start_date, end_date, product_ids):
    """Build the cross product of product ids and month-start dates.

    Args:
        start_date: Start of the date range.
        end_date: End of the date range.
        product_ids: Product identifiers, one grid row block per entry.

    Returns:
        A DataFrame with columns ``product_id`` and ``date`` holding every
        product paired with every month start between ``start_date`` and
        ``end_date``.
    """
    # 'MS' = month-start frequency.
    month_frame = pd.DataFrame(
        {'date': pd.date_range(start=start_date, end=end_date, freq='MS')}
    )
    id_frame = pd.DataFrame(product_ids, columns=['product_id'])

    # Cross join: every product id paired with every month start.
    return pd.merge(id_frame, month_frame, how='cross')