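"""Utilities for matching a user-provided time series against the TimeNet catalogue.

A feature vector is computed with `tsfeatures`, the closest catalogued series are
retrieved from a Pinecone index, and helper plots compare the uploaded series with
its nearest neighbour and show which models perform best on the matched series.
"""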
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tsfeatures import (
    tsfeatures, acf_features, arch_stat, crossing_points,
    entropy, flat_spots, heterogeneity, holt_parameters,
    lumpiness, nonlinearity, pacf_features, stl_features,
    stability, hw_parameters, unitroot_kpss, unitroot_pp,
    series_length, sparsity, hurst, statistics
)


FILE_CATALOGUE = os.environ['FILE_CATALOGUE']
BUCKET_TIMENET = os.environ['BUCKET_TIMENET']
KEY_TIMENET = os.environ['KEY_TIMENET']


FEATS_COLS = [
    'hurst', 'series_length', 'unitroot_pp', 'unitroot_kpss', 'hw_alpha',
    'hw_beta', 'hw_gamma', 'stability', 'nperiods', 'seasonal_period',
    'trend_strength', 'spike', 'linearity', 'curvature', 'e_acf1',
    'e_acf10', 'seasonal_strength', 'peak', 'trough', 'x_pacf5',
    'diff1x_pacf5', 'diff2x_pacf5', 'seas_pacf', 'nonlinearity',
    'lumpiness', 'alpha', 'beta', 'flat_spots', 'entropy',
    'crossing_points', 'arch_lm', 'x_acf1', 'x_acf10', 'diff1_acf1',
    'diff1_acf10', 'diff2_acf1', 'diff2_acf10', 'seas_acf1', 'sparsity',
    'total_sum', 'mean', 'variance', 'median', 'p2point5', 'p5', 'p25',
    'p75', 'p95', 'p97point5', 'max', 'min',
]

def tsfeatures_vector(df: pd.DataFrame, seasonality: int) -> list:
    """Compute a min-max scaled feature vector for the single series in `df`."""
    ts_df = tsfeatures(
        ts=df[['unique_id', 'ds', 'y']],
        freq=seasonality, 
        features=[sparsity, acf_features, crossing_points,
                  entropy, flat_spots, holt_parameters,
                  lumpiness, nonlinearity, pacf_features, stl_features,
                  stability, hw_parameters, unitroot_kpss, unitroot_pp,
                  series_length, hurst, arch_stat, statistics], 
        scale=False,
    ).rename(columns={'trend': 'trend_strength'})
    if seasonality == 1:
        # seasonal features are not computed for non-seasonal series; add them as NaN
        ts_df[['seasonal_strength', 'peak', 'trough', 'seas_pacf', 'seas_acf1']] = np.nan
    ts_df[['trend_strength', 'seasonal_strength']] = ts_df[['trend_strength', 'seasonal_strength']].fillna(0)
    vector = ts_df[FEATS_COLS].fillna(0).iloc[0].values
    vector = (vector - vector.min()) / (vector.max() - vector.min())
    return vector.tolist()

def get_closest_ids(x: list, top_k: int, index_pinecone):
    """Query the Pinecone index for the `top_k` vectors closest to the feature vector `x`."""
    query_response = index_pinecone.query(
        top_k=top_k,
        include_values=False,
        include_metadata=True,
        vector=x,
    )
    return query_response['matches']

def plot_best_models_count(ids, catalogue):
    """Plot, per metric, how often each model is the best performer on the matched series."""
    uids = [x['id'] for x in ids]
    file_evaluations = catalogue['file_evaluation'].loc[uids].unique()
    eval_df = [pd.read_parquet(f_eval) for f_eval in file_evaluations]
    eval_df = pd.concat(eval_df).query('unique_id in @uids')
    eval_df = pd.pivot(
        eval_df,
        index=['unique_id', 'metric'],
        columns='model', 
        values='value'
    ).reset_index()
    models = eval_df.drop(columns=['unique_id', 'metric']).columns
    eval_df['BestModel'] = eval_df[models].idxmin(axis=1)
    #eval_df = eval_df.groupby(['BestModel', 'metric']).size().rename('n').reset_index()
    fig = sns.catplot(eval_df.query('metric != "mase"'), y='BestModel', kind='count', col='metric')
    return fig

def plot_closest_series(Y_df, id, catalogue):
    """Plot the uploaded series next to the closest TimeNet series identified by `id`."""
    # read the series from its file_timenet parquet and plot it alongside the uploaded one
    uid_catalogue = catalogue.loc[id] 
    closest_df = pd.read_parquet(uid_catalogue.file_timenet).query('unique_id == @id')
    #Y_df['unique_id'] = 'ProvidedByUser'
    
    # Create a figure with 1 row and 2 columns
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
    
    # Get the unique_id for each DataFrame
    unique_id_Y_df = Y_df['unique_id'].unique()[0]
    unique_id_closest_df = closest_df['unique_id'].unique()[0]
    
    # Plot the 'y' column for both dataframes, against 'ds', and label them with unique_id
    sns.lineplot(x='ds', y='y', ax=axes[0], data=Y_df, label=unique_id_Y_df)
    sns.lineplot(x='ds', y='y', ax=axes[1], data=closest_df, label=unique_id_closest_df)
    
    # Set the titles for the subplots
    axes[0].set_title('Uploaded Dataset')
    axes[1].set_title(f'TimenetTimeSeries:{uid_catalogue.dataset},{uid_catalogue.subdataset},{uid_catalogue.ts_name}')
    
    # Show legend on each subplot
    axes[0].legend()
    axes[1].legend()
    
    # Display the plot
    plt.tight_layout()
    plt.show()
    return fig

def get_catalogue():
    return pd.read_parquet(FILE_CATALOGUE)
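

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module).
# Assumptions: the pre-3.0 `pinecone-client` API (which matches the `query`
# call in `get_closest_ids`), PINECONE_API_KEY / PINECONE_ENVIRONMENT env vars,
# a Pinecone index named 'timenet' holding the TimeNet feature vectors, a local
# 'uploaded_series.csv' with columns unique_id/ds/y, and a catalogue parquet
# indexed by unique_id. All of these names are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import pinecone

    pinecone.init(
        api_key=os.environ['PINECONE_API_KEY'],
        environment=os.environ['PINECONE_ENVIRONMENT'],
    )
    index_pinecone = pinecone.Index('timenet')

    catalogue = get_catalogue()

    # a single user-provided series with columns unique_id, ds, y
    Y_df = pd.read_csv('uploaded_series.csv', parse_dates=['ds'])

    # embed the series (seasonality=12 for monthly data) and retrieve its
    # nearest TimeNet neighbours
    vector = tsfeatures_vector(Y_df, seasonality=12)
    matches = get_closest_ids(vector, top_k=5, index_pinecone=index_pinecone)

    # inspect which models tend to win on the neighbours, then compare the
    # uploaded series with its closest match
    plot_best_models_count(matches, catalogue)
    plot_closest_series(Y_df, matches[0]['id'], catalogue)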