import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tsfeatures import (
    tsfeatures, acf_features, arch_stat, crossing_points,
    entropy, flat_spots, heterogeneity, holt_parameters,
    lumpiness, nonlinearity, pacf_features, stl_features,
    stability, hw_parameters, unitroot_kpss, unitroot_pp,
    series_length, sparsity, hurst, statistics
)


FILE_CATALOGUE = os.environ['FILE_CATALOGUE']
BUCKET_TIMENET = os.environ['BUCKET_TIMENET']
KEY_TIMENET = os.environ['KEY_TIMENET']


FEATS_COLS = [
    'hurst', 'series_length', 'unitroot_pp', 'unitroot_kpss', 'hw_alpha',
    'hw_beta', 'hw_gamma', 'stability', 'nperiods', 'seasonal_period',
    'trend_strength', 'spike', 'linearity', 'curvature', 'e_acf1',
    'e_acf10', 'seasonal_strength', 'peak', 'trough', 'x_pacf5',
    'diff1x_pacf5', 'diff2x_pacf5', 'seas_pacf', 'nonlinearity',
    'lumpiness', 'alpha', 'beta', 'flat_spots', 'entropy',
    'crossing_points', 'arch_lm', 'x_acf1', 'x_acf10', 'diff1_acf1',
    'diff1_acf10', 'diff2_acf1', 'diff2_acf10', 'seas_acf1', 'sparsity',
    'total_sum', 'mean', 'variance', 'median', 'p2point5', 'p5', 'p25',
    'p75', 'p95', 'p97point5', 'max', 'min'
]

def tsfeatures_vector(df: pd.DataFrame, seasonality: int) -> list:
    ts_df = tsfeatures(
        ts=df[['unique_id', 'ds', 'y']],
        freq=seasonality, 
        features=[sparsity, acf_features, crossing_points,
                  entropy, flat_spots, holt_parameters,
                  lumpiness, nonlinearity, pacf_features, stl_features,
                  stability, hw_parameters, unitroot_kpss, unitroot_pp,
                  series_length, hurst, arch_stat, statistics], 
        scale=False,
    ).rename(columns={'trend': 'trend_strength'})
    if seasonality == 1:
        # the seasonal features are not computed when seasonality == 1,
        # so add them here as NaN to keep the vector layout fixed
        ts_df[['seasonal_strength', 'peak', 'trough', 'seas_pacf', 'seas_acf1']] = np.nan
    ts_df[['trend_strength', 'seasonal_strength']] = ts_df[['trend_strength', 'seasonal_strength']].fillna(0)
    vector = ts_df[FEATS_COLS].fillna(0).iloc[0].values
    vector = (vector - vector.min()) / (vector.max() - vector.min())
    return vector.tolist()
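
# Usage sketch for tsfeatures_vector (the series name, dates, and values
# below are made up; any long-format frame with ['unique_id', 'ds', 'y']
# columns and a single series works):
#
#   Y_df = pd.DataFrame({
#       'unique_id': 'MySeries',
#       'ds': pd.date_range('2020-01-01', periods=36, freq='MS'),
#       'y': np.random.RandomState(0).rand(36),
#   })
#   vector = tsfeatures_vector(Y_df, seasonality=12)
#   assert len(vector) == len(FEATS_COLS)  # min-max scaled to [0, 1]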

def get_closest_ids(x: list, top_k: int, index_pinecone):
    query_response = index_pinecone.query(
        top_k=top_k,
        include_values=False,
        include_metadata=True,
        vector=x,
    )
    return query_response['matches']
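
# Connection sketch, assuming the pinecone-client v2 API (the dict-style
# query_response['matches'] access above matches that client generation;
# the index name and env var names here are placeholders):
#
#   import pinecone
#   pinecone.init(api_key=os.environ['PINECONE_API_KEY'],
#                 environment=os.environ['PINECONE_ENVIRONMENT'])
#   index_pinecone = pinecone.Index('timenet')
#   matches = get_closest_ids(vector, top_k=5, index_pinecone=index_pinecone)
#   uids = [m['id'] for m in matches]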

def highlight_smallest(s, nsmallest=3):
    # Define colors
    colors = ['lightgreen', 'lightblue', 'lightpink']
    
    # Rank the values; method="min" gives tied values the same (smallest) rank
    ranks = s.rank(method="min").astype(int)

    # Start with an empty style for every cell
    attr = ['' for _ in s]
    
    # Apply styles to the nsmallest
    for i in range(1, nsmallest+1):
        mask = ranks == i
        attr = ['background-color: {};'.format(colors[i-1]) if v else a for v, a in zip(mask, attr)]
    
    return attr
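
# highlight_smallest is meant for pandas' Styler.apply with axis=0: each
# column arrives as a Series and its three smallest values get green, blue,
# and pink backgrounds (ties share a color because rank(method="min")
# repeats ranks). A minimal sketch, mirroring its use in
# plot_best_models_count below:
#
#   styled = summary_df.style.apply(highlight_smallest, nsmallest=3, axis=0)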

def plot_best_models_count(ids, catalogue):
    uids = [x['id'] for x in ids]
    file_evaluations = catalogue['file_evaluation'].loc[uids].unique()
    eval_df = [pd.read_parquet(f_eval) for f_eval in file_evaluations]
    eval_df = pd.concat(eval_df).query('unique_id in @uids')
    eval_df = pd.pivot(
        eval_df,
        index=['unique_id', 'metric'],
        columns='model', 
        values='value'
    ).reset_index()
    models = eval_df.drop(columns=['unique_id', 'metric']).columns
    # compute relative metric
    for model in models:
        eval_df[model] = eval_df[model] / eval_df['Naive']
    summary_df = eval_df.groupby('metric')[models].median().T
    summary_df = summary_df[summary_df.index != 'Naive'].sort_values('mae')
    summary_df = summary_df.style.apply(highlight_smallest, nsmallest=3, axis=0)
    eval_df['BestModel'] = eval_df[models].idxmin(axis=1)
    fig = sns.catplot(eval_df.query('metric != "mase"'), y='BestModel', kind='count', col='metric')
    return fig, summary_df

def plot_closest_series(Y_df, uid, catalogue):
    # read the closest series from its file_timenet file and plot it
    # next to the uploaded one
    uid_catalogue = catalogue.loc[uid]
    closest_df = pd.read_parquet(uid_catalogue.file_timenet).query('unique_id == @uid')
    
    # Create a figure with 1 row and 2 columns
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
    
    # Get the unique_id for each DataFrame
    unique_id_Y_df = Y_df['unique_id'].unique()[0]
    unique_id_closest_df = closest_df['unique_id'].unique()[0]
    
    # Plot the 'y' column for both dataframes, against 'ds', and label them with unique_id
    sns.lineplot(x='ds', y='y', ax=axes[0], data=Y_df, label=unique_id_Y_df)
    sns.lineplot(x='ds', y='y', ax=axes[1], data=closest_df, label=unique_id_closest_df)
    
    # Set the titles for the subplots
    axes[0].set_title('Uploaded Dataset')
    axes[1].set_title(f'TimenetTimeSeries:{uid_catalogue.dataset},{uid_catalogue.subdataset},{uid_catalogue.ts_name}')
    
    # Show legend on each subplot
    axes[0].legend()
    axes[1].legend()
    
    # Display the plot
    plt.tight_layout()
    plt.show()
    return fig

def get_catalogue():
    return pd.read_parquet(FILE_CATALOGUE)
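

if __name__ == '__main__':
    # End-to-end sketch, assuming FILE_CATALOGUE points at a catalogue
    # indexed by unique_id and a populated Pinecone index is reachable.
    # The Pinecone env var names, the index name, and the demo series
    # are placeholders; see the connection sketch next to get_closest_ids.
    import pinecone

    pinecone.init(api_key=os.environ['PINECONE_API_KEY'],
                  environment=os.environ['PINECONE_ENVIRONMENT'])
    index_pinecone = pinecone.Index('timenet')

    Y_df = pd.DataFrame({
        'unique_id': 'MySeries',
        'ds': pd.date_range('2020-01-01', periods=36, freq='MS'),
        'y': np.random.RandomState(0).rand(36),
    })

    catalogue = get_catalogue()
    vector = tsfeatures_vector(Y_df, seasonality=12)
    matches = get_closest_ids(vector, top_k=5, index_pinecone=index_pinecone)

    plot_closest_series(Y_df, matches[0]['id'], catalogue)
    fig, summary_df = plot_best_models_count(matches, catalogue)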