Spaces:
Runtime error
Runtime error
File size: 5,357 Bytes
1233062 56baf6d 1233062 56baf6d 1233062 56baf6d 1233062 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tsfeatures import (
tsfeatures, acf_features, arch_stat, crossing_points,
entropy, flat_spots, heterogeneity, holt_parameters,
lumpiness, nonlinearity, pacf_features, stl_features,
stability, hw_parameters, unitroot_kpss, unitroot_pp,
series_length, sparsity, hurst, statistics
)
# Required configuration from the environment (KeyError at import time if missing).
FILE_CATALOGUE = os.environ['FILE_CATALOGUE']  # path/URI of the parquet series catalogue (read by get_catalogue)
BUCKET_TIMENET = os.environ['BUCKET_TIMENET']  # NOTE(review): not used in this chunk — presumably the storage bucket; confirm
KEY_TIMENET = os.environ['KEY_TIMENET']  # NOTE(review): not used in this chunk — confirm intended use
# Canonical ordering of the feature vector built by tsfeatures_vector.
# This order must match the vectors already stored in the pinecone index,
# so do not reorder without re-indexing.
FEATS_COLS = ['hurst', 'series_length', 'unitroot_pp', 'unitroot_kpss', 'hw_alpha',
              'hw_beta', 'hw_gamma', 'stability', 'nperiods', 'seasonal_period',
              'trend_strength', 'spike', 'linearity', 'curvature', 'e_acf1',
              'e_acf10', 'seasonal_strength', 'peak', 'trough', 'x_pacf5',
              'diff1x_pacf5', 'diff2x_pacf5', 'seas_pacf', 'nonlinearity',
              'lumpiness', 'alpha', 'beta', 'flat_spots', 'entropy',
              'crossing_points', 'arch_lm', 'x_acf1', 'x_acf10', 'diff1_acf1',
              'diff1_acf10', 'diff2_acf1', 'diff2_acf10', 'seas_acf1', 'sparsity',
              'total_sum', 'mean', 'variance', 'median', 'p2point5', 'p5', 'p25',
              'p75', 'p95', 'p97point5', 'max', 'min']
def tsfeatures_vector(df: pd.DataFrame, seasonality: int) -> list:
    """Compute a min-max scaled feature vector for a single time series.

    Parameters
    ----------
    df : DataFrame containing the columns 'unique_id', 'ds', 'y'
        (assumed to hold a single series — only the first row of the
        resulting feature frame is used).
    seasonality : season length, forwarded to ``tsfeatures`` as ``freq``.

    Returns
    -------
    list of floats: the features in FEATS_COLS order, scaled to [0, 1].
    """
    ts_df = tsfeatures(
        ts=df[['unique_id', 'ds', 'y']],
        freq=seasonality,
        features=[sparsity, acf_features, crossing_points,
                  entropy, flat_spots, holt_parameters,
                  lumpiness, nonlinearity, pacf_features, stl_features,
                  stability, hw_parameters, unitroot_kpss, unitroot_pp,
                  series_length, hurst, arch_stat, statistics],
        scale=False,
    ).rename(columns={'trend': 'trend_strength'})
    if seasonality == 1:
        # Non-seasonal data: tsfeatures does not emit the seasonal columns,
        # so add them as NaN to keep the FEATS_COLS selection below valid.
        ts_df[['seasonal_strength', 'peak', 'trough', 'seas_pacf', 'seas_acf1']] = np.nan
    ts_df[['trend_strength', 'seasonal_strength']] = ts_df[['trend_strength', 'seasonal_strength']].fillna(0)
    vector = ts_df[FEATS_COLS].fillna(0).iloc[0].values
    # Min-max scale the vector. Guard the degenerate constant-vector case,
    # which would otherwise divide by zero and fill the vector with NaNs.
    value_range = vector.max() - vector.min()
    if value_range == 0:
        return np.zeros(len(vector), dtype=float).tolist()
    vector = (vector - vector.min()) / value_range
    return vector.tolist()
def get_closest_ids(x: list, top_k: int, index_pinecone):
    """Return the ``top_k`` nearest neighbours of vector ``x``.

    Queries the given pinecone index with metadata included (vector
    values excluded) and returns the raw list of matches.
    """
    response = index_pinecone.query(
        vector=x,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )
    return response['matches']
def highlight_smallest(s, nsmallest=3):
    """Styler helper: color the ``nsmallest`` values of a Series.

    Intended for ``DataFrame.style.apply(..., axis=0)``: receives one
    column and returns one CSS string per element (empty string for
    non-highlighted cells). Ties share a rank (method='min') and thus
    share a color.

    Parameters
    ----------
    s : pandas Series of comparable (numeric) values.
    nsmallest : number of ranks to highlight; effectively capped at the
        number of available colors (3).

    Returns
    -------
    list[str] of CSS declarations aligned with ``s``.
    """
    # One color per highlighted rank: best, second, third.
    colors = ['lightgreen', 'lightblue', 'lightpink']
    # Rank values; ties get the same (minimum) rank.
    ranks = s.rank(method="min").astype(int)
    styles = ['' for _ in s]
    # Cap the loop at the palette size so nsmallest > 3 cannot index
    # past `colors` (the original could IndexError with >=4 distinct ranks).
    for rank in range(1, min(nsmallest, len(colors)) + 1):
        mask = ranks == rank
        styles = ['background-color: {};'.format(colors[rank - 1]) if hit else style
                  for hit, style in zip(mask, styles)]
    return styles
def plot_best_models_count(ids, catalogue):
    """Summarize model performance on the series closest to a query.

    Parameters
    ----------
    ids : list of pinecone match dicts, each with an 'id' key.
    catalogue : DataFrame indexed by unique_id with a 'file_evaluation'
        column pointing at per-dataset evaluation parquet files.

    Returns
    -------
    (fig, summary_df) : seaborn count plot of which model is best per
        metric, and a Styler table of median metrics relative to the
        Naive baseline (3 smallest values highlighted per column).
    """
    uids = [match['id'] for match in ids]
    # Load every evaluation file covering one of the matched series,
    # keeping only rows for those series.
    file_evaluations = catalogue['file_evaluation'].loc[uids].unique()
    eval_df = pd.concat(pd.read_parquet(f_eval) for f_eval in file_evaluations)
    eval_df = eval_df.query('unique_id in @uids')
    # Wide format: one column per model.
    eval_df = pd.pivot(
        eval_df,
        index=['unique_id', 'metric'],
        columns='model',
        values='value'
    ).reset_index()
    models = eval_df.drop(columns=['unique_id', 'metric']).columns
    # Snapshot the baseline BEFORE normalizing: the loop below overwrites
    # the 'Naive' column with 1.0 when it reaches it, so dividing by
    # eval_df['Naive'] inside the loop would corrupt every model column
    # processed after 'Naive' (the original's result depended on column order).
    naive = eval_df['Naive'].copy()
    for model in models:
        eval_df[model] = eval_df[model] / naive
    summary_df = eval_df.groupby('metric')[models].median().T
    summary_df = summary_df[summary_df.index != 'Naive'].sort_values('mae')
    summary_df = summary_df.style.apply(highlight_smallest, nsmallest=3, axis=0)
    # Winner per (series, metric) row: the model with the smallest value.
    eval_df['BestModel'] = eval_df[models].idxmin(axis=1)
    fig = sns.catplot(eval_df.query('metric != "mase"'), y='BestModel', kind='count', col='metric')
    return fig, summary_df
def plot_closest_series(Y_df, id, catalogue):
    """Plot the user-provided series next to its closest Timenet series.

    Parameters
    ----------
    Y_df : DataFrame with 'unique_id', 'ds', 'y' (the uploaded series).
    id : unique_id of the closest catalogued series (note: shadows the
        builtin ``id``; kept for caller compatibility).
    catalogue : DataFrame indexed by unique_id, with file_timenet,
        dataset, subdataset and ts_name columns.

    Returns
    -------
    matplotlib Figure with the two side-by-side line plots.
    """
    # Read only the matched series from its Timenet parquet file.
    uid_catalogue = catalogue.loc[id]
    closest_df = pd.read_parquet(uid_catalogue.file_timenet).query('unique_id == @id')
    # Create a figure with 1 row and 2 columns
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
    # Get the unique_id for each DataFrame
    unique_id_Y_df = Y_df['unique_id'].unique()[0]
    unique_id_closest_df = closest_df['unique_id'].unique()[0]
    # Plot the 'y' column of both dataframes against 'ds', labelled with
    # each series' unique_id. The second plot is labelled too so that
    # axes[1].legend() below has an artist to show (the original left it
    # unlabeled, yielding an empty legend and an unused local).
    sns.lineplot(x='ds', y='y', ax=axes[0], data=Y_df, label=unique_id_Y_df)
    sns.lineplot(x='ds', y='y', ax=axes[1], data=closest_df, label=unique_id_closest_df)
    # Set the titles for the subplots
    axes[0].set_title('Uploaded Dataset')
    axes[1].set_title(f'TimenetTimeSeries:{uid_catalogue.dataset},{uid_catalogue.subdataset},{uid_catalogue.ts_name}')
    # Show legend on each subplot
    axes[0].legend()
    axes[1].legend()
    # Display the plot
    plt.tight_layout()
    plt.show()
    return fig
def get_catalogue():
    """Load the Timenet series catalogue.

    Reads the parquet file referenced by the FILE_CATALOGUE environment
    variable (resolved at import time) and returns it as a DataFrame.
    """
    catalogue_path = FILE_CATALOGUE
    return pd.read_parquet(catalogue_path)
|