azulgarza committed on
Commit
1233062
1 Parent(s): 1e8b640

helloworld

Browse files
Files changed (3) hide show
  1. app.py +139 -0
  2. requirements.txt +13 -0
  3. src/utils.py +110 -0
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import pinecone
6
+ import streamlit as st
7
+
8
+ from src.utils import (
9
+ tsfeatures_vector, get_closest_ids,
10
+ plot_best_models_count, plot_closest_series,
11
+ get_catalogue
12
+ )
13
+
14
# Catalogue of reference series, loaded once when the module is imported.
CATALOGUE = get_catalogue()

# Pinecone credentials and index name come from the environment; a missing
# variable raises KeyError at import time.
# NOTE(review): `pinecone.init` looks like the pinecone-client v2 interface --
# confirm the installed client version still provides it.
pinecone.init(
    api_key=os.environ['API_KEY'],
    environment=os.environ['ENVIRONMENT'],
)
INDEX = pinecone.Index(os.environ['INDEX_NAME'])

# Example datasets offered in the sidebar: display label -> CSV URL.
DATASETS = {
    "Demand (AirPassengers)": "https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/air_passengers.csv",
    "Electricity (Ercot COAST)": "https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/ercot_COAST.csv",
    #"Electriciy (ERCOT, multiple markets)": "https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/ercot_multiple_ts.csv",
    "Web Traffic (Peyton Manning)": "https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/peyton_manning.csv",
    "Finance (Exchange USD-EUR)": "https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/usdeur.csv",
}
27
+
28
+
29
+
30
def _read_remote_table(url):
    """Load a table from ``url``: JSON when it ends with ``json``, CSV otherwise."""
    if url.endswith("json"):
        return pd.read_json(url)
    return pd.read_csv(url)


def st_timenet_features():
    """Render the TimeNet Insights Streamlit page.

    The page loads a time series (example dataset, custom URL or uploaded
    CSV), renames its timestamp/value columns to ``ds``/``y``, computes a
    feature vector from the last 100 rows, queries ``INDEX`` for the closest
    series and draws two plots: the closest match and the count of best
    models among the matches.

    Returns:
        None. All output goes to the Streamlit page. The function returns
        early (after showing a message) when the expected columns cannot be
        found or when the index returns no matches.
    """
    st.set_page_config(
        page_title="TimeNet Insights",
        page_icon="🔮",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    st.title(
        "TimeNet Insights: Revolutionizing Time Series by Nixtla"
    )
    st.write(
        "<style>div.block-container{padding-top:2rem;}</style>", unsafe_allow_html=True
    )

    intro = """
    This tool is designed to perform time series analysis through comparative studies with the TimeNet dataset, curated by Nixtla.

    Beyond simply identifying the most similar series within our dataset to your uploaded data, this application provides insights into your time series, especially in terms of forecasting model performance.

    This app allows you to determine which predictive models might perform optimally for your particular data. The comparison between your series and similar ones within the TimeNet dataset can enhance your understanding of the context of your time series and facilitate better-informed forecasting decisions.
    """
    st.write(intro)

    required_cols = ["ds", "y"]

    # NOTE(review): the nesting of the uploader/form inside the sidebar
    # expander is reconstructed -- confirm against the intended layout.
    with st.sidebar.expander("Dataset", expanded=True):
        data_selection = st.selectbox("Select example dataset", DATASETS.keys())
        data_url = DATASETS[data_selection]
        url_json = st.text_input("Data (you can pass your own url here)", data_url)
        st.write(
            "You can also upload a CSV file like [this one](https://github.com/Nixtla/transfer-learning-time-series/blob/main/datasets/air_passengers.csv)."
        )

        uploaded_file = st.file_uploader("Upload CSV")
        with st.form("Data"):

            if uploaded_file is not None:
                df = pd.read_csv(uploaded_file)
                cols = df.columns
                timestamp_col = st.selectbox("Timestamp column", options=cols)
                value_col = st.selectbox("Value column", options=cols)
            else:
                timestamp_col = st.text_input("Timestamp column", value="timestamp")
                value_col = st.text_input("Value column", value="value")
            st.write("You must press Submit each time you want to forecast.")
            submitted = st.form_submit_button("Submit")
            if submitted:
                if uploaded_file is None:
                    st.write("Please provide a dataframe.")
                    # No upload: fall back to the URL typed in the sidebar.
                    df = _read_remote_table(url_json)
                # Map the user-selected columns onto the required names.
                df = df.rename(
                    columns=dict(zip([timestamp_col, value_col], required_cols))
                )
            else:
                # Nothing submitted yet: show the data behind the URL.
                df = _read_remote_table(url_json)
                cols = df.columns
                if "unique_id" in cols:
                    # Use the last two columns as timestamp and value.
                    cols = cols[-2:]
                df = df.rename(columns=dict(zip(cols, required_cols)))

    # A wrong column name (or picking the same column twice) leaves "ds" or
    # "y" missing; report it instead of failing with a KeyError below.
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        st.error(
            f"Could not find the expected column(s) {missing_cols} after renaming; "
            "check the timestamp and value column names."
        )
        return

    if "unique_id" not in df:
        df.insert(0, "unique_id", "ts_0")

    df["ds"] = pd.to_datetime(df["ds"])
    df = df.sort_values(["unique_id", "ds"])
    with st.sidebar:
        # Both inputs must be positive: they are forwarded to the feature
        # extractor and to the index query respectively.
        seasonality = st.number_input("Seasonality of your data:", min_value=1, value=1)
        top_k = st.number_input('Number of closest series:', min_value=1, value=12)

    y_vector_feats = tsfeatures_vector(df.tail(100), seasonality)
    closest_ids = get_closest_ids(y_vector_feats, top_k, INDEX)
    if not closest_ids:
        # Without matches there is nothing to plot (closest_ids[0] would fail).
        st.warning("No similar series were returned by the index.")
        return

    st.header('Closest match from TimeNet')
    st.write(
        """
        This side-by-side plot visualizes your uploaded time series (left) and its closest match from the TimeNet dataset (right).

        By comparing these two plots, you can see how your data's behavior aligns with the most similar series in the TimeNet dataset. Similarities in trends, cycles, or patterns can indicate shared underlying structures or influences between your series and the TimeNet series.
        """

    )
    st.pyplot(
        plot_closest_series(df, closest_ids[0]['id'], CATALOGUE)
    )
    st.header('Potential winner models')
    st.write(
        """
        This plot showcases the "win rate" of various predictive models.
        Each model's win rate is based on how frequently it outperforms others when used to forecast the closest series to your own data.

        This visualization allows you to compare the effectiveness of different models and identify which ones are more likely to provide accurate forecasts for your data.
        """
    )
    st.pyplot(
        plot_best_models_count(closest_ids, CATALOGUE)
    )
137
+
138
# Build the page only when the file is run as the main module.
if __name__ == "__main__":
    st_timenet_features()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fire
2
+ numpy
3
+ pandas
4
+ pinecone-client
5
+ plotly
6
+ pyarrow
7
+ python-dotenv
8
+ s3fs
9
+ seaborn
10
+ statsforecast
11
+ streamlit
12
+ streamlit-aggrid
13
+ tsfeatures
src/utils.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ import seaborn as sns
7
+
8
+ from tsfeatures import (
9
+ tsfeatures, acf_features, arch_stat, crossing_points,
10
+ entropy, flat_spots, heterogeneity, holt_parameters,
11
+ lumpiness, nonlinearity, pacf_features, stl_features,
12
+ stability, hw_parameters, unitroot_kpss, unitroot_pp,
13
+ series_length, sparsity, hurst, statistics
14
+ )
15
+
16
+
17
# Configuration read from the environment at import time; a missing variable
# raises KeyError. FILE_CATALOGUE is the parquet path read by get_catalogue().
# NOTE(review): BUCKET_TIMENET and KEY_TIMENET are not used in the code
# visible here -- confirm whether another module relies on them.
FILE_CATALOGUE = os.environ['FILE_CATALOGUE']
BUCKET_TIMENET = os.environ['BUCKET_TIMENET']
KEY_TIMENET = os.environ['KEY_TIMENET']
20
+
21
+
22
# Feature columns, in the exact order used to build the query vector in
# tsfeatures_vector(); the order must not change.
FEATS_COLS = [
    'hurst', 'series_length', 'unitroot_pp', 'unitroot_kpss', 'hw_alpha',
    'hw_beta', 'hw_gamma', 'stability', 'nperiods', 'seasonal_period',
    'trend_strength', 'spike', 'linearity', 'curvature', 'e_acf1',
    'e_acf10', 'seasonal_strength', 'peak', 'trough', 'x_pacf5',
    'diff1x_pacf5', 'diff2x_pacf5', 'seas_pacf', 'nonlinearity',
    'lumpiness', 'alpha', 'beta', 'flat_spots', 'entropy',
    'crossing_points', 'arch_lm', 'x_acf1', 'x_acf10', 'diff1_acf1',
    'diff1_acf10', 'diff2_acf1', 'diff2_acf10', 'seas_acf1', 'sparsity',
    'total_sum', 'mean', 'variance', 'median', 'p2point5', 'p5', 'p25',
    'p75', 'p95', 'p97point5', 'max', 'min',
]
32
+
33
def tsfeatures_vector(df: pd.DataFrame, seasonality: int) -> list:
    """Compute a min-max scaled feature vector for a time series.

    Args:
        df: Frame with ``unique_id``, ``ds`` and ``y`` columns.
        seasonality: Value passed to ``tsfeatures`` as ``freq``.

    Returns:
        A list of floats with one entry per name in ``FEATS_COLS`` (same
        order), taken from the first row of the feature table and rescaled
        to ``[0, 1]``. When every feature has the same value the result is
        all zeros.
    """
    ts_df = tsfeatures(
        ts=df[['unique_id', 'ds', 'y']],
        freq=seasonality,
        features=[sparsity, acf_features, crossing_points,
                  entropy, flat_spots, holt_parameters,
                  lumpiness, nonlinearity, pacf_features, stl_features,
                  stability, hw_parameters, unitroot_kpss, unitroot_pp,
                  series_length, hurst, arch_stat, statistics],
        scale=False,
    ).rename(columns={'trend': 'trend_strength'})
    if seasonality == 1:
        # Add the seasonal features as NaN when seasonality == 1
        # (presumably they are not produced in that case -- TODO confirm).
        for seasonal_col in ('seasonal_strength', 'peak', 'trough', 'seas_pacf', 'seas_acf1'):
            ts_df[seasonal_col] = np.nan
    ts_df[['trend_strength', 'seasonal_strength']] = ts_df[['trend_strength', 'seasonal_strength']].fillna(0)

    # Only the first row is used, i.e. one series per call.
    vector = ts_df[FEATS_COLS].fillna(0).iloc[0].to_numpy(dtype=float)
    lowest = vector.min()
    span = vector.max() - lowest
    if span == 0:
        # All features equal: dividing by zero would yield a NaN vector.
        return [0.0] * len(vector)
    return ((vector - lowest) / span).tolist()
51
+
52
def get_closest_ids(x: list, top_k: int, index_pinecone):
    """Query the index for the entries closest to a feature vector.

    Args:
        x: Query vector.
        top_k: Number of matches requested; coerced to ``int`` before the
            query.
        index_pinecone: Object exposing a ``query`` method that accepts the
            ``top_k``, ``include_values``, ``include_metadata`` and
            ``vector`` keyword arguments.

    Returns:
        The ``'matches'`` entry of the query response.
    """
    query_response = index_pinecone.query(
        top_k=int(top_k),
        include_values=False,
        include_metadata=True,
        vector=x,
    )
    return query_response['matches']
60
+
61
def plot_best_models_count(ids, catalogue):
    """Plot how often each model is the best one across the given series.

    Args:
        ids: Iterable of mappings, each with an ``'id'`` key.
        catalogue: Frame indexed by series id with a ``file_evaluation``
            column holding parquet paths of evaluation results.

    Returns:
        The object returned by ``sns.catplot``: one count panel per metric
        (rows with metric ``"mase"`` are excluded), counting the model with
        the lowest value for each (series, metric) pair.

    Raises:
        ValueError: If ``ids`` is empty.
    """
    uids = [match['id'] for match in ids]
    if not uids:
        # Fail with a clear message instead of pandas' "No objects to concatenate".
        raise ValueError("`ids` is empty: at least one match is required to rank models.")

    file_evaluations = catalogue['file_evaluation'].loc[uids].unique()
    eval_frames = [pd.read_parquet(f_eval) for f_eval in file_evaluations]
    # `@uids` refers to the local list above.
    eval_df = pd.concat(eval_frames).query('unique_id in @uids')
    eval_df = pd.pivot(
        eval_df,
        index=['unique_id', 'metric'],
        columns='model',
        values='value'
    ).reset_index()
    models = eval_df.drop(columns=['unique_id', 'metric']).columns
    # Lowest value wins for every (unique_id, metric) row.
    eval_df['BestModel'] = eval_df[models].idxmin(axis=1)
    # `data` is passed by keyword so the call does not depend on its position
    # in the catplot signature.
    fig = sns.catplot(
        data=eval_df.query('metric != "mase"'),
        y='BestModel',
        kind='count',
        col='metric',
    )
    return fig
77
+
78
def plot_closest_series(Y_df, id, catalogue):
    """Plot the user's series next to a series from the catalogue.

    Args:
        Y_df: Frame with ``unique_id``, ``ds`` and ``y`` columns (left panel).
        id: Identifier of the catalogue series to show on the right panel.
        catalogue: Frame indexed by series id providing ``file_timenet``,
            ``dataset``, ``subdataset`` and ``ts_name`` for each entry.

    Returns:
        The matplotlib figure holding both panels. The figure is not shown
        here; rendering is left to the caller.

    Raises:
        IndexError: If either frame has no ``unique_id`` value to use as the
            legend label.
    """
    # Read the series file referenced by the catalogue and keep only `id`.
    uid_catalogue = catalogue.loc[id]
    closest_df = pd.read_parquet(uid_catalogue.file_timenet).query('unique_id == @id')

    # One row, two columns: user data on the left, match on the right.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    # The first unique_id of each frame is used as its legend label.
    unique_id_Y_df = Y_df['unique_id'].unique()[0]
    unique_id_closest_df = closest_df['unique_id'].unique()[0]

    sns.lineplot(x='ds', y='y', ax=axes[0], data=Y_df, label=unique_id_Y_df)
    # Label the right-hand line too, so its legend is not empty.
    sns.lineplot(x='ds', y='y', ax=axes[1], data=closest_df, label=unique_id_closest_df)

    axes[0].set_title('Uploaded Dataset')
    axes[1].set_title(f'TimenetTimeSeries:{uid_catalogue.dataset},{uid_catalogue.subdataset},{uid_catalogue.ts_name}')

    axes[0].legend()
    axes[1].legend()

    fig.tight_layout()
    return fig
107
+
108
def get_catalogue():
    """Read the parquet file at ``FILE_CATALOGUE`` and return it as a DataFrame."""
    return pd.read_parquet(FILE_CATALOGUE)
110
+