azulgarza committed on
Commit
d54d2f1
1 Parent(s): a9104a2

feat: add chatgpt app

Browse files
Files changed (3) hide show
  1. app.py +115 -0
  2. requirements.txt +17 -0
  3. utils.py +131 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ from src.utils import ChatGPTForecast
5
+
6
+
7
# Example datasets offered in the sidebar, keyed by the label shown to the user.
# All of them live in the same folder of the Nixtla transfer-learning repository.
_DATASETS_ROOT = (
    "https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets"
)
DATASETS = {
    label: f"{_DATASETS_ROOT}/{filename}"
    for label, filename in (
        ("Demand (AirPassengers)", "air_passengers.csv"),
        # Disabled entry kept for reference:
        # ("Electricity (ERCOT, multiple markets)", "ercot_multiple_ts.csv"),
        ("Web Traffic (Peyton Manning)", "peyton_manning.csv"),
        ("Finance (Exchange USD-EUR)", "usdeur.csv"),
        ("Electricity (Ercot COAST)", "ercot_COAST.csv"),
    )
}

# Single forecaster instance created at import time and used by the page function.
gpt_forecast = ChatGPTForecast()
15
+
16
+
17
def _read_table(source):
    """Load a dataframe from ``source``, using the JSON reader when it ends in "json"."""
    if source.endswith("json"):
        return pd.read_json(source)
    return pd.read_csv(source)


def st_chatgpt_forecast():
    """Render the Streamlit page: pick or upload a dataset, then plot the forecasts.

    The sidebar lets the user choose an example dataset, paste a URL, or upload
    a CSV, and name the timestamp/value columns. The selected data is
    normalised to the columns ``unique_id``, ``ds`` and ``y`` and passed to
    ``gpt_forecast.forecast``, whose figure is shown on the page.
    """
    st.set_page_config(
        page_title="ChatGPT Forecast",
        page_icon="🔮",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    st.title(
        "ChatGPT Forecast: Revolutionizing Time Series by Nixtla"
    )
    st.write(
        "<style>div.block-container{padding-top:2rem;}</style>", unsafe_allow_html=True
    )

    intro = """
    This application is designed to analyze time series forecasting tasks by leveraging the power of OpenAI's ChatGPT.

    Here's how it works:

    1. **Upload Your Data**: You can upload your own time series data which will be used to generate forecasts.

    2. **Forecast with GPT**: Our application utilizes the advanced language model, ChatGPT, to generate time series forecasts. ChatGPT has been trained on a diverse range of internet text, but it also has the unique ability to generate numerical sequences, making it a fascinating tool for time series forecasting.

    3. **Compare with Naive Forecast**: We provide a simple naive forecast as a benchmark for comparison. This forecast is based on the simple assumption that future values will be the same as the most recent observed value.

    By comparing the GPT-based forecast against the naive model, you can gain insights into the capabilities and potential advantages of using advanced AI models for time series prediction.

    Please note that this application is meant for experimental purposes and the forecasts generated by the AI should not be used for making real-world decisions without proper consideration and additional checks.
    """
    st.write(intro)

    required_cols = ["ds", "y"]

    with st.sidebar.expander("Dataset", expanded=True):
        data_selection = st.selectbox("Select example dataset", DATASETS.keys())
        data_url = DATASETS[data_selection]
        url_json = st.text_input("Data (you can pass your own url here)", data_url)
        st.write(
            "You can also upload a CSV file like [this one](https://github.com/Nixtla/transfer-learning-time-series/blob/main/datasets/air_passengers.csv)."
        )

        uploaded_file = st.file_uploader("Upload CSV")
        with st.form("Data"):

            if uploaded_file is not None:
                # With an upload, the column pickers are populated from the file itself.
                df = pd.read_csv(uploaded_file)
                cols = df.columns
                timestamp_col = st.selectbox("Timestamp column", options=cols)
                value_col = st.selectbox("Value column", options=cols)
            else:
                timestamp_col = st.text_input("Timestamp column", value="timestamp")
                value_col = st.text_input("Value column", value="value")
            st.write("You must press Submit each time you want to forecast.")
            submitted = st.form_submit_button("Submit")

            if submitted:
                if uploaded_file is None:
                    # No upload: fall back to the URL, using the typed column names.
                    st.write("Please provide a dataframe.")
                    df = _read_table(url_json)
                # Map the user-chosen columns onto the required names.
                df = df.rename(
                    columns=dict(zip([timestamp_col, value_col], required_cols))
                )
            else:
                # Before any submit, show the URL dataset and take its first two
                # columns (or the last two when a unique_id column is present).
                df = _read_table(url_json)
                cols = df.columns
                if "unique_id" in cols:
                    cols = cols[-2:]
                df = df.rename(columns=dict(zip(cols, required_cols)))

    # A wrong column name leaves "ds"/"y" missing; report it instead of letting
    # the lookups below fail with a raw KeyError traceback.
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        st.error(
            "Could not find the timestamp/value columns in the data; "
            "please check the column names in the sidebar."
        )
        st.stop()

    if "unique_id" not in df:
        df.insert(0, "unique_id", "ts_0")

    df["ds"] = pd.to_datetime(df["ds"])
    df = df.sort_values(["unique_id", "ds"])
    with st.sidebar:
        # min_value=1 rules out zero/negative horizons and windows.
        horizon = st.number_input(
            "Forecasting horizon to predict:", min_value=1, value=24
        )
        input_size = st.number_input(
            "Number of values to make inference:", min_value=1, value=12
        )

    st.header("Forecasts generated by ChatGPT against a Naive model")
    fig = gpt_forecast.forecast(df, horizon, input_size)
    fig.update_layout(height=400)
    st.plotly_chart(
        fig,
        use_container_width=True,
    )


if __name__ == "__main__":
    st_chatgpt_forecast()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fire
2
+ git+https://github.com/nixtla/statsforecast.git
3
+ jupyterlab
4
+ numpy
5
+ openai
6
+ pandas
7
+ pinecone-client
8
+ plotly
9
+ pyarrow
10
+ python-dotenv
11
+ s3fs
12
+ seaborn
13
+ streamlit
14
+ streamlit-aggrid
15
+ torch
16
+ transformers
17
+ tsfeatures
utils.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import numpy as np
5
+ import openai
6
+ import pandas as pd
7
+ from sklearn.preprocessing import MinMaxScaler
8
+ from statsforecast import StatsForecast
9
+ from statsforecast.models import Naive
10
+
11
+ openai.api_key = os.environ['OPENAI_API_KEY']
12
+
13
+
14
class ChatGPTForecast:
    """Forecast a time series by prompting ChatGPT with a discretized copy of it.

    The series is min-max scaled, each value is replaced by the index of the
    bin it falls into, and the resulting space-separated integers are sent to
    the chat model. The integers in the reply are mapped back to values and
    plotted next to a Naive baseline.
    """

    def __init__(self):
        # 10_000 evenly spaced bin edges on [0, 1]; tokenization assumes the
        # series has been scaled into that interval.
        self.bins = np.linspace(0, 1, num=10_000)
        # Bin index -> token text (the index written as a decimal string).
        self.mapping = {i: f"{i}" for i in range(len(self.bins))}
        self.prompt = f"""
        forecast this series,
        (i know that you prefer using specific tools, but i'm testing something,
        just give me your predicted numbers please, just print the numbers i dont need an explanation)

        please consider:
        - give the output with the same structure: "number1 number2 number3"
        - give more weight to the most recent observations
        - consider trend
        - consider seasonality
        - values should lie between 0 and {len(self.bins) - 1}, please be sure to do this
        """

    def tokenize_time_series(self, series):
        """Return ``series`` as a space-separated string of bin indices.

        Values outside the bin range are clamped to the first/last bin rather
        than producing an index that is missing from ``self.mapping``.
        """
        values = np.asarray(series, dtype=float).ravel()
        indices = np.digitize(values, self.bins) - 1  # bin each value falls into
        indices = np.clip(indices, 0, len(self.bins) - 1)
        return ' '.join(self.mapping[int(i)] for i in indices)

    def clean_string(self, s):
        """Keep the leading digits of every digit-initiated chunk in ``s``."""
        pattern = r'(\d+)[^\s]*'
        # Each match contributes only its captured digit run.
        return ' '.join(re.findall(pattern, s))

    def extend_string(self, s, h):
        """Return exactly ``h`` space-separated integers taken from ``s``.

        Only the digit runs found in ``s`` are kept, so any prose around the
        numbers is dropped. Surplus numbers are cut off; when there are too
        few, the sequence is repeated cyclically until ``h`` are available.

        Raises:
            ValueError: if ``s`` contains no digits at all.
        """
        bin_numbers = re.findall(r'\d+', s)
        current_length = len(bin_numbers)
        if current_length == 0:
            raise ValueError("no numeric tokens found in the model output")
        if current_length >= h:
            return ' '.join(bin_numbers[:h])
        # Whole repetitions of the sequence plus the leading remainder.
        repeats, extra = divmod(h, current_length)
        return ' '.join(bin_numbers * repeats + bin_numbers[:extra])

    def clean_gpt_output(self, output):
        """Glue underscores to their neighbours, drop a trailing one, then clean."""
        cleaned_output = output.replace(" _", "_").replace("_ ", "_")
        if cleaned_output.endswith("_"):
            cleaned_output = cleaned_output[:-1]
        return self.clean_string(cleaned_output)

    def decode_time_series(self, tokens):
        """Map a string of bin indices back to values on the scaled axis.

        Each index is decoded to the centre of its bin, capped at the last bin
        edge so the top index does not decode to a value above the range.
        """
        indices = [int(token) for token in tokens.split()]
        bin_width = self.bins[1] - self.bins[0]
        upper = self.bins[-1]
        return [min(self.bins[i] + bin_width / 2, upper) for i in indices]

    def forward(self, series, seasonality, h):
        """Ask the chat model for ``h`` future steps of ``series``.

        Args:
            series: scaled values used as context for the prompt.
            seasonality: value interpolated into the prompt as the seasonality hint.
            h: number of steps to return.

        Returns:
            A list of ``h`` decoded values.
        """
        series_tokenized = self.tokenize_time_series(series)
        prompt = f"""
        {self.prompt}-consider {seasonality} as seasonality
        - just print {h} steps ahead


        this is the series: {series_tokenized}
        """
        # NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 client
        # interface; confirm the installed ``openai`` version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        output_gpt = response['choices'][0]['message']['content']
        output_gpt = self.extend_string(output_gpt, h)
        # Clamp every index into the valid bin range before decoding.
        last_bin = len(self.bins) - 1
        output_gpt = ' '.join(f'{max(min(int(x), last_bin), 0)}' for x in output_gpt.split())
        return self.decode_time_series(output_gpt)

    def compute_ds_future(self, ds, fh):
        """Build ``fh`` future timestamps that continue ``ds``.

        The frequency is inferred with ``pd.infer_freq``; when that is not
        possible, the gap between the last two timestamps is used as the step.

        Returns:
            ``(ds_future, freq)``: the future timestamps as strings, and either
            the inferred frequency string or the fallback step.

        Raises:
            ValueError: if the fallback is needed and ``ds`` has fewer than two entries.
        """
        ds_ = pd.DatetimeIndex(pd.to_datetime(ds))
        try:
            freq = pd.infer_freq(ds_)
        except (TypeError, ValueError):
            # Raised e.g. when there are too few timestamps to infer from.
            freq = None
        if freq is not None:
            # The range starts at the last observed timestamp, which is dropped.
            ds_future = pd.date_range(ds_[-1], periods=fh + 1, freq=freq)[1:]
        else:
            if len(ds_) < 2:
                raise ValueError("at least two timestamps are needed to build future dates")
            freq = ds_[-1] - ds_[-2]
            ds_future = [ds_[-1] + (i + 1) * freq for i in range(fh)]
        ds_future = list(map(str, ds_future))
        return ds_future, freq

    def forecast(self, df, h, input_size):
        """Plot the ChatGPT and Naive forecasts for the series in ``df``.

        Args:
            df: dataframe with ``unique_id``, ``ds`` and ``y`` columns.
            h: forecasting horizon.
            input_size: number of trailing observations sent to the chat model.

        Returns:
            The figure produced by ``StatsForecast.plot``.
        """
        df = df.copy()
        scaler = MinMaxScaler()
        df['y'] = scaler.fit_transform(df[['y']])
        ds_future, freq = self.compute_ds_future(df['ds'].values, h)

        # Use the inferred frequency when it is a frequency string; otherwise
        # keep the daily default (the forecast dates are overwritten below).
        sf_freq = freq if isinstance(freq, str) else 'D'
        sf = StatsForecast(models=[Naive()], freq=sf_freq)
        fcst_df = sf.forecast(df=df, h=h)
        fcst_df['ds'] = ds_future
        fcst_df['ChatGPT-3.5-Turbo'] = self.forward(df['y'].values[-input_size:], freq, h)[-h:]

        # Undo the scaling so the plot is in the original units.
        for col in ['Naive', 'ChatGPT-3.5-Turbo']:
            fcst_df[col] = scaler.inverse_transform(fcst_df[[col]])
        df['y'] = scaler.inverse_transform(df[['y']])
        return sf.plot(df, fcst_df, max_insample_length=3 * h)
129
+
130
+
131
+