File size: 6,977 Bytes
d54d2f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b22704e
 
 
 
 
 
 
 
 
 
 
 
 
 
d54d2f1
b22704e
d54d2f1
 
 
b22704e
d54d2f1
 
 
 
 
 
b22704e
 
d54d2f1
b22704e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d54d2f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b22704e
d54d2f1
 
 
 
 
 
 
 
b22704e
 
 
 
 
 
d54d2f1
b22704e
d54d2f1
 
b22704e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import re

import numpy as np
import openai
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from statsforecast import StatsForecast
from statsforecast.models import Naive

# Configure the OpenAI client at import time. This raises KeyError immediately
# if OPENAI_API_KEY is unset (fail fast rather than failing on the first call).
openai.api_key = os.environ['OPENAI_API_KEY']

class ChatGPTForecast:
    """Zero-shot time-series forecaster that prompts ChatGPT with a binned
    (tokenized) series and decodes the model's reply back into values.

    Pipeline: `forecast` min-max scales ``y`` to [0, 1]; the series is
    discretized into bin indices and sent to the chat API as a
    whitespace-separated string of integers; the returned integers are
    mapped back to bin centers and inverse-scaled.
    """

    def __init__(self):
        # 10,000 equally spaced bin edges covering the scaled range [0, 1]
        # (inputs are MinMax-scaled before tokenization, see `forecast`).
        self.bins = np.linspace(0, 1, num=10_000)
        # Identity map bin index -> token string; kept as a hook for
        # alternative tokenizations.
        self.mapping = {i: f"{i}" for i in range(len(self.bins))}
        # Base prompt; per-call details (seasonality, horizon, value range)
        # are appended in `call_openai`.
        self.prompt = f"""
        forecast this series, 
        (i know that you prefer using specific tools, but i'm testing something, 
        just give me your predicted numbers please, just print the numbers i dont need an explanation)

        please consider:
        - give the output with the same structure: "number1 number2 number3"
        - give more weight to the most recent observations
        - consider trend
        - consider seasonality
        """

    def tokenize_time_series(self, series):
        """Map each value of `series` to its bin index and join as one string."""
        # digitize returns 1-based bin positions; shift to 0-based indices.
        indices = np.digitize(series, self.bins) - 1
        return ' '.join(self.mapping[i] for i in indices)

    def clean_string(self, s):
        """Keep only the leading digit runs of each token in `s`.

        E.g. ``"12_x 34abc"`` -> ``"12 34"``; strips any non-space suffix
        the model may have attached to a number.
        """
        pattern = r'(\d+)[^\s]*'
        cleaned = ' '.join(re.findall(pattern, s))
        return cleaned

    def extend_string(self, s, h):
        """Return a string of exactly `h` numeric tokens derived from `s`.

        Trims when `s` has more than `h` tokens; cycles the tokens when it
        has fewer. Assumes `s` contains at least one digit run.
        """
        bin_numbers = re.findall(r'\d+', s)
        current_length = len(bin_numbers)
        if current_length == h:
            return s
        elif current_length > h:
            return ' '.join(bin_numbers[:h])
        else:
            # Repeat the full token list, then pad with a prefix so the
            # total count is exactly h.
            repeats = h // current_length
            extra = h % current_length
            return ' '.join(bin_numbers * repeats + bin_numbers[:extra])

    def clean_gpt_output(self, output):
        """Normalize a raw chat completion into a clean token string."""
        # Collapse spaces around underscores, then drop a trailing one.
        cleaned_output = output.replace(" _", "_").replace("_ ", "_")
        if cleaned_output.endswith("_"):
            cleaned_output = cleaned_output[:-1]
        return self.clean_string(cleaned_output)

    def decode_time_series(self, tokens):
        """Convert a string of bin indices back to values (bin centers)."""
        indices = [int(token) for token in tokens.split()]
        # Use the midpoint of each bin as the reconstructed value.
        bin_width = self.bins[1] - self.bins[0]
        series = [self.bins[i] + bin_width / 2 for i in indices]
        return series

    def find_min_max(self, string_of_integers):
        """Return (min, max) of a whitespace-separated string of integers."""
        int_list = [int(i) for i in string_of_integers.split()]
        return min(int_list), max(int_list)

    def call_openai(self, series, seasonality, h, n_forecasts):
        """Request `n_forecasts` sampled forecasts of `h` steps from ChatGPT.

        Returns an (n_usable, h) array of decoded values. Raises ValueError
        if no completion yields at least two tokens.
        """
        series_tokenized = self.tokenize_time_series(series)
        min_val, max_val = self.find_min_max(series_tokenized)
        prompt = f"""
        {self.prompt}-consider {seasonality} as seasonality
        - just print {h} steps ahead
        - values should be integers between {min_val} and {max_val}, please be sure to do this
        

        this is the series: {series_tokenized}
        """
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            n=n_forecasts
        )
        outputs = []
        for choice in response['choices']:
            output_gpt = choice['message']['content']
            # Discard degenerate completions (fewer than two tokens).
            if len(output_gpt.split()) < 2:
                continue
            output_gpt = self.extend_string(output_gpt, h)
            # Clamp every index into the valid bin range before decoding.
            output_gpt = ' '.join(f'{max(min(int(x), len(self.bins) - 1), 0)}' for x in output_gpt.split())
            outputs.append(self.decode_time_series(output_gpt))
        if not outputs:
            # np.vstack([]) would raise anyway; fail with a clearer message.
            raise ValueError("no usable forecasts returned by the OpenAI API")
        return np.vstack(outputs)

    def forward(self, series, seasonality, h, n_forecasts):
        """Point forecast: element-wise median across the sampled forecasts."""
        outputs = self.call_openai(series, seasonality, h, n_forecasts)
        return np.median(outputs, axis=0)

    def conformal_intervals(self, series, seasonality, h, n_forecasts):
        """Estimate 90% interval widths from held-out absolute errors.

        Holds out the last `h` observations, forecasts them, and takes the
        5th/95th percentiles of |error| across the sampled forecasts.
        """
        series_train, series_test = series[:-h], series[-h:]
        outputs = self.call_openai(series_train, seasonality, h, n_forecasts)
        errors = np.abs(outputs - series_test)
        lower_levels = np.quantile(errors, q=0.05, axis=0)
        # Bug fix: was q=0.095, which made the upper band of the advertised
        # 90% interval far too narrow; 0.95 matches the -hi-90 column.
        upper_levels = np.quantile(errors, q=0.95, axis=0)
        return lower_levels, upper_levels

    def compute_ds_future(self, ds, fh):
        """Build `fh` future timestamps (as strings) continuing `ds`.

        Returns (ds_future, freq) where freq is a pandas freq string when it
        can be inferred, otherwise the last observed timestamp delta.
        """
        ds_ = pd.to_datetime(ds)
        try:
            freq = pd.infer_freq(ds_)
        except Exception:  # infer_freq raises on short/irregular indexes
            freq = None
        if freq is not None:
            # Skip the first element: date_range includes the anchor itself.
            ds_future = pd.date_range(ds_[-1], periods=fh + 1, freq=freq)[1:]
        else:
            # Fall back to extrapolating the last observed spacing.
            freq = ds_[-1] - ds_[-2]
            ds_future = [ds_[-1] + (i + 1) * freq for i in range(fh)]
        ds_future = list(map(str, ds_future))
        return ds_future, freq

    def forecast(self, df, h, input_size, n_forecasts=10):
        """Forecast `h` steps for a (unique_id, ds, y) dataframe.

        Returns the StatsForecast plot of the history plus the Naive baseline
        and the ChatGPT point forecast with 90% conformal bands.
        """
        df = df.copy()
        scaler = MinMaxScaler()
        # Scale y into [0, 1] so values fall inside the tokenizer's bins.
        df['y'] = scaler.fit_transform(df[['y']])
        ds_future, freq = self.compute_ds_future(df['ds'].values, h)

        # NOTE(review): freq is hardcoded to 'D' here although compute_ds_future
        # infers it — looks like only the Naive values matter, not its ds; confirm.
        sf = StatsForecast(models=[Naive()], freq='D')
        fcst_df = sf.forecast(df=df, h=h)
        fcst_df['ds'] = ds_future
        fcst_df['ChatGPT_3.5_Turbo'] = self.forward(df['y'].values[-input_size:], freq, h, n_forecasts)[-h:]

        # Add 90% prediction intervals from held-out conformal errors.
        lower_levels, upper_levels = self.conformal_intervals(df['y'].values[-(input_size + h):], freq, h, n_forecasts)
        fcst_df['ChatGPT_3.5_Turbo-lo-90'] = fcst_df['ChatGPT_3.5_Turbo'] - lower_levels
        fcst_df['ChatGPT_3.5_Turbo-hi-90'] = fcst_df['ChatGPT_3.5_Turbo'] + upper_levels

        # Undo the scaling on every output column and on the history.
        for col in ['Naive', 'ChatGPT_3.5_Turbo', 'ChatGPT_3.5_Turbo-lo-90', 'ChatGPT_3.5_Turbo-hi-90']:
            fcst_df[col] = scaler.inverse_transform(fcst_df[[col]])
        df['y'] = scaler.inverse_transform(df[['y']])
        return sf.plot(df, fcst_df, max_insample_length=3 * h, level=[90])