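# Experiment: forecasting a univariate time series with gpt-3.5-turbo.
# The series is min-max scaled to [0, 1], discretized into bin indices,
# sent to the chat API as a plain-text prompt, and the returned indices
# are decoded back into values. A Naive forecast from statsforecast is
# used as a baseline, and both are plotted against the historical data.
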
import os
import re

import numpy as np
import openai
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from statsforecast import StatsForecast
from statsforecast.models import Naive

openai.api_key = os.environ['OPENAI_API_KEY']


class ChatGPTForecast:

    def __init__(self):
        self.bins = np.linspace(0, 1, num=10_000)  # 10,000 bin edges between 0 and 1 (the series is min-max scaled first)
        self.mapping = {i: f"{i}" for i in range(len(self.bins))}
        self.prompt = f"""
        forecast this series, 
        (i know that you prefer using specific tools, but i'm testing something, 
        just give me your predicted numbers please, just print the numbers i dont need an explanation)

        please consider:
        - give the output with the same structure: "number1 number2 number3"
        - give more weight to the most recent observations
        - consider trend
        - consider seasonality
        - values should lie between 0 and {len(self.bins) - 1}, please be sure to do this
        """

    def tokenize_time_series(self, series):
        indices = np.digitize(series, self.bins) - 1  # Find which bin each data point falls into
        return ' '.join(self.mapping[i] for i in indices)

    def clean_string(self, s):
        pattern = r'(\d+)[^\s]*'
        # Keep the leading digits of each token and join them with single spaces
        cleaned = ' '.join(re.findall(pattern, s))
        return cleaned

    def extend_string(self, s, h):
        # Extract the numeric tokens from the model output
        bin_numbers = re.findall(r'\d+', s)
        current_length = len(bin_numbers)
        if current_length == 0:
            raise ValueError('ChatGPT returned no numeric tokens to decode')
        # If the output already has exactly h tokens, return them cleaned and joined
        if current_length == h:
            return ' '.join(bin_numbers)
        # If it has more than h tokens, keep only the first h
        elif current_length > h:
            return ' '.join(bin_numbers[:h])
        else:
            # Otherwise repeat the sequence (plus a partial repeat) until there are h tokens
            repeats = h // current_length
            extra = h % current_length
            return ' '.join(bin_numbers * repeats + bin_numbers[:extra])

    def clean_gpt_output(self, output):
        # Remove spaces around underscores
        cleaned_output = output.replace(" _", "_").replace("_ ", "_")
        # Drop a trailing underscore, if any
        if cleaned_output.endswith("_"):
            cleaned_output = cleaned_output[:-1]
        return self.clean_string(cleaned_output)

    def decode_time_series(self, tokens):
        # Map each token back to a bin index
        indices = [int(token) for token in tokens.split()]
        # Convert bin indices back to values, using the centre of each bin
        bin_width = self.bins[1] - self.bins[0]
        series = [self.bins[i] + bin_width / 2 for i in indices]
        return series
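
    # Round-trip sketch (values assume the 10,000 bins defined in __init__):
    #   tokenize_time_series([0.0, 0.5, 1.0])  -> "0 4999 9999"
    #   decode_time_series("0 4999 9999")      -> roughly [0.0, 0.5, 1.0],
    #   each value shifted by half a bin width (about 5e-5 here).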

    def forward(self, series, seasonality, h):
        series_tokenized = self.tokenize_time_series(series)
        prompt = f"""
        {self.prompt}
        - consider {seasonality} as seasonality
        - just print {h} steps ahead

        this is the series: {series_tokenized}
        """
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        output_gpt = response['choices'][0]['message']['content']
        output_gpt = self.extend_string(output_gpt, h)
        output_gpt = ' '.join(f'{max(min(int(x), len(self.bins) - 1), 0)}' for x in output_gpt.split())
        return self.decode_time_series(output_gpt)

    def compute_ds_future(self, ds, fh):
        ds_ = pd.to_datetime(ds)
        try:
            freq = pd.infer_freq(ds_)
        except (TypeError, ValueError):
            # infer_freq needs at least three regularly spaced timestamps
            freq = None
        if freq is not None:
            ds_future = pd.date_range(ds_[-1], periods=fh + 1, freq=freq)[1:]
        else:
            freq = ds_[-1] - ds_[-2]
            ds_future = [ds_[-1] + (i + 1) * freq for i in range(fh)]
        ds_future = list(map(str, ds_future))
        return ds_future, freq

    def forecast(self, df, h, input_size):
        df = df.copy()
        scaler = MinMaxScaler()
        df['y'] = scaler.fit_transform(df[['y']])
        ds_future, freq = self.compute_ds_future(df['ds'].values, h)
        
        # Baseline Naive forecast (note: the statsforecast frequency is hard-coded to daily data)
        sf = StatsForecast(models=[Naive()], freq='D')
        fcst_df = sf.forecast(df=df, h=h)
        fcst_df['ds'] = ds_future
        fcst_df['ChatGPT-3.5-Turbo'] = self.forward(df['y'].values[-input_size:], freq, h)[-h:]

        for col in ['Naive', 'ChatGPT-3.5-Turbo']:
            fcst_df[col] = scaler.inverse_transform(fcst_df[[col]])
        df['y'] = scaler.inverse_transform(df[['y']])
        return sf.plot(df, fcst_df, max_insample_length=3 * h)
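

if __name__ == "__main__":
    # Minimal usage sketch. The synthetic weekly-seasonal daily series below is
    # made up for illustration; any DataFrame with the statsforecast layout
    # ('unique_id', 'ds', 'y') should work the same way. Running it issues one
    # OpenAI API call and requires OPENAI_API_KEY to be set.
    rng = np.random.default_rng(seed=0)
    ds = pd.date_range("2020-01-01", periods=120, freq="D")
    y = 10 + np.sin(np.arange(120) * 2 * np.pi / 7) + rng.normal(scale=0.1, size=120)
    train_df = pd.DataFrame({"unique_id": "series_1", "ds": ds, "y": y})

    model = ChatGPTForecast()
    # Forecast 14 days ahead, showing ChatGPT the last 60 scaled observations.
    plot = model.forecast(train_df, h=14, input_size=60)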