# Source: Hugging Face file viewer (scrape residue preserved as comments)
# azulgarza's picture
# fix: mv utils module
# e676cab
# raw | history | blame
# 5.17 kB
import os
import re
import numpy as np
import openai
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from statsforecast import StatsForecast
from statsforecast.models import Naive
# Configure the OpenAI client at import time; raises KeyError immediately
# if OPENAI_API_KEY is not set in the environment (fail-fast by design).
openai.api_key = os.environ['OPENAI_API_KEY']
class ChatGPTForecast:
    """Zero-shot forecaster that asks gpt-3.5-turbo to extrapolate a series.

    Pipeline: min-max scale the series into [0, 1], quantize it into integer
    bin-index tokens, send the tokens to the chat model as plain text, then
    parse the reply and de-quantize it back into values.
    """

    def __init__(self):
        # 10,000 evenly spaced bin edges over [0, 1]; inputs must already be
        # scaled into this range before tokenization (see `forecast`).
        # (The previous comment claiming "1000 bins between -10 and 10" was
        # wrong on both counts.)
        self.bins = np.linspace(0, 1, num=10_000)
        # Identity mapping: bin index -> token string sent to the model.
        self.mapping = {i: f"{i}" for i in range(len(self.bins))}
        self.prompt = f"""
forecast this series,
(i know that you prefer using specific tools, but i'm testing something,
just give me your predicted numbers please, just print the numbers i dont need an explanation)
please consider:
- give the output with the same structure: "number1 number2 number3"
- give more weight to the most recent observations
- consider trend
- consider seasonality
- values should lie between 0 and {len(self.bins) - 1}, please be sure to do this
"""

    def tokenize_time_series(self, series):
        """Quantize `series` (values expected in [0, 1]) into a
        space-separated string of bin-index tokens."""
        # np.digitize returns 1-based bin positions; shift to 0-based indices.
        indices = np.digitize(series, self.bins) - 1
        return ' '.join(self.mapping[i] for i in indices)

    def clean_string(self, s):
        """Keep only the leading digit run of each non-space chunk in `s`,
        joined by single spaces (e.g. '12abc 34,' -> '12 34')."""
        pattern = r'(\d+)[^\s]*'
        return ' '.join(re.findall(pattern, s))

    def extend_string(self, s, h):
        """Return exactly `h` space-separated numeric tokens drawn from `s`,
        trimming the sequence or repeating it cyclically as needed.

        Raises ValueError if `s` contains no digit tokens.
        """
        bin_numbers = re.findall(r'\d+', s)
        current_length = len(bin_numbers)
        if current_length == 0:
            # Guard: an all-text model reply previously surfaced as an
            # opaque ZeroDivisionError from the modulo below.
            raise ValueError('no numeric tokens found in model output')
        if current_length == h:
            # Fix: return the joined digit tokens, not the raw string —
            # the raw string may contain non-digit chatter that would make
            # int() in `forward` blow up.
            return ' '.join(bin_numbers)
        if current_length > h:
            return ' '.join(bin_numbers[:h])
        # Too short: repeat the sequence whole, then top up with a partial cycle.
        repeats = h // current_length
        extra = h % current_length
        return ' '.join(bin_numbers * repeats + bin_numbers[:extra])

    def clean_gpt_output(self, output):
        """Normalize underscore-glued model output, drop a trailing
        underscore, then strip non-digit residue via `clean_string`."""
        cleaned_output = output.replace(" _", "_").replace("_ ", "_")
        if cleaned_output.endswith("_"):
            cleaned_output = cleaned_output[:-1]
        return self.clean_string(cleaned_output)

    def decode_time_series(self, tokens):
        """Map a space-separated string of bin indices back to values,
        using each bin's midpoint (index value + half a bin width)."""
        indices = [int(token) for token in tokens.split()]
        bin_width = self.bins[1] - self.bins[0]
        return [self.bins[i] + bin_width / 2 for i in indices]

    def forward(self, series, seasonality, h):
        """Ask the chat model for `h` steps ahead of the tokenized `series`
        and return the decoded (still scaled) predictions as a list."""
        series_tokenized = self.tokenize_time_series(series)
        prompt = f"""
{self.prompt}-consider {seasonality} as seasonality
- just print {h} steps ahead
this is the series: {series_tokenized}
"""
        # NOTE(review): legacy pre-1.0 openai SDK call; migrate to
        # `client.chat.completions.create` when the dependency is upgraded.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
        )
        output_gpt = response['choices'][0]['message']['content']
        # Force the reply to exactly h tokens, then clamp every predicted
        # index into the valid bin range [0, len(bins) - 1].
        output_gpt = self.extend_string(output_gpt, h)
        output_gpt = ' '.join(
            f'{max(min(int(x), len(self.bins) - 1), 0)}' for x in output_gpt.split()
        )
        return self.decode_time_series(output_gpt)

    def compute_ds_future(self, ds, fh):
        """Build `fh` future timestamps (as strings) following `ds`.

        Returns (ds_future, freq): freq is a pandas offset alias when it can
        be inferred, otherwise the Timedelta between the last two points.
        """
        ds_ = pd.to_datetime(ds)
        try:
            freq = pd.infer_freq(ds_)
        except Exception:  # infer_freq raises on very short input
            freq = None
        if freq is not None:
            # date_range includes the anchor point; drop it with [1:].
            ds_future = pd.date_range(ds_[-1], periods=fh + 1, freq=freq)[1:]
        else:
            # Fall back to the spacing of the last two observations.
            freq = ds_[-1] - ds_[-2]
            ds_future = [ds_[-1] + (i + 1) * freq for i in range(fh)]
        ds_future = list(map(str, ds_future))
        return ds_future, freq

    def forecast(self, df, h, input_size):
        """Forecast `h` steps of `df` (StatsForecast layout: unique_id/ds/y)
        with a Naive baseline and the ChatGPT model, and return the plot.

        Only the last `input_size` scaled observations are sent to the model.
        """
        df = df.copy()
        scaler = MinMaxScaler()
        df['y'] = scaler.fit_transform(df[['y']])
        ds_future, freq = self.compute_ds_future(df['ds'].values, h)
        # NOTE(review): freq is hard-coded to 'D' even though the true
        # frequency was just inferred; the Naive dates are overwritten with
        # ds_future below, so this only matters if StatsForecast itself
        # needs the real frequency — confirm before changing.
        sf = StatsForecast(models=[Naive()], freq='D')
        fcst_df = sf.forecast(df=df, h=h)
        fcst_df['ds'] = ds_future
        fcst_df['ChatGPT-3.5-Turbo'] = self.forward(df['y'].values[-input_size:], freq, h)[-h:]
        # Undo the min-max scaling on both forecasts and the history.
        for col in ['Naive', 'ChatGPT-3.5-Turbo']:
            fcst_df[col] = scaler.inverse_transform(fcst_df[[col]])
        df['y'] = scaler.inverse_transform(df[['y']])
        return sf.plot(df, fcst_df, max_insample_length=3 * h)