<a href="https://colab.research.google.com/github/jsebdev/Stock_Predictor/blob/main/stock_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
from google.colab import drive
drive.mount('/content/drive')
project_path = '/content/drive/MyDrive/projects/Stock_Predicter'
%cd $project_path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/projects/Stock_Predicter


In [91]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas_datareader as web
import datetime as dt
import yfinance as yfin
import tensorflow as tf
import os
import re

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM


# Get Data

In [92]:
# Select a company for now
ticker = 'AAPL'

start = dt.datetime(2013,1,1)
end = dt.datetime(2023,4,5)

In [93]:
yfin.pdr_override()
data = web.data.get_data_yahoo(ticker, start, end)


[*********************100%***********************]  1 of 1 completed


# Preprocess_data

In [94]:
def create_remove_columns(data):
  # create jump column
  data = pd.DataFrame.copy(data)
  data['Jump'] = data['Open'] - data['Close'].shift(1)
  data['Jump'].fillna(0, inplace=True)
  # data = data.reindex(columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Jump'])
  data.insert(0,'Jump', data.pop('Jump'))
  return data

In [95]:
def normalize_data(data, scaler=None):
  the_data = pd.DataFrame.copy(data)
  # substract the open value to all columns but the first one and the last one which are "Jump" and "Volume"
  the_data.iloc[:, 1:-1] = the_data.iloc[:,1:-1] - the_data['Open'].values[:, np.newaxis]
  # print('the_data')
  # print(the_data)

  the_data.pop('Open')
  # todo save an csv with the values for the scaler
  if scaler is None:
    # Create the scaler
    values = np.abs(the_data.values)
    max_value = np.max(values[:,:-1])
    max_volume = np.max(values[:,-1])
    def scaler(d):
      data = pd.DataFrame.copy(d)
      print('max_value: ', max_value)
      print('max_volume: ', max_volume)
      data.iloc[:, :-1] = data.iloc[:,:-1].apply(lambda x: x/max_value)
      data.iloc[:, -1] = data.iloc[:,-1].apply(lambda x: x/max_volume)
      return data
    def decoder(values):
      decoded_values = values * max_value
      return decoded_values
  else:
    decoder = None
  
  normalized_data = scaler(the_data)

  return normalized_data, scaler, decoder




In [96]:
def create_training_data(norm_data):
  prediction_days = 500
  
  x_train_list = []
  y_train_list = []
  
  for i in range(prediction_days, len(norm_data)):
    x_train_list.append(norm_data[i-prediction_days:i])
    y_train_list.append(norm_data.iloc[i].values[0:4])
  
  x_train = np.array(x_train_list)
  y_train = np.array(y_train_list)
  return x_train, y_train

In [97]:
#Make all the preprocesing
def preprocessing(data, scaler=None):
  # print(data.head(3))
  data_0 = create_remove_columns(data)
  # print(data_0.head(3))
  #todo: save the_scaler somehow to use in new runtimes
  norm_data, scaler, decoder = normalize_data(data_0, scaler=scaler)
  # print(norm_data.head(3))
  x_train, y_train = create_training_data(norm_data)
  # print(x_train.shape, y_train.shape)
  return x_train, y_train, scaler, decoder

In [98]:
x_train, y_train, scaler, decoder = preprocessing(data)

max_value:  10.589996337890625
max_volume:  1460852400.0


In [99]:
print(x_train.shape)
x_train[1,499,:]

(2082, 500, 6)


array([ 0.00212456,  0.05712934, -0.00212456,  0.04461756, -0.22778379,
        0.09233239])

In [100]:
td = data.iloc[498:501]
# print('td:\n',td)
td0 = create_remove_columns(td)
print('td0:\n',td0)
print(decoder(y_train[0]))

td0:
                 Jump       Open       High        Low      Close  Adj Close  \
Date                                                                          
2014-12-23  0.000000  28.307501  28.332500  28.115000  28.135000  25.286961   
2014-12-24  0.010000  28.145000  28.177500  28.002501  28.002501  25.167873   
2014-12-26  0.022499  28.025000  28.629999  28.002501  28.497499  25.612770   

               Volume  
Date                   
2014-12-23  104113600  
2014-12-24   57918400  
2014-12-26  134884000  
[ 0.02249908  0.60499954 -0.02249908  0.47249985]


# Model

## Create Model

In [101]:
def create_model():
  model = Sequential()
  # model.add(LSTM(units=112, return_sequences=True, input_shape=(x_train.shape[1:])))
  model.add(LSTM(units=1000, return_sequences=True, input_shape=(None,x_train.shape[-1],)))
  model.add(Dropout(0.2))
  model.add(LSTM(units=1000, return_sequences=True))
  model.add(Dropout(0.2))
  model.add(LSTM(units=1000))
  model.add(Dropout(0.2))
  model.add(Dense(units=4))
  return model

model = create_model()
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, None, 1000)        4028000   
                                                                 
 dropout_6 (Dropout)         (None, None, 1000)        0         
                                                                 
 lstm_7 (LSTM)               (None, None, 1000)        8004000   
                                                                 
 dropout_7 (Dropout)         (None, None, 1000)        0         
                                                                 
 lstm_8 (LSTM)               (None, 1000)              8004000   
                                                                 
 dropout_8 (Dropout)         (None, 1000)              0         
                                                                 
 dense_2 (Dense)             (None, 4)                

In [102]:
model.compile(optimizer='adam', loss='mean_squared_error')

## Model Train

In [103]:
print(x_train.shape)
print(y_train.shape)

(2082, 500, 6)
(2082, 4)


In [104]:
# Change to False to avoid trainging the model
# if False:
if True:
  # Directory where the checkpoints will be saved
  checkpoint_dir = './training_checkpoints_'+dt.datetime.now().strftime("%Y%m%d%H%M%S")
  # Name of the checkpoint files
  checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_epoch{epoch}_loss{loss}")
  
  checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
      filepath=checkpoint_prefix,
      save_weights_only=True,
      monitor="loss", mode="min",
      save_best_only=True)
  model.fit(x_train, y_train, epochs=25, batch_size=32, callbacks=[checkpoint_callback])


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25

KeyboardInterrupt: ignored

# Testing a model

In [None]:
#print trainings directories to pick one
!ls -ld training_checkpoints_*/

In [105]:
test_model = create_model()

In [107]:
# if checkpoint_dir does not exists, select the one stated in the except block
try:
  checkpoint_dir
except NameError: 
  checkpoint_dir = './training_checkpoints_20230406214431'

print(checkpoint_dir)

def load_weights(epoch=None):
  if epoch is None:
    weights_file = tf.train.latest_checkpoint(checkpoint_dir)
  else:
    with os.scandir(checkpoint_dir) as entries:
      for entry in entries:
        if re.search(f'^ckpt_epoch{epoch}_.*\.index', entry.name):
          weights_file = checkpoint_dir + '/'+ entry.name[:-6]

  print(weights_file)
  test_model.load_weights(weights_file)
  return test_model

test_model = load_weights()

./training_checkpoints_20230406230143
./training_checkpoints_20230406230143/ckpt_epoch16_loss0.01097947172820568


In [114]:
test_start = dt.datetime(2013,1,1)
end = dt.datetime(2023,4,5)

yfin.pdr_override()
test_data = web.data.get_data_yahoo(ticker, test_start, test_end)

[*********************100%***********************]  1 of 1 completed


In [115]:
# def close_tester(model, test_data, scaler=None):
model = test_model
scaler = scaler
test_x_train, test_y_train, _, _ = preprocessing(data, scaler=scaler)
print(test_x_train.shape)
print(test_y_train.shape)
results = model.predict(test_x_train)
# the results are tensors of 4 numbers, Jump, High, Low, and Close respectively

# close_tester(test_model, test_data, scaler=the_scaler)


max_value:  10.589996337890625
max_volume:  1460852400.0
(2082, 500, 6)
(2082, 4)


In [120]:
right_counter = 0
wrong_counter = 0
no_action_counter = 0
# for result, expected in zip(results[:2], test_y_train[:2]):
for result, expected in zip(results[:], test_y_train[:]):
  # print(result)
  # print(expected)
  comparer = result[3] * expected[3]
  if comparer > 0:
    right_counter += 1
  elif comparer == 0:
    no_action_counter
  elif comparer < 0:
    wrong_counter += 1

  # print('expected: ', decoder(expected))
  # print('result: ', decoder(result))

print('right_counter :', right_counter)
print('no_action_counter :',no_action_counter)
print('wrong_counter :', wrong_counter)
print('success rate: {}%'.format(right_counter*100/len(results)))

right_counter : 1118
no_action_counter : 0
wrong_counter : 959
success rate: 53.6983669548511%


In [123]:
test_data.iloc[500,:]

Open         2.802500e+01
High         2.863000e+01
Low          2.800250e+01
Close        2.849750e+01
Adj Close    2.561277e+01
Volume       1.348840e+08
Name: 2014-12-26 00:00:00, dtype: float64