pmthangk09 committed
Commit a6f70d0 · 2 Parent(s): ee56cf8 f7ed643

Merge branch 'main' of https://huggingface.co/spaces/ATB/AI-trade-bot-demo

Files changed (4):
  1. rl_agent/env.py +28 -29
  2. rl_agent/policy.py +6 -6
  3. rl_agent/test_env.py +127 -0
  4. rl_agent/utils.py +35 -0
rl_agent/env.py CHANGED
@@ -1,53 +1,47 @@
import numpy as np
import pandas as pd
+import torch

class Environment:

-    def __init__(self, data, history_t=90):
+    def __init__(self, data, history_t=8, history=[0.1, 0.2, -0.1, -0.2, 0., 0.5, 0.9], state_size=9):
        self.data = data
+        self.history = history
        self.history_t = history_t
+        self.state_size = state_size
+        self.cost_rate = 0.0001
        self.reset()

    def reset(self):
        self.t = 0
        self.done = False
        self.profits = 0
-        self.positions = []
-        self.position_value = 0
-        self.history = [0 for _ in range(self.history_t)]
+        self.position_value = 0.
+        self.history = self.history[:self.state_size - 1]
        return [self.position_value] + self.history # obs

    def step(self, act):
        reward = 0

-        # act = 0: stay, 1: buy, -1: sell
-        if act == 1:
-            self.positions.append(self.data.iloc[self.t, :]['Close'])
-        elif act == 2: # sell
-            if len(self.positions) == 0:
-                reward = -1
-            else:
-                profits = 0
-                for p in self.positions:
-                    profits += (self.data.iloc[self.t, :]['Close'] - p)
-                reward += profits
-                self.profits += profits
-                self.positions = []
+        # act = 0: stay, act > 0: buy, act < 0: sell
+        # Additive profits
+        cost_amount = np.abs(act - self.position_value)

+        Zt = self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close']
+        reward = (self.position_value * Zt) - (self.cost_rate * cost_amount)
+        self.profit = self.position_value * Zt
+        self.profits += self.profit
+
        # set next time
        self.t += 1
-        self.position_value = 0
-        for p in self.positions:
-            self.position_value += (self.data.iloc[self.t, :]['Close'] - p)
+        self.position_value = act
+
        self.history.pop(0)
-        self.history.append(self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close'])

-        # clipping reward
-        if reward > 0:
-            reward = 1
-        elif reward < 0:
-            reward = -1
+        self.history.append(self.data.iloc[self.t, :]['Close'] - self.data.iloc[(self.t-1), :]['Close']) # the price being traded

+        self.position_value = self.position_value.item()
+
        return [self.position_value] + self.history, reward, self.done # obs, reward, done

@@ -64,9 +58,14 @@ if __name__ == "__main__":
    test = data[date_split:]
    print(train.head(10))

-    env = Environment(train)
+    history = []
+    for i in range(1, 9):
+        c = train.iloc[i, :]['Close'] - train.iloc[i-1, :]['Close']
+        history.append(c)
+
+    env = Environment(train, history=history)
    print(env.reset())
-    for _ in range(3):
+    for _ in range(9, 12):
        pact = np.random.randint(3)
-        print(env.step(pact))
+        print(env.step(pact)[1])
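
Note: the new step() drops the discrete buy/sell bookkeeping and pays an additive-profit style reward: the previously held position times the one-step close-to-close change Zt, minus a cost proportional to how much the position moves. A minimal standalone sketch of that reward (function and variable names here are illustrative, not part of the diff; cost_rate mirrors Environment.cost_rate):

import numpy as np

def additive_profit_reward(prev_position, new_position, close_t, close_t_minus_1, cost_rate=0.0001):
    # Reward as computed in the new Environment.step(): profit on the position
    # held over the bar, minus a turnover cost on the change of position.
    z_t = close_t - close_t_minus_1                          # one-step price change Zt
    cost = cost_rate * np.abs(new_position - prev_position)  # turnover penalty
    return prev_position * z_t - cost

# e.g. holding a 0.5 position through a 0.0008 move, then shifting to 0.8:
# 0.5 * 0.0008 - 0.0001 * 0.3 ≈ 0.00037
print(additive_profit_reward(0.5, 0.8, 1.0720, 1.0712))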
rl_agent/policy.py CHANGED
@@ -8,19 +8,19 @@ class Policy(nn.Module):

        super(Policy, self).__init__()

-        self.layer1 = nn.Linear(input_channels, 2 * input_channels)
+        self.layer1 = nn.Linear(input_channels, 1)
        self.tanh1 = nn.Tanh()
-        self.layer2 = nn.linear(2 * input_channels, 1)
-        self.tanh2 = nn.Tanh()
+        # self.layer2 = nn.Linear(2 * input_channels, 1)
+        # self.tanh2 = nn.Tanh()

    def forward(self, state):

        hidden = self.layer1(state)
        hidden = self.tanh1(hidden)
-        hidden = self.layer2(hidden)
-        action = self.tanh2(hidden)
+        # hidden = self.layer2(hidden)
+        # action = self.tanh2(hidden)

-        return action
+        return hidden
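
Note: with the second layer commented out, the policy reduces to a single nn.Linear(input_channels, 1) followed by Tanh, i.e. a linear trader whose scalar output in (-1, 1) is used directly as the position. A minimal sketch of the reduced network (the class name and the 15-dimensional state below are illustrative; the real constructor is Policy(input_channels=...) as in the diff):

import torch
import torch.nn as nn

class LinearTrader(nn.Module):                      # sketch of the reduced Policy
    def __init__(self, input_channels):
        super().__init__()
        self.layer1 = nn.Linear(input_channels, 1)  # one linear layer
        self.tanh1 = nn.Tanh()                      # squash output to (-1, 1)

    def forward(self, state):
        return self.tanh1(self.layer1(state))       # scalar position signal

state = torch.zeros(15)                 # [position_value] + 14 recent price changes
action = LinearTrader(15)(state)
print(action.shape, action.item())      # torch.Size([1]), value in (-1, 1)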
rl_agent/test_env.py ADDED
@@ -0,0 +1,127 @@
+from env import Environment
+from policy import Policy
+from utils import myOptimizer
+
+import pandas as pd
+import numpy as np
+import torch
+from collections import OrderedDict
+
+import matplotlib.pyplot as plt
+
+from tqdm import tqdm
+from torch.utils.tensorboard import SummaryWriter
+
+if __name__ == "__main__":
+    writer = SummaryWriter('runs/new_data_ex_7')
+
+    # data = pd.read_csv('./data/EURUSD_Candlestick_1_M_BID_01.01.2021-04.02.2023.csv')
+    data = pd.read_csv('./data/EURUSD_Candlestick_30_M_BID_01.01.2021-04.02.2023.csv')
+    # data['Local time'] = pd.to_datetime(data['Local time'])
+    data = data.set_index('Local time')
+    print(data.index.min(), data.index.max())
+
+    # date_split = '19.09.2022 17:55:00.000 GMT-0500'
+    # date_split = '25.08.2022 04:30:00.000 GMT-0500' # 30 min
+
+    date_split = '03.02.2023 15:30:00.000 GMT-0600' # 30 min
+
+    train = data[:date_split]
+    test = data[date_split:]
+
+    learning_rate = 0.001
+    first_momentum = 0.0
+    second_momentum = 0.0001
+    transaction_cost = 0.0001
+    adaptation_rate = 0.01
+    state_size = 15
+    equity = 1.0
+
+    agent = Policy(input_channels=state_size)
+    optimizer = myOptimizer(learning_rate, first_momentum, second_momentum, adaptation_rate, transaction_cost)
+
+    history = []
+    for i in range(1, state_size):
+        c = train.iloc[i, :]['Close'] - train.iloc[i-1, :]['Close']
+        history.append(c)
+
+    env = Environment(train, history=history, state_size=state_size)
+    observation = env.reset()
+
+    model_gradients_history = dict()
+    checkpoint = OrderedDict()
+
+    for name, param in agent.named_parameters():
+        model_gradients_history.update({name: torch.zeros_like(param)})
+
+    for i in tqdm(range(state_size, len(train))):
+        observation = torch.as_tensor(observation).float()
+        action = agent(observation)
+        observation, reward, _ = env.step(action.data.to("cpu").numpy())
+
+        action.backward()
+
+        for name, param in agent.named_parameters():
+            grad_n = param.grad
+            param = param + optimizer.step(grad_n, reward, observation[-1], model_gradients_history[name])
+            checkpoint[name] = param
+            model_gradients_history.update({name: grad_n})
+
+        if i > 10000:
+            equity += env.profit
+            writer.add_scalar('equity', equity, i)
+        else:
+            writer.add_scalar('equity', 1.0, i)
+
+        optimizer.after_step(reward)
+        agent.load_state_dict(checkpoint)
+
+    ###########
+    ###########
+
+    # history = []
+    # for i in range(1, state_size):
+    #     c = test.iloc[i, :]['Close'] - test.iloc[i-1, :]['Close']
+    #     history.append(c)
+
+    # env = Environment(test, history=history, state_size=state_size)
+    # observation = env.reset()
+
+    # model_gradients_history = dict()
+    # checkpoint = OrderedDict()
+
+    # for name, param in agent.named_parameters():
+    #     model_gradients_history.update({name: torch.zeros_like(param)})
+
+    # for _ in tqdm(range(state_size, len(test))):
+    #     observation = torch.as_tensor(observation).float()
+    #     action = agent(observation)
+    #     observation, reward, _ = env.step(action.data.numpy())
+
+    #     action.backward()
+
+    #     for name, param in agent.named_parameters():
+    #         grad_n = param.grad
+    #         param = param + optimizer.step(grad_n, reward, observation[-1], model_gradients_history[name])
+    #         checkpoint[name] = param
+    #         model_gradients_history.update({name: grad_n})
+
+    #     optimizer.after_step(reward)
+    #     agent.load_state_dict(checkpoint)
+
+    print(env.profits)
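
Note: this loop never instantiates a torch.optim optimizer. action.backward() fills each param.grad with the gradient of the position with respect to the weights, myOptimizer.step() converts that (together with the reward and the gradient from the previous step) into a weight delta, and the nudged tensors are written back with load_state_dict. A compressed, hedged restatement of one update (helper name and argument order are illustrative, not a drop-in replacement for the loop above):

import torch
from collections import OrderedDict

def direct_rl_update(agent, optimizer, action, reward, last_price_change, grad_history):
    # One parameter update in the style of the training loop above.
    action.backward()                                    # d(position)/d(theta) into .grad
    new_state = OrderedDict()
    for name, param in agent.named_parameters():
        grad_n = param.grad
        delta = optimizer.step(grad_n, reward, last_price_change, grad_history[name])
        new_state[name] = param.detach() + delta         # manual weight nudge
        grad_history[name] = grad_n                      # remember gradient for next step
    optimizer.after_step(reward)                         # update running reward moments
    agent.load_state_dict(new_state)                     # write weights back into the model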
rl_agent/utils.py ADDED
@@ -0,0 +1,35 @@
+import numpy as np
+import torch
+
+class myOptimizer():
+
+    def __init__(self, lr, mu, mu_square, adaptation_rate, transaction_cost):
+        self.lr = lr
+        self.mu = mu
+        self.mu_square = mu_square
+        self.adaptation_rate = adaptation_rate
+        self.transaction_cost = transaction_cost
+
+    def step(self, grad_n, reward, last_observation, last_gradient):
+
+        numerator = self.mu_square - (self.mu * reward)
+        denominator = np.sqrt((self.mu_square - (self.mu ** 2)) ** 3)
+
+        gradient = numerator / denominator
+
+        current_grad = (-1.0 * self.transaction_cost * grad_n)
+
+        previous_grad = (last_observation + self.transaction_cost) * last_gradient
+
+        gradient = torch.as_tensor(gradient) * (current_grad + previous_grad)
+
+        return torch.as_tensor(self.lr * gradient)
+
+    def after_step(self, reward):
+
+        self.mu = self.mu + self.adaptation_rate * (reward - self.mu)
+        self.mu_square = self.mu_square + self.adaptation_rate * ((reward ** 2) - self.mu_square)
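
Note: myOptimizer looks like a hand-rolled differential Sharpe ratio update of the kind used in direct-reinforcement trading (Moody and Saffell style): mu and mu_square are exponential moving estimates of the first and second moments of the reward, and step() multiplies the Sharpe-ratio derivative by a chain-rule term for a reward of the form R_t = F_{t-1} Z_t - δ|F_t - F_{t-1}| (the code drops the sign of the position change in the cost term). The correspondence below is an interpretation of the code, not something stated in the commit, with A ≙ mu, B ≙ mu_square, η ≙ adaptation_rate, δ ≙ transaction_cost, Z_t ≙ last_observation:

\frac{dS_t}{dR_t} \approx \frac{B_{t-1} - A_{t-1} R_t}{\left(B_{t-1} - A_{t-1}^{2}\right)^{3/2}},
\qquad
\frac{dR_t}{d\theta} \approx -\delta\,\frac{dF_t}{d\theta} + (Z_t + \delta)\,\frac{dF_{t-1}}{d\theta},
\qquad
\Delta\theta = \mathrm{lr}\,\frac{dS_t}{dR_t}\,\frac{dR_t}{d\theta}

after_step() then advances the running moments:

A_t = A_{t-1} + \eta\,(R_t - A_{t-1}),
\qquad
B_t = B_{t-1} + \eta\,(R_t^{2} - B_{t-1})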