File size: 4,494 Bytes
dc2106c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Copyright (c) ONNX Project Contributors
#
# SPDX-License-Identifier: Apache-2.0

import numpy as np

import onnx
from onnx.backend.test.case.base import Base
from onnx.backend.test.case.node import expect
from onnx.defs import AI_ONNX_PREVIEW_TRAINING_DOMAIN


def apply_adam(r, t, x, g, v, h, norm_coefficient, norm_coefficient_post, alpha, beta, epsilon):  # type: ignore
    """Compute one Adam update step with NumPy.

    Used as the reference computation for the expected outputs of the
    test cases in this file.

    Args:
        r: Learning rate.
        t: Update count; bias correction is applied only when ``t > 0``.
        x: Tensor being optimized.
        g: Gradient of ``x``.
        v: Running average of the gradient (first-order momentum).
        h: Running average of the squared gradient (second-order momentum).
        norm_coefficient: Coefficient of the regularization term whose
            gradient (``norm_coefficient * x``) is added to ``g``.
        norm_coefficient_post: Coefficient of the shrinkage applied to the
            updated tensor as ``(1 - norm_coefficient_post) * x_new``.
        alpha: Decay factor of the first-order momentum.
        beta: Decay factor of the second-order momentum.
        epsilon: Constant added to the square root of the second-order
            momentum before dividing.

    Returns:
        A tuple ``(x_final, v_new, h_new)`` holding the updated tensor and
        the two updated momentums.
    """
    # Gradient including the contribution of the regularization term.
    grad = norm_coefficient * x + g

    # Moving averages of the gradient and of its element-wise square.
    first_moment = alpha * v + (1 - alpha) * grad
    second_moment = beta * h + (1 - beta) * (grad * grad)

    # Denominator of the update: element-wise square root plus epsilon.
    denominator = np.sqrt(second_moment) + epsilon

    # Bias-correct the learning rate only when t is positive; otherwise use
    # the learning rate unchanged.
    step_size = r * np.sqrt(1 - beta**t) / (1 - alpha**t) if t > 0 else r

    # Adam update rule, followed by the optional post-update shrinkage.
    updated = x - step_size * (first_moment / denominator)
    return (1 - norm_coefficient_post) * updated, first_moment, second_moment


class Adam(Base):
    """Node test cases for the ``Adam`` operator of the preview training domain."""

    @staticmethod
    def export_adam() -> None:
        """Emit the ``test_adam`` case: one Adam step on a single tensor."""
        # Define operator attributes.
        norm_coefficient = 0.001
        alpha = 0.95
        beta = 0.1
        epsilon = 1e-7

        # Create operator.
        node = onnx.helper.make_node(
            "Adam",
            inputs=["R", "T", "X", "G", "V", "H"],
            outputs=["X_new", "V_new", "H_new"],
            norm_coefficient=norm_coefficient,
            alpha=alpha,
            beta=beta,
            epsilon=epsilon,
            domain=AI_ONNX_PREVIEW_TRAINING_DOMAIN,
        )

        # Define operator inputs.
        r = np.array(0.1, dtype=np.float32)  # scalar
        t = np.array(0, dtype=np.int64)  # scalar
        x = np.array([1.2, 2.8], dtype=np.float32)
        g = np.array([-0.94, -2.5], dtype=np.float32)
        v = np.array([1.7, 3.6], dtype=np.float32)
        h = np.array([0.1, 0.1], dtype=np.float32)

        # Compute expected outputs of Adam.
        # norm_coefficient_post is passed as 0.0 because the node does not
        # set that attribute.
        x_new, v_new, h_new = apply_adam(
            r, t, x, g, v, h, norm_coefficient, 0.0, alpha, beta, epsilon
        )

        # Check results.
        expect(
            node,
            inputs=[r, t, x, g, v, h],
            outputs=[x_new, v_new, h_new],
            name="test_adam",
            opset_imports=[
                onnx.helper.make_opsetid(AI_ONNX_PREVIEW_TRAINING_DOMAIN, 1)
            ],
        )

    @staticmethod
    def export_adam_multiple() -> None:
        """Emit the ``test_adam_multiple`` case: one Adam step on two tensors.

        Inputs and outputs are grouped by role, not by tensor: both X
        tensors, then both gradients, then both first-order momentums, then
        both second-order momentums.
        """
        # Define operator attributes.
        norm_coefficient = 0.001
        alpha = 0.95
        beta = 0.85
        epsilon = 1e-2

        # Create operator.
        # epsilon must be set on the node: the expected outputs below are
        # computed with this value, and without the attribute the operator
        # would use its schema default instead.
        node = onnx.helper.make_node(
            "Adam",
            inputs=["R", "T", "X1", "X2", "G1", "G2", "V1", "V2", "H1", "H2"],
            outputs=["X1_new", "X2_new", "V1_new", "V2_new", "H1_new", "H2_new"],
            norm_coefficient=norm_coefficient,
            alpha=alpha,
            beta=beta,
            epsilon=epsilon,
            domain=AI_ONNX_PREVIEW_TRAINING_DOMAIN,
        )

        # Define operator inputs.
        r = np.array(0.1, dtype=np.float32)  # scalar
        t = np.array(0, dtype=np.int64)  # scalar

        x1 = np.array([1.0], dtype=np.float32)
        g1 = np.array([-1.0], dtype=np.float32)
        v1 = np.array([2.0], dtype=np.float32)
        h1 = np.array([0.5], dtype=np.float32)

        x2 = np.array([1.0, 2.0], dtype=np.float32)
        g2 = np.array([-1.0, -3.0], dtype=np.float32)
        v2 = np.array([4.0, 1.0], dtype=np.float32)
        h2 = np.array([1.0, 10.0], dtype=np.float32)

        # Compute expected outputs of Adam.
        # Each tensor is updated independently with the shared R and T.
        x1_new, v1_new, h1_new = apply_adam(
            r, t, x1, g1, v1, h1, norm_coefficient, 0.0, alpha, beta, epsilon
        )
        x2_new, v2_new, h2_new = apply_adam(
            r, t, x2, g2, v2, h2, norm_coefficient, 0.0, alpha, beta, epsilon
        )

        # Check results.
        expect(
            node,
            inputs=[r, t, x1, x2, g1, g2, v1, v2, h1, h2],
            outputs=[x1_new, x2_new, v1_new, v2_new, h1_new, h2_new],
            name="test_adam_multiple",
            opset_imports=[
                onnx.helper.make_opsetid(AI_ONNX_PREVIEW_TRAINING_DOMAIN, 1)
            ],
        )