File size: 3,429 Bytes
64e5a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ecfb5c
 
 
 
64e5a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gradio as gr


def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
    """Return the itemised FLOP terms for one sequence of ``n_ctx`` tokens.

    The result is an 8-tuple ordered as: embeddings, attention QKV
    projections, attention logits, attention softmax, attention reduction,
    attention output projection, feed-forward, and final logits. The six
    middle entries are per-layer costs multiplied by ``n_layer``; the first
    and last entries are counted once.

    Args:
        n_layer: Number of layers.
        d_model: Model width.
        d_ff: Feed-forward hidden width.
        d_attn: Width of a single attention head.
        n_ctx: Sequence length.
        n_vocab: Vocabulary size.
        n_heads: Number of attention heads per layer.
    """
    # Combined width of all attention heads; shared by several terms below.
    heads_width = d_attn * n_heads
    # The logits and reduction terms use the same expression, so compute it once.
    seq_by_seq = 2 * n_ctx * n_ctx * heads_width

    per_layer = (
        2 * n_ctx * 3 * d_model * heads_width,  # Q, K and V projections
        seq_by_seq,  # attention logits
        3 * n_heads * n_ctx * n_ctx,  # softmax
        seq_by_seq,  # attention reduction
        2 * n_ctx * heads_width * d_model,  # attention output projection
        2 * n_ctx * (d_model * d_ff + d_model * d_ff),  # feed-forward
    )

    embedding_term = 2 * n_ctx * n_vocab * d_model
    logits_term = 2 * n_ctx * d_model * n_vocab

    return (
        embedding_term,
        *(term * n_layer for term in per_layer),
        logits_term,
    )


def calculator(n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed):
    """Compute FLOPs per sequence and per token from the UI inputs.

    Args:
        n_layer: Number of layers.
        d_model: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads per layer; must be positive.
        n_vocab: Vocabulary size.
        n_ctx: Sequence length; must be positive.
        ff_ratio: Feed-forward width as a multiple of ``d_model``.
        incl_embed: When true, the embedding and final-logits terms are
            included in the total; otherwise only the per-layer terms are.

    Returns:
        A ``(flops_per_sequence, flops_per_token)`` pair, where the second
        value is the first divided by ``n_ctx``.

    Raises:
        gr.Error: If a numeric input is missing, if ``n_heads`` or ``n_ctx``
            is not positive, or if ``d_model`` is not divisible by
            ``n_heads``.
    """
    numeric_inputs = {
        "n_layer": n_layer,
        "d_model": d_model,
        "n_heads": n_heads,
        "n_vocab": n_vocab,
        "n_ctx": n_ctx,
        "ff_ratio": ff_ratio,
    }
    # An empty number field reaches this function as None; report it clearly
    # instead of failing with a TypeError in the arithmetic below.
    missing = [name for name, value in numeric_inputs.items() if value is None]
    if missing:
        raise gr.Error("Missing value for: " + ", ".join(missing))

    # Validate before dividing so a zero never surfaces as ZeroDivisionError.
    if n_heads <= 0:
        raise gr.Error("n_heads must be greater than zero")
    if n_ctx <= 0:
        raise gr.Error("Sequence length (n_ctx) must be greater than zero")
    if d_model % n_heads != 0:
        raise gr.Error("d_model must be divisible by n_heads")

    d_attn = d_model // n_heads
    d_ff = d_model * ff_ratio

    flops_terms = deepmind_flops(
        n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
    )

    # The first and last terms are the embedding and final-logits costs.
    if incl_embed:
        flops_per_sequence = sum(flops_terms)
    else:
        flops_per_sequence = sum(flops_terms[1:-1])

    return flops_per_sequence, flops_per_sequence / n_ctx


# (n_layer, d_model, n_heads) for each example row, in the order listed in the
# examples caption below. Every row shares the same vocabulary size, sequence
# length, feed-forward ratio and embedding flag.
gpt3_shapes = (
    (12, 768, 12),
    (24, 1024, 16),
    (24, 2048, 32),
    (32, 2560, 32),
    (32, 4096, 32),
    (40, 5120, 40),
    (48, 7168, 56),
    (64, 9216, 72),
    (96, 12288, 96),
)
gpt3_examples = [
    [layers, width, heads, 50257, 4096, 4, True]
    for layers, width, heads in gpt3_shapes
]

# Build the page: an input column, an output column, and clickable examples.
with gr.Blocks() as iface:
    gr.Markdown("## Transformer FLOPs Calculator")
    gr.Markdown(
        "Calculate how many FLOPs a Transformer language model has using the method described in [DeepMind's Chinchilla scaling law paper](https://arxiv.org/abs/2203.15556) (see Appendix F)."
    )

    with gr.Row():
        # Left column: model hyperparameters and the submit button.
        with gr.Column():
            n_layer = gr.Number(label="Number of layers (n_layer)")
            d_model = gr.Number(label="Model dimensions (d_model)")
            n_heads = gr.Number(label="Number of attention heads per layer (n_heads)")
            n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
            n_ctx = gr.Number(label="Sequence length")
            ff_ratio = gr.Number(value=4, label="Feedforward ratio")
            incl_embed = gr.Checkbox(
                value=True, label="Include embedding and logits FLOPs"
            )

            btn = gr.Button(value="Submit", variant="primary")

        # Right column: the two computed results.
        with gr.Column():
            flops_per_sequence = gr.Number(label="FLOPs per sequence")
            flops_per_token = gr.Number(label="FLOPs per token")

    # Component order must match the parameter order and return order of
    # `calculator`.
    input_components = [
        n_layer,
        d_model,
        n_heads,
        n_vocab,
        n_ctx,
        ff_ratio,
        incl_embed,
    ]
    output_components = [flops_per_sequence, flops_per_token]

    btn.click(
        fn=calculator,
        inputs=input_components,
        outputs=output_components,
    )

    gr.Markdown("### GPT-3 model family examples")
    gr.Markdown(
        "In order are the 125M, 350M, 1.3B, 2.7B, 6.7B, 13B, 30B, 66B, and 175B parameter variants."
    )
    gr.Examples(
        examples=gpt3_examples,
        inputs=input_components,
        outputs=output_components,
        fn=calculator,
        cache_examples=False,
    )

iface.launch()