from typing import Tuple

import gradio as gr


def deepmind_flops(
    n_layer: int,
    d_model: int,
    d_ff: int,
    d_attn: int,
    n_ctx: int,
    n_vocab: int,
    n_heads: int,
) -> Tuple[Tuple[int, ...], Tuple[float, ...]]:
    # Forward-pass FLOPs per sequence, following Appendix F of the Chinchilla
    # paper: each matmul against a weight of shape (m, n) over n_ctx tokens
    # costs 2 * n_ctx * m * n FLOPs.
    embeddings = 2 * n_ctx * n_vocab * d_model
    attn_qkv = 2 * n_ctx * 3 * d_model * (d_attn * n_heads)
    attn_logits = 2 * n_ctx * n_ctx * (d_attn * n_heads)
    attn_softmax = 3 * n_heads * n_ctx * n_ctx
    attn_reduce = 2 * n_ctx * n_ctx * (d_attn * n_heads)
    attn_project = 2 * n_ctx * (d_attn * n_heads) * d_model
    ff = 2 * n_ctx * (d_model * d_ff + d_model * d_ff)
    logits = 2 * n_ctx * d_model * n_vocab

    # Parameter counts fall out of the matmul FLOPs: dividing a weight's
    # 2 * n_ctx * (...) term by 2 * n_ctx leaves its parameter count.
    params = (
        embeddings / n_ctx / 2,
        (n_layer * (attn_qkv + attn_project + ff)) / n_ctx / 2,
        logits / n_ctx / 2,
    )

    return (
        embeddings,
        attn_qkv * n_layer,
        attn_logits * n_layer,
        attn_softmax * n_layer,
        attn_reduce * n_layer,
        attn_project * n_layer,
        ff * n_layer,
        logits,
    ), params
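

# A quick sanity check (a sketch, not part of the app): per token, the
# weight-matmul terms above sum to exactly 2 * (parameter count), since every
# weight does one multiply-accumulate per token; the attention-score terms
# (attn_logits, attn_softmax, attn_reduce) carry no weights and grow with
# n_ctx instead. The config matches the smallest GPT-3 variant in the
# examples further down:
#
#     flops_terms, params = deepmind_flops(
#         n_layer=12, d_model=768, d_ff=3072, d_attn=64,
#         n_ctx=4096, n_vocab=50257, n_heads=12,
#     )
#     print(sum(flops_terms) / 4096)  # total FLOPs per token
#     print(2 * sum(params))          # weight-matmul FLOPs per token (2N)

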
def calculator(
    n_layer: int,
    d_model: int,
    n_heads: int,
    n_vocab: int,
    n_ctx: int,
    ff_ratio: int,
    incl_embed: bool,
) -> Tuple[float, float, float]:
    if d_model % n_heads != 0:
        raise gr.Error("d_model must be divisible by n_heads")
    d_attn = d_model // n_heads
    d_ff = d_model * ff_ratio

    flops_terms, params = deepmind_flops(
        n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
    )

    if incl_embed:
        flops_per_sequence = sum(flops_terms)
        params = sum(params)
    else:
        # Exclude only the input-embedding term; keep all per-layer terms
        # and the final logit projection.
        flops_per_sequence = sum(flops_terms[1:])
        params = sum(params[1:])

    return params, flops_per_sequence, flops_per_sequence / n_ctx
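

# Example usage outside the UI (a sketch; the arguments mirror the first
# GPT-3 example row below):
#
#     params, flops_per_seq, flops_per_tok = calculator(
#         n_layer=12, d_model=768, n_heads=12, n_vocab=50257,
#         n_ctx=4096, ff_ratio=4, incl_embed=True,
#     )

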
with gr.Blocks() as iface:
    gr.Markdown("## Transformer FLOPs Calculator")
    gr.Markdown(
        "Calculate how many FLOPs a forward pass of a Transformer language model takes, using the method described in [DeepMind's Chinchilla scaling law paper](https://arxiv.org/abs/2203.15556) (see Appendix F)."
    )

    with gr.Row():
        with gr.Column():
            n_layer = gr.Number(label="Number of layers (n_layer)")
            d_model = gr.Number(label="Model dimension (d_model)")
            n_heads = gr.Number(label="Number of attention heads per layer (n_heads)")
            n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
            n_ctx = gr.Number(label="Sequence length (n_ctx)")
            ff_ratio = gr.Number(value=4, label="Feedforward ratio (d_ff / d_model)")
            incl_embed = gr.Checkbox(value=True, label="Include embeddings")
            btn = gr.Button(value="Submit", variant="primary")

        with gr.Column():
            params = gr.Number(label="Model parameters")
            flops_per_sequence = gr.Number(label="FLOPs per sequence")
            flops_per_token = gr.Number(label="FLOPs per token")

    btn.click(
        calculator,
        inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
        outputs=[params, flops_per_sequence, flops_per_token],
    )

    gr.Markdown("### GPT-3 model family examples")
    gr.Markdown(
        "In order: the 125M, 350M, 1.3B, 2.7B, 6.7B, 13B, 30B, 66B, and 175B parameter variants."
    )
    gr.Examples(
        [
            [12, 768, 12, 50257, 4096, 4, True],
            [24, 1024, 16, 50257, 4096, 4, True],
            [24, 2048, 32, 50257, 4096, 4, True],
            [32, 2560, 32, 50257, 4096, 4, True],
            [32, 4096, 32, 50257, 4096, 4, True],
            [40, 5120, 40, 50257, 4096, 4, True],
            [48, 7168, 56, 50257, 4096, 4, True],
            [64, 9216, 72, 50257, 4096, 4, True],
            [96, 12288, 96, 50257, 4096, 4, True],
        ],
        [n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
        [params, flops_per_sequence, flops_per_token],
        calculator,
        cache_examples=False,
    )

iface.launch()