from typing import Tuple
import gradio as gr

def deepmind_flops(
    n_layer: int,
    d_model: int,
    d_ff: int,
    d_attn: int,
    n_ctx: int,
    n_vocab: int,
    n_heads: int,
) -> Tuple[Tuple[int, ...], Tuple[float, ...]]:
    # Forward-pass FLOPs per sequence for each operation, following the
    # accounting in Appendix F of the Chinchilla paper (one multiply-accumulate
    # counts as 2 FLOPs).
    embeddings = 2 * n_ctx * n_vocab * d_model
    attn_qkv = 2 * n_ctx * 3 * d_model * (d_attn * n_heads)
    attn_logits = 2 * n_ctx * n_ctx * (d_attn * n_heads)
    attn_softmax = 3 * n_heads * n_ctx * n_ctx
    attn_reduce = 2 * n_ctx * n_ctx * (d_attn * n_heads)
    attn_project = 2 * n_ctx * (d_attn * n_heads) * d_model
    ff = 2 * n_ctx * (d_model * d_ff + d_model * d_ff)
    logits = 2 * n_ctx * d_model * n_vocab

    # Parameter counts recovered from the matmul terms: each weight matrix
    # costs 2 * n_ctx * (number of weights) FLOPs per sequence.
    params = (
        embeddings / n_ctx / 2,
        (n_layer * (attn_qkv + attn_project + ff)) / n_ctx / 2,
        logits / n_ctx / 2,
    )

    return (
        embeddings,
        attn_qkv * n_layer,
        attn_logits * n_layer,
        attn_softmax * n_layer,
        attn_reduce * n_layer,
        attn_project * n_layer,
        ff * n_layer,
        logits,
    ), params
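

# Quick cross-check of the accounting above (a sketch with assumed
# GPT-2-small-style values; not part of the original app). By construction,
#
#     FLOPs/token = 2 * N + n_layer * (4 * n_ctx * d_model + 3 * n_heads * n_ctx)
#
# where N = sum(params): every weight matmul costs 2 FLOPs per parameter per
# token, and the second term is the context-dependent attention cost
# (attn_logits + attn_reduce + attn_softmax).
_terms, _params = deepmind_flops(
    n_layer=12, d_model=768, d_ff=3072, d_attn=64,
    n_ctx=1024, n_vocab=50257, n_heads=12,
)
assert sum(_terms) / 1024 == (
    2 * sum(_params) + 12 * (4 * 1024 * 768 + 3 * 12 * 1024)
)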


def calculator(
    n_layer: int,
    d_model: int,
    n_heads: int,
    n_vocab: int,
    n_ctx: int,
    ff_ratio: int,
    incl_embed: bool,
) -> Tuple[float, float, float]:
    if d_model % n_heads != 0:
        raise gr.Error("d_model must be divisible by n_heads")
    d_attn = d_model // n_heads
    d_ff = d_model * ff_ratio

    flops_terms, params = deepmind_flops(
        n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
    )

    if incl_embed:
        flops_per_sequence = sum(flops_terms)
        params = sum(params)
    else:
        # Drop only the input-embedding term (index 0); the per-layer terms and
        # the final logits projection still count.
        flops_per_sequence = sum(flops_terms[1:])
        params = sum(params[1:])

    return params, flops_per_sequence, flops_per_sequence / n_ctx
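

# Usage sketch (assumed GPT-3 125M-style values, matching the first example in
# the UI below); the function can also be called outside Gradio:
#
#     params_, per_seq, per_tok = calculator(12, 768, 12, 50257, 4096, 4, True)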


with gr.Blocks() as iface:
    gr.Markdown(
        "Calculate how many FLOPs a Transformer language model uses in a forward pass, following the method described in [DeepMind's Chinchilla scaling law paper](https://arxiv.org/abs/2203.15556) (see Appendix F)."
    )
    with gr.Row():
        with gr.Column():
            n_layer = gr.Number(label="Number of layers (n_layer)")
            d_model = gr.Number(label="Model dimension (d_model)")
            n_heads = gr.Number(label="Number of attention heads per layer (n_heads)")
            n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
            n_ctx = gr.Number(label="Sequence length (n_ctx)")
            ff_ratio = gr.Number(value=4, label="Feedforward ratio (d_ff / d_model)")
            incl_embed = gr.Checkbox(value=True, label="Include embeddings")
            btn = gr.Button(value="Enter", variant="primary")

        with gr.Column():
            params = gr.Number(label="Model parameters")
            flops_per_sequence = gr.Number(label="FLOPs per sequence")
            flops_per_token = gr.Number(label="FLOPs per token")

    btn.click(
        calculator,
        inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
        outputs=[params, flops_per_sequence, flops_per_token],
    )
gr.Markdown("### GPT-3 model family examples")
gr.Markdown(
"In order are the 125M, 350M, 1.3B, 2.7B, 6.7B, 13B, 30B, 66B, and 175B parameter variants."
)
    gr.Examples(
        examples=[
            [12, 768, 12, 50257, 4096, 4, True],
            [24, 1024, 16, 50257, 4096, 4, True],
            [24, 2048, 32, 50257, 4096, 4, True],
            [32, 2560, 32, 50257, 4096, 4, True],
            [32, 4096, 32, 50257, 4096, 4, True],
            [40, 5120, 40, 50257, 4096, 4, True],
            [48, 7168, 56, 50257, 4096, 4, True],
            [64, 9216, 72, 50257, 4096, 4, True],
            [96, 12288, 96, 50257, 4096, 4, True],
        ],
        inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
        outputs=[params, flops_per_sequence, flops_per_token],
        fn=calculator,
        cache_examples=False,
    )
iface.launch()