adamcasson
committed on
Commit
·
254bbe9
1
Parent(s):
6ecfb5c
add param count
Browse files
app.py
CHANGED
@@ -1,7 +1,17 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
|
4 |
-
def deepmind_flops(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
embeddings = 2 * n_ctx * n_vocab * d_model
|
6 |
attn_qkv = 2 * n_ctx * 3 * d_model * (d_attn * n_heads)
|
7 |
attn_logits = 2 * n_ctx * n_ctx * (d_attn * n_heads)
|
@@ -11,6 +21,12 @@ def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
|
|
11 |
ff = 2 * n_ctx * (d_model * d_ff + d_model * d_ff)
|
12 |
logits = 2 * n_ctx * d_model * n_vocab
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
return (
|
15 |
embeddings,
|
16 |
attn_qkv * n_layer,
|
@@ -20,25 +36,35 @@ def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
|
|
20 |
attn_project * n_layer,
|
21 |
ff * n_layer,
|
22 |
logits,
|
23 |
-
)
|
24 |
|
25 |
|
26 |
-
def calculator(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
d_attn = d_model // n_heads
|
28 |
if d_model % n_heads != 0:
|
29 |
raise gr.Error("d_model must be divisible by n_heads")
|
30 |
d_ff = d_model * ff_ratio
|
31 |
|
32 |
-
flops_terms = deepmind_flops(
|
33 |
n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
|
34 |
)
|
35 |
|
36 |
if incl_embed:
|
37 |
flops_per_sequence = sum(flops_terms)
|
|
|
38 |
else:
|
39 |
-
flops_per_sequence = sum(flops_terms[1
|
|
|
40 |
|
41 |
-
return flops_per_sequence, flops_per_sequence / n_ctx
|
42 |
|
43 |
|
44 |
with gr.Blocks() as iface:
|
@@ -54,20 +80,19 @@ with gr.Blocks() as iface:
|
|
54 |
n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
|
55 |
n_ctx = gr.Number(label="Sequence length")
|
56 |
ff_ratio = gr.Number(value=4, label="Feedforward ratio")
|
57 |
-
incl_embed = gr.Checkbox(
|
58 |
-
value=True, label="Include embedding and logits FLOPs"
|
59 |
-
)
|
60 |
|
61 |
btn = gr.Button(value="Submit", variant="primary")
|
62 |
|
63 |
with gr.Column():
|
|
|
64 |
flops_per_sequence = gr.Number(label="FLOPs per sequence")
|
65 |
flops_per_token = gr.Number(label="FLOPs per token")
|
66 |
|
67 |
btn.click(
|
68 |
calculator,
|
69 |
inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
|
70 |
-
outputs=[flops_per_sequence, flops_per_token],
|
71 |
)
|
72 |
|
73 |
gr.Markdown("### GPT-3 model family examples")
|
@@ -87,7 +112,7 @@ with gr.Blocks() as iface:
|
|
87 |
[96, 12288, 96, 50257, 4096, 4, True],
|
88 |
],
|
89 |
[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
|
90 |
-
[flops_per_sequence, flops_per_token],
|
91 |
calculator,
|
92 |
cache_examples=False,
|
93 |
)
|
|
|
1 |
+
from typing import Tuple
|
2 |
+
|
3 |
import gradio as gr
|
4 |
|
5 |
|
6 |
+
def deepmind_flops(
|
7 |
+
n_layer: int,
|
8 |
+
d_model: int,
|
9 |
+
d_ff: int,
|
10 |
+
d_attn: int,
|
11 |
+
n_ctx: int,
|
12 |
+
n_vocab: int,
|
13 |
+
n_heads: int,
|
14 |
+
) -> int:
|
15 |
embeddings = 2 * n_ctx * n_vocab * d_model
|
16 |
attn_qkv = 2 * n_ctx * 3 * d_model * (d_attn * n_heads)
|
17 |
attn_logits = 2 * n_ctx * n_ctx * (d_attn * n_heads)
|
|
|
21 |
ff = 2 * n_ctx * (d_model * d_ff + d_model * d_ff)
|
22 |
logits = 2 * n_ctx * d_model * n_vocab
|
23 |
|
24 |
+
params = (
|
25 |
+
embeddings / n_ctx / 2,
|
26 |
+
(n_layer * (attn_qkv + attn_project + ff)) / n_ctx / 2,
|
27 |
+
logits / n_ctx / 2,
|
28 |
+
)
|
29 |
+
|
30 |
return (
|
31 |
embeddings,
|
32 |
attn_qkv * n_layer,
|
|
|
36 |
attn_project * n_layer,
|
37 |
ff * n_layer,
|
38 |
logits,
|
39 |
+
), params
|
40 |
|
41 |
|
42 |
+
def calculator(
    n_layer: int,
    d_model: int,
    n_heads: int,
    n_vocab: int,
    n_ctx: int,
    ff_ratio: int,
    incl_embed: bool,
) -> Tuple[int, int, int]:
    """Compute parameter count and FLOPs for a transformer configuration.

    Derives the per-head attention width and the feed-forward width from
    the inputs, delegates the per-component accounting to
    ``deepmind_flops``, and sums the components.

    Args:
        n_layer: Number of transformer layers.
        d_model: Model (residual stream) width.
        n_heads: Number of attention heads; must divide ``d_model`` evenly.
        n_vocab: Vocabulary size.
        n_ctx: Sequence length.
        ff_ratio: Multiplier applied to ``d_model`` to obtain ``d_ff``.
        incl_embed: When true, every component is summed. When false, the
            leading embeddings entry is left out of both totals.

    Returns:
        A ``(params, flops_per_sequence, flops_per_token)`` tuple, where
        ``flops_per_token`` is ``flops_per_sequence / n_ctx``. The params
        and per-token values come from true division, so they may be
        floats.

    Raises:
        gr.Error: If ``d_model`` is not divisible by ``n_heads``.
    """
    # Validate before deriving d_attn so the floor division below is exact.
    if d_model % n_heads != 0:
        raise gr.Error("d_model must be divisible by n_heads")
    d_attn = d_model // n_heads
    d_ff = d_model * ff_ratio

    flops_terms, params = deepmind_flops(
        n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
    )

    if incl_embed:
        flops_per_sequence = sum(flops_terms)
        n_params = sum(params)
    else:
        # Both tuples put the embeddings entry first. Drop only that entry:
        # the previous `[1:3]` slice discarded most of the per-layer FLOPs
        # terms (including the feed-forward term) because the FLOPs tuple
        # has more than three entries.
        # NOTE(review): the final logits entry is kept, matching what
        # `params[1:3]` did on the three-entry params tuple — confirm this
        # is the intended meaning of "Include embeddings".
        flops_per_sequence = sum(flops_terms[1:])
        n_params = sum(params[1:])

    return n_params, flops_per_sequence, flops_per_sequence / n_ctx
|
68 |
|
69 |
|
70 |
with gr.Blocks() as iface:
|
|
|
80 |
n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
|
81 |
n_ctx = gr.Number(label="Sequence length")
|
82 |
ff_ratio = gr.Number(value=4, label="Feedforward ratio")
|
83 |
+
incl_embed = gr.Checkbox(value=True, label="Include embeddings")
|
|
|
|
|
84 |
|
85 |
btn = gr.Button(value="Submit", variant="primary")
|
86 |
|
87 |
with gr.Column():
|
88 |
+
params = gr.Number(label="Model parameters")
|
89 |
flops_per_sequence = gr.Number(label="FLOPs per sequence")
|
90 |
flops_per_token = gr.Number(label="FLOPs per token")
|
91 |
|
92 |
btn.click(
|
93 |
calculator,
|
94 |
inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
|
95 |
+
outputs=[params, flops_per_sequence, flops_per_token],
|
96 |
)
|
97 |
|
98 |
gr.Markdown("### GPT-3 model family examples")
|
|
|
112 |
[96, 12288, 96, 50257, 4096, 4, True],
|
113 |
],
|
114 |
[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
|
115 |
+
[params, flops_per_sequence, flops_per_token],
|
116 |
calculator,
|
117 |
cache_examples=False,
|
118 |
)
|