adamcasson committed on
Commit
64e5a86
·
1 Parent(s): caf7cfa
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+
4
def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
    """Break down the forward-pass FLOPs of a transformer for one sequence.

    The term-by-term accounting appears to follow Appendix F of Hoffmann et
    al. (2022), "Training Compute-Optimal Large Language Models" -- confirm
    against the paper before changing any coefficient.

    Args:
        n_layer: Number of transformer layers; scales every per-layer term.
        d_model: Width of the residual stream.
        d_ff: Hidden width of the feed-forward block.
        d_attn: Per-head attention dimension.
        n_ctx: Sequence length in tokens.
        n_vocab: Vocabulary size.
        n_heads: Number of attention heads per layer.

    Returns:
        An 8-tuple of FLOP counts in this fixed order: embeddings, QKV
        projections, attention logits, softmax, attention-weighted value
        reduction, attention output projection, feed-forward, final logits.
        The six middle entries are already multiplied by ``n_layer``; the
        first and last entries are counted once. Callers rely on this order
        (e.g. slicing ``[1:-1]`` to drop the embedding and logit terms).
    """
    # Combined width of all attention heads; shared by every attention term.
    attn_width = d_attn * n_heads

    # Terms counted once per sequence.
    embeddings = 2 * n_ctx * n_vocab * d_model
    logits = 2 * n_ctx * d_model * n_vocab

    # Terms counted once per layer.
    attn_qkv = 2 * n_ctx * 3 * d_model * attn_width
    attn_logits = 2 * n_ctx * n_ctx * attn_width
    attn_softmax = 3 * n_heads * n_ctx * n_ctx
    attn_reduce = 2 * n_ctx * n_ctx * attn_width
    attn_project = 2 * n_ctx * attn_width * d_model
    # Two dense matmuls: d_model -> d_ff and d_ff -> d_model.
    ff = 2 * n_ctx * (d_model * d_ff + d_model * d_ff)

    return (
        embeddings,
        attn_qkv * n_layer,
        attn_logits * n_layer,
        attn_softmax * n_layer,
        attn_reduce * n_layer,
        attn_project * n_layer,
        ff * n_layer,
        logits,
    )
24
+
25
+
26
def calculator(n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed):
    """Compute forward-pass FLOPs per sequence and per token for the UI.

    Args:
        n_layer: Number of transformer layers.
        d_model: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads per layer; must be positive.
        n_vocab: Vocabulary size.
        n_ctx: Sequence length; must be positive because the per-token
            figure divides by it.
        ff_ratio: Feed-forward width expressed as a multiple of ``d_model``.
        incl_embed: When true, the embedding and final-logit terms are
            included in the totals; otherwise only the per-layer terms are.

    Returns:
        A ``(flops_per_sequence, flops_per_token)`` pair.

    Raises:
        gr.Error: If a numeric input is missing, if ``n_heads`` or ``n_ctx``
            is not positive, or if ``d_model`` is not divisible by
            ``n_heads``.
    """
    numeric_inputs = {
        "n_layer": n_layer,
        "d_model": d_model,
        "n_heads": n_heads,
        "n_vocab": n_vocab,
        "n_ctx": n_ctx,
        "ff_ratio": ff_ratio,
    }
    # A cleared numeric field may arrive as None; report it to the user
    # instead of failing later with a TypeError in the arithmetic.
    missing = [name for name, value in numeric_inputs.items() if value is None]
    if missing:
        raise gr.Error(f"Missing value for: {', '.join(missing)}")

    # Both values are used as divisors below, so reject them up front rather
    # than surfacing a ZeroDivisionError.
    if n_heads <= 0:
        raise gr.Error("n_heads must be positive")
    if n_ctx <= 0:
        raise gr.Error("n_ctx must be positive")

    # Check divisibility before dividing so the guard is actually reachable.
    if d_model % n_heads != 0:
        raise gr.Error("d_model must be divisible by n_heads")

    d_attn = d_model // n_heads
    d_ff = d_model * ff_ratio

    flops_terms = deepmind_flops(
        n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
    )

    if incl_embed:
        flops_per_sequence = sum(flops_terms)
    else:
        # First and last entries are the embedding and final-logit terms.
        flops_per_sequence = sum(flops_terms[1:-1])

    return flops_per_sequence, flops_per_sequence / n_ctx
42
+
43
+
44
# Gradio UI: model hyperparameters on the left, computed FLOPs on the right,
# followed by a table of clickable example configurations.
with gr.Blocks() as iface:
    with gr.Row():
        with gr.Column():
            # precision=0 makes the count-valued fields arrive as integers;
            # the feed-forward ratio is left unrounded so it can be fractional.
            n_layer = gr.Number(label="Number of layers (n_layer)", precision=0)
            d_model = gr.Number(label="Model dimensions (d_model)", precision=0)
            n_heads = gr.Number(
                label="Number of attention heads per layer (n_heads)", precision=0
            )
            n_vocab = gr.Number(label="Vocabulary size (n_vocab)", precision=0)
            n_ctx = gr.Number(label="Sequence length", precision=0)
            ff_ratio = gr.Number(value=4, label="Feedforward ratio")
            incl_embed = gr.Checkbox(
                value=True, label="Include embedding and logits FLOPs"
            )

            btn = gr.Button(value="Submit", variant="primary")

        with gr.Column():
            flops_per_sequence = gr.Number(label="FLOPs per sequence")
            flops_per_token = gr.Number(label="FLOPs per token")

    btn.click(
        calculator,
        inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
        outputs=[flops_per_sequence, flops_per_token],
    )

    # NOTE(review): the GPT-3 paper's model table has no 30B or 66B variant
    # and uses a 2048-token context; these rows look closer to the OPT family.
    # Confirm the intended label and sequence length before relying on them.
    gr.Markdown("### GPT-3 model family examples")
    gr.Markdown(
        "In order are the 125M, 350M, 1.3B, 2.7B, 6.7B, 13B, 30B, 66B, and 175B parameter variants."
    )
    # Columns: n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed.
    gr.Examples(
        [
            [12, 768, 12, 50257, 4096, 4, True],
            [24, 1024, 16, 50257, 4096, 4, True],
            [24, 2048, 32, 50257, 4096, 4, True],
            [32, 2560, 32, 50257, 4096, 4, True],
            [32, 4096, 32, 50257, 4096, 4, True],
            [40, 5120, 40, 50257, 4096, 4, True],
            [48, 7168, 56, 50257, 4096, 4, True],
            [64, 9216, 72, 50257, 4096, 4, True],
            [96, 12288, 96, 50257, 4096, 4, True],
        ],
        [n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
        [flops_per_sequence, flops_per_token],
        calculator,
        cache_examples=False,
    )

# Only start the server when run as a script, so importing this module (for
# tests or tooling) does not launch the app as a side effect.
if __name__ == "__main__":
    iface.launch()