adamcasson committed
Commit 254bbe9 · 1 Parent(s): 6ecfb5c

add param count

Files changed (1)
  1. app.py +36 -11
app.py CHANGED
@@ -1,7 +1,17 @@
+from typing import Tuple
+
 import gradio as gr
 
 
-def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
+def deepmind_flops(
+    n_layer: int,
+    d_model: int,
+    d_ff: int,
+    d_attn: int,
+    n_ctx: int,
+    n_vocab: int,
+    n_heads: int,
+) -> int:
     embeddings = 2 * n_ctx * n_vocab * d_model
     attn_qkv = 2 * n_ctx * 3 * d_model * (d_attn * n_heads)
     attn_logits = 2 * n_ctx * n_ctx * (d_attn * n_heads)
@@ -11,6 +21,12 @@ def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
     ff = 2 * n_ctx * (d_model * d_ff + d_model * d_ff)
     logits = 2 * n_ctx * d_model * n_vocab
 
+    params = (
+        embeddings / n_ctx / 2,
+        (n_layer * (attn_qkv + attn_project + ff)) / n_ctx / 2,
+        logits / n_ctx / 2,
+    )
+
     return (
         embeddings,
         attn_qkv * n_layer,
@@ -20,25 +36,35 @@ def deepmind_flops(n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads):
         attn_project * n_layer,
         ff * n_layer,
         logits,
-    )
+    ), params
 
 
-def calculator(n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed):
+def calculator(
+    n_layer: int,
+    d_model: int,
+    n_heads: int,
+    n_vocab: int,
+    n_ctx: int,
+    ff_ratio: int,
+    incl_embed: bool,
+) -> Tuple[int, int, int]:
     d_attn = d_model // n_heads
     if d_model % n_heads != 0:
         raise gr.Error("d_model must be divisible by n_heads")
     d_ff = d_model * ff_ratio
 
-    flops_terms = deepmind_flops(
+    flops_terms, params = deepmind_flops(
         n_layer, d_model, d_ff, d_attn, n_ctx, n_vocab, n_heads
     )
 
     if incl_embed:
         flops_per_sequence = sum(flops_terms)
+        params = sum(params)
     else:
-        flops_per_sequence = sum(flops_terms[1:-1])
+        flops_per_sequence = sum(flops_terms[1:3])
+        params = sum(params[1:3])
 
-    return flops_per_sequence, flops_per_sequence / n_ctx
+    return params, flops_per_sequence, flops_per_sequence / n_ctx
 
 
 with gr.Blocks() as iface:
@@ -54,20 +80,19 @@ with gr.Blocks() as iface:
         n_vocab = gr.Number(label="Vocabulary size (n_vocab)")
         n_ctx = gr.Number(label="Sequence length")
         ff_ratio = gr.Number(value=4, label="Feedforward ratio")
-        incl_embed = gr.Checkbox(
-            value=True, label="Include embedding and logits FLOPs"
-        )
+        incl_embed = gr.Checkbox(value=True, label="Include embeddings")
 
         btn = gr.Button(value="Submit", variant="primary")
 
     with gr.Column():
+        params = gr.Number(label="Model parameters")
         flops_per_sequence = gr.Number(label="FLOPs per sequence")
         flops_per_token = gr.Number(label="FLOPs per token")
 
     btn.click(
         calculator,
         inputs=[n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
-        outputs=[flops_per_sequence, flops_per_token],
+        outputs=[params, flops_per_sequence, flops_per_token],
     )
 
     gr.Markdown("### GPT-3 model family examples")
@@ -87,7 +112,7 @@ with gr.Blocks() as iface:
            [96, 12288, 96, 50257, 4096, 4, True],
        ],
        [n_layer, d_model, n_heads, n_vocab, n_ctx, ff_ratio, incl_embed],
-       [flops_per_sequence, flops_per_token],
+       [params, flops_per_sequence, flops_per_token],
        calculator,
        cache_examples=False,
    )
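Every weight matmul counted in deepmind_flops costs FLOPs of the form 2 * n_ctx * (number of weights in that matmul), so dividing a term by 2 * n_ctx recovers the size of the weight matrix behind it. That is all the new params tuple does; the attention-score terms (e.g. attn_logits) are left out of it because they involve no weights. Below is a minimal standalone sketch of the same arithmetic with no Gradio dependency. It assumes d_attn * n_heads == d_model and a same-sized attention output projection (that term sits outside the hunks shown above), and the helper name param_count is ours, not part of the app.

# Sketch only: mirrors each FLOPs term in deepmind_flops divided by 2 * n_ctx.
def param_count(n_layer: int, d_model: int, d_ff: int, n_vocab: int) -> int:
    embed = n_vocab * d_model       # embeddings / n_ctx / 2
    per_layer = (
        3 * d_model * d_model       # attn_qkv / (2 * n_ctx): W_q, W_k, W_v
        + d_model * d_model         # attn_project / (2 * n_ctx): output projection (assumed shape)
        + 2 * d_model * d_ff        # ff / (2 * n_ctx): the two feedforward matmuls
    )
    unembed = d_model * n_vocab     # logits / n_ctx / 2
    return embed + n_layer * per_layer + unembed

With d_ff = 4 * d_model the per-layer sum collapses to the familiar 12 * n_layer * d_model**2 non-embedding parameters, plus n_vocab * d_model each for the embedding and unembedding matrices.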
 
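As a rough sanity check on the new params output, the largest row of the GPT-3 examples table (n_layer=96, d_model=12288, n_vocab=50257, ff_ratio=4, i.e. the 175B configuration) can be pushed through the sketch above. The count ignores biases, layer norms, and position embeddings, so it only approximates the published figure.

# GPT-3 175B example row, using the hypothetical param_count sketch above.
n = param_count(96, 12288, 4 * 12288, 50257)
# non-embedding:            96 * 12 * 12288**2 = 173,946,175,488
# embedding + unembedding:  2 * 50257 * 12288  =   1,235,116,032
print(f"{n:,}")  # 175,181,291,520  (~175B)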