Update app.py
Browse files
app.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
|
|
5 |
# 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
|
6 |
|
7 |
col=['L', 'H', 'FFN', 'S', 'A', 'G',
|
8 |
-
'dp', 'tp', 'pp', 'cp', '
|
9 |
|
10 |
abbr = """
|
11 |
<div align="center">
|
@@ -35,9 +35,18 @@ def Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp):
|
|
35 |
|
36 |
return num_parameters_word_embedding + num_parameters_position_embedding
|
37 |
|
38 |
-
def Compute_Parameters_output(hidden_size, vocab_size, tp):
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
return num_parameters_output_layernorm + num_parameters_output_embedding
|
42 |
|
43 |
def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp):
|
@@ -79,7 +88,7 @@ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
|
|
79 |
|
80 |
return num_parameters_mlp
|
81 |
|
82 |
-
def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp):
|
83 |
if is_group_query == "False":
|
84 |
group_query_num = head_num
|
85 |
kv_hidden_size = hidden_size / head_num * group_query_num
|
@@ -94,7 +103,7 @@ def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size,
|
|
94 |
num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
|
95 |
|
96 |
# output part
|
97 |
-
parameters_output = Compute_Parameters_output(hidden_size, vocab_size, tp)
|
98 |
|
99 |
if pp == 1:
|
100 |
num_parameters_total = (
|
@@ -146,9 +155,9 @@ def Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp):
|
|
146 |
|
147 |
return master_weight_memory
|
148 |
|
149 |
-
def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func,
|
150 |
dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
|
151 |
-
numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp)
|
152 |
|
153 |
weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
|
154 |
gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
|
@@ -160,7 +169,7 @@ def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_siz
|
|
160 |
|
161 |
# activation memory:
|
162 |
def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
|
163 |
-
# LN
|
164 |
activation_mem_attn_ln = seq_length * b * hidden_size * 2
|
165 |
if is_sp == "False":
|
166 |
activation_mem_attn_ln *= tp
|
@@ -288,11 +297,13 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
|
|
288 |
return activation_memory / tp / cp
|
289 |
|
290 |
# compute_btn.click.function
|
291 |
-
def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, act_func,
|
292 |
dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
|
293 |
# data type trans
|
294 |
if is_group_query == "True":
|
295 |
group_query_num = int(group_query_num)
|
|
|
|
|
296 |
|
297 |
# check input
|
298 |
[result, Error_message] = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
|
@@ -301,13 +312,13 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
|
|
301 |
|
302 |
# get model states
|
303 |
numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
|
304 |
-
ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
|
305 |
|
306 |
# get activation memory
|
307 |
activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
|
308 |
|
309 |
# get model parameters
|
310 |
-
numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
|
311 |
# get gpu number
|
312 |
gpu_num = dp * tp * pp * cp
|
313 |
|
@@ -349,9 +360,10 @@ def generate_csv(record_df):
|
|
349 |
# formula string
|
350 |
formula = r"""
|
351 |
> **Note**🔑: In this formula, we assume LLM training with FP8 training.
|
352 |
-
> 1.
|
353 |
-
> 2.
|
354 |
-
> 3.
|
|
|
355 |
|
356 |
<div align="center">
|
357 |
<img src=file/T1.jpg width=50%/>
|
@@ -359,7 +371,7 @@ formula = r"""
|
|
359 |
|
360 |
$$
|
361 |
{Total\ Model\ parameters} =
|
362 |
-
HV +
|
363 |
$$
|
364 |
|
365 |
***
|
@@ -371,7 +383,7 @@ formula = r"""
|
|
371 |
$$
|
372 |
{Model\ states} =
|
373 |
(6 + \frac{12}{dp \times cp}) \times
|
374 |
-
(\frac{(\frac{4H^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp}
|
375 |
$$
|
376 |
|
377 |
$$
|
@@ -477,26 +489,27 @@ with gr.Blocks() as demo:
|
|
477 |
# Input 1.[Model Parameters]
|
478 |
gr.Markdown(
|
479 |
"""
|
480 |
-
<
|
481 |
"""
|
482 |
)
|
483 |
with gr.Accordion("Model Parameters"):
|
484 |
# with gr.Row():
|
485 |
act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
|
486 |
with gr.Row():
|
487 |
-
vocab_size = gr.Number(label="Vocab size", value=32000)
|
488 |
-
layer_num = gr.Number(label="Layer number", value=32)
|
489 |
with gr.Row():
|
490 |
-
hidden_size = gr.Number(label="Hidden size", value=4096)
|
491 |
-
ffn_size = gr.Number(label="FFN Hidden size", value=11008)
|
492 |
with gr.Row():
|
493 |
-
sequence_len = gr.Number(label="Sequence length", value=2048)
|
494 |
-
head_num = gr.Number(label="Number of Attention Heads", value=32)
|
495 |
with gr.Row():
|
496 |
is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
|
497 |
-
group_query_num = gr.Textbox(label="Number of Query Groups", max_lines=1, value=None, interactive=False)
|
498 |
-
|
499 |
-
|
|
|
500 |
# change editable function
|
501 |
def toggle_textbox_editable(radio_value):
|
502 |
# 根据 radio_value 的值来决定 textbox 是否可编辑
|
@@ -511,30 +524,30 @@ with gr.Blocks() as demo:
|
|
511 |
# Input 2.[Parallelism]
|
512 |
gr.Markdown(
|
513 |
"""
|
514 |
-
<
|
515 |
"""
|
516 |
)
|
517 |
with gr.Accordion("Parallelism config"):
|
518 |
# with gr.Row():
|
519 |
-
dp = gr.Number(label="Data parallelism", value=2)
|
520 |
-
tp = gr.Number(label="Tensor parallelism", value=2)
|
521 |
-
pp = gr.Number(label="Pipeline parallelism", value=2)
|
522 |
-
cp = gr.Number(label="Context parallelism", value=
|
523 |
# with gr.Row():
|
524 |
is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
|
525 |
-
vp = gr.Number(label="Virtual Pipeline Size")
|
526 |
is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
|
527 |
|
528 |
with gr.Column():
|
529 |
# Input 3.[Training Settings]
|
530 |
gr.Markdown(
|
531 |
"""
|
532 |
-
<
|
533 |
"""
|
534 |
)
|
535 |
with gr.Accordion("Training Config"):
|
536 |
# with gr.Row():
|
537 |
-
b = gr.Number(label="Micro Batch size", value=4)
|
538 |
b_global = gr.Number(label="Global Batch size", value=64)
|
539 |
# with gr.Row():
|
540 |
gr.Checkbox(label="True", value=True, info="BF16 Training")
|
@@ -547,11 +560,11 @@ with gr.Blocks() as demo:
|
|
547 |
compute_btn = gr.Button("Compute")
|
548 |
with gr.Tab("Output"):
|
549 |
with gr.Column():
|
550 |
-
gr.Markdown(
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
)
|
555 |
output_text = gr.Textbox(
|
556 |
label="Compute result",
|
557 |
interactive=False,
|
@@ -565,7 +578,7 @@ with gr.Blocks() as demo:
|
|
565 |
, latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
|
566 |
)
|
567 |
|
568 |
-
gr.Markdown(abbr)
|
569 |
|
570 |
record_df = gr.Dataframe(
|
571 |
label="Record Table",
|
@@ -576,7 +589,7 @@ with gr.Blocks() as demo:
|
|
576 |
count = gr.Number(label="Row count", value=1, visible=False)
|
577 |
compute_btn.click(
|
578 |
fn=Compute_ALL_Model_memory,
|
579 |
-
inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, act_func,
|
580 |
dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
|
581 |
outputs=[output_text, record_df, count]
|
582 |
)
|
|
|
5 |
# 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
|
6 |
|
7 |
col=['L', 'H', 'FFN', 'S', 'A', 'G',
|
8 |
+
'dp', 'tp', 'pp', 'cp', 'Number of GPUs', 'B', 'FP8', 'Model parameters (B)', 'Model states (GB)', 'Activation (GB)', 'Total (GB)']
|
9 |
|
10 |
abbr = """
|
11 |
<div align="center">
|
|
|
35 |
|
36 |
return num_parameters_word_embedding + num_parameters_position_embedding
|
37 |
|
38 |
+
def Compute_Parameters_output(hidden_size, vocab_size, is_tie_word_embedding, act_func, tp):
    """Count parameters in the model's output head (final norm + LM head).

    LLaMA-style models use RMSNorm (a single weight vector of size H),
    while GPT-style models use LayerNorm (weight + bias, 2H).  When word
    embeddings are tied, the output projection reuses the input word
    embedding and contributes no extra parameters; otherwise it adds
    H * V, sharded across the `tp` tensor-parallel ranks.

    Note: `is_tie_word_embedding` is the string "True"/"False" (Gradio
    radio value), and `act_func` is the model-type label "LLaMA"/"GPT".
    """
    # Final normalization layer before the LM head.
    norm_params = hidden_size if act_func == "LLaMA" else 2 * hidden_size

    # Output embedding / LM head projection.
    if is_tie_word_embedding == "True":
        embed_params = 0  # shared with the input word embedding
    else:
        embed_params = hidden_size * vocab_size / tp

    return norm_params + embed_params
|
51 |
|
52 |
def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp):
|
|
|
88 |
|
89 |
return num_parameters_mlp
|
90 |
|
91 |
+
def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp):
|
92 |
if is_group_query == "False":
|
93 |
group_query_num = head_num
|
94 |
kv_hidden_size = hidden_size / head_num * group_query_num
|
|
|
103 |
num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
|
104 |
|
105 |
# output part
|
106 |
+
parameters_output = Compute_Parameters_output(hidden_size, vocab_size, is_tie_word_embedding, act_func, tp)
|
107 |
|
108 |
if pp == 1:
|
109 |
num_parameters_total = (
|
|
|
155 |
|
156 |
return master_weight_memory
|
157 |
|
158 |
+
def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
|
159 |
dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
|
160 |
+
numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp)
|
161 |
|
162 |
weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
|
163 |
gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
|
|
|
169 |
|
170 |
# activation memory:
|
171 |
def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
|
172 |
+
# LN 2bsh
|
173 |
activation_mem_attn_ln = seq_length * b * hidden_size * 2
|
174 |
if is_sp == "False":
|
175 |
activation_mem_attn_ln *= tp
|
|
|
297 |
return activation_memory / tp / cp
|
298 |
|
299 |
# compute_btn.click.function
|
300 |
+
def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
|
301 |
dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
|
302 |
# data type trans
|
303 |
if is_group_query == "True":
|
304 |
group_query_num = int(group_query_num)
|
305 |
+
else:
|
306 |
+
group_query_num = head_num
|
307 |
|
308 |
# check input
|
309 |
[result, Error_message] = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
|
|
|
312 |
|
313 |
# get model states
|
314 |
numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
|
315 |
+
ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
|
316 |
|
317 |
# get activation memory
|
318 |
activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
|
319 |
|
320 |
# get model parameters
|
321 |
+
numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, 1, 1)
|
322 |
# get gpu number
|
323 |
gpu_num = dp * tp * pp * cp
|
324 |
|
|
|
360 |
# formula string
|
361 |
formula = r"""
|
362 |
> **Note**🔑: In this formula, we assume LLM training with FP8 training.
|
363 |
+
> 1. LlaMA-family Model.
|
364 |
+
> 2. Interleaved pipeline.
|
365 |
+
> 3. bias = False.
|
366 |
+
> 4. SP = True.
|
367 |
|
368 |
<div align="center">
|
369 |
<img src=file/T1.jpg width=50%/>
|
|
|
371 |
|
372 |
$$
|
373 |
{Total\ Model\ parameters} =
|
374 |
+
HV + (4H^2 + 3H \times FFN + 2H) \times L + H
|
375 |
$$
|
376 |
|
377 |
***
|
|
|
383 |
$$
|
384 |
{Model\ states} =
|
385 |
(6 + \frac{12}{dp \times cp}) \times
|
386 |
+
(\frac{(\frac{4H^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp})
|
387 |
$$
|
388 |
|
389 |
$$
|
|
|
489 |
# Input 1.[Model Parameters]
|
490 |
gr.Markdown(
|
491 |
"""
|
492 |
+
<h2>Model Parameters:</h2>
|
493 |
"""
|
494 |
)
|
495 |
with gr.Accordion("Model Parameters"):
|
496 |
# with gr.Row():
|
497 |
act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
|
498 |
with gr.Row():
|
499 |
+
vocab_size = gr.Number(label="Vocab size (V)", value=32000)
|
500 |
+
layer_num = gr.Number(label="Layer number (L)", value=32)
|
501 |
with gr.Row():
|
502 |
+
hidden_size = gr.Number(label="Hidden size (H)", value=4096)
|
503 |
+
ffn_size = gr.Number(label="FFN Hidden size (FFN)", value=11008)
|
504 |
with gr.Row():
|
505 |
+
sequence_len = gr.Number(label="Sequence length (S)", value=2048)
|
506 |
+
head_num = gr.Number(label="Number of Attention Heads (A)", value=32)
|
507 |
with gr.Row():
|
508 |
is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
|
509 |
+
group_query_num = gr.Textbox(label="Number of Query Groups (G)", max_lines=1, value=None, interactive=False)
|
510 |
+
with gr.Row():
|
511 |
+
is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
|
512 |
+
is_tie_word_embedding = gr.Radio(["True", "False"], value="False", label="Tie word embeddings")
|
513 |
# change editable function
|
514 |
def toggle_textbox_editable(radio_value):
|
515 |
# 根据 radio_value 的值来决定 textbox 是否可编辑
|
|
|
524 |
# Input 2.[Parallelism]
|
525 |
gr.Markdown(
|
526 |
"""
|
527 |
+
<h2>Parallelism config:</h2>
|
528 |
"""
|
529 |
)
|
530 |
with gr.Accordion("Parallelism config"):
|
531 |
# with gr.Row():
|
532 |
+
dp = gr.Number(label="Data parallelism (dp)", value=2)
|
533 |
+
tp = gr.Number(label="Tensor parallelism (tp)", value=2)
|
534 |
+
pp = gr.Number(label="Pipeline parallelism (pp)", value=2)
|
535 |
+
cp = gr.Number(label="Context parallelism (cp)", value=1)
|
536 |
# with gr.Row():
|
537 |
is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
|
538 |
+
vp = gr.Number(label="Virtual Pipeline Size (vp)")
|
539 |
is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
|
540 |
|
541 |
with gr.Column():
|
542 |
# Input 3.[Training Settings]
|
543 |
gr.Markdown(
|
544 |
"""
|
545 |
+
<h2>Training Config:</h2>
|
546 |
"""
|
547 |
)
|
548 |
with gr.Accordion("Training Config"):
|
549 |
# with gr.Row():
|
550 |
+
b = gr.Number(label="Micro Batch size (B)", value=4)
|
551 |
b_global = gr.Number(label="Global Batch size", value=64)
|
552 |
# with gr.Row():
|
553 |
gr.Checkbox(label="True", value=True, info="BF16 Training")
|
|
|
560 |
compute_btn = gr.Button("Compute")
|
561 |
with gr.Tab("Output"):
|
562 |
with gr.Column():
|
563 |
+
# gr.Markdown(
|
564 |
+
# """
|
565 |
+
# <h1>Output Data:</h1>
|
566 |
+
# """
|
567 |
+
# )
|
568 |
output_text = gr.Textbox(
|
569 |
label="Compute result",
|
570 |
interactive=False,
|
|
|
578 |
, latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
|
579 |
)
|
580 |
|
581 |
+
# gr.Markdown(abbr)
|
582 |
|
583 |
record_df = gr.Dataframe(
|
584 |
label="Record Table",
|
|
|
589 |
count = gr.Number(label="Row count", value=1, visible=False)
|
590 |
compute_btn.click(
|
591 |
fn=Compute_ALL_Model_memory,
|
592 |
+
inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
|
593 |
dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
|
594 |
outputs=[output_text, record_df, count]
|
595 |
)
|