xxyux committed on
Commit
9f3e0f7
·
verified ·
1 Parent(s): 719f946

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -42
app.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
5
  # 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
6
 
7
  col=['L', 'H', 'FFN', 'S', 'A', 'G',
8
- 'dp', 'tp', 'pp', 'cp', 'GPU number', 'Batch size', 'FP8', 'Model parameters', 'Model states', 'Activation', 'Total']
9
 
10
  abbr = """
11
  <div align="center">
@@ -35,9 +35,18 @@ def Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp):
35
 
36
  return num_parameters_word_embedding + num_parameters_position_embedding
37
 
38
- def Compute_Parameters_output(hidden_size, vocab_size, tp):
39
- num_parameters_output_layernorm = 2 * hidden_size
40
- num_parameters_output_embedding = 0 # due to sharedWordEmbedding
 
 
 
 
 
 
 
 
 
41
  return num_parameters_output_layernorm + num_parameters_output_embedding
42
 
43
  def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp):
@@ -79,7 +88,7 @@ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
79
 
80
  return num_parameters_mlp
81
 
82
- def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp):
83
  if is_group_query == "False":
84
  group_query_num = head_num
85
  kv_hidden_size = hidden_size / head_num * group_query_num
@@ -94,7 +103,7 @@ def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size,
94
  num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
95
 
96
  # output part
97
- parameters_output = Compute_Parameters_output(hidden_size, vocab_size, tp)
98
 
99
  if pp == 1:
100
  num_parameters_total = (
@@ -146,9 +155,9 @@ def Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp):
146
 
147
  return master_weight_memory
148
 
149
- def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func,
150
  dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
151
- numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp)
152
 
153
  weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
154
  gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
@@ -160,7 +169,7 @@ def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_siz
160
 
161
  # activation memory:
162
  def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
163
- # LN 2bsq
164
  activation_mem_attn_ln = seq_length * b * hidden_size * 2
165
  if is_sp == "False":
166
  activation_mem_attn_ln *= tp
@@ -288,11 +297,13 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
288
  return activation_memory / tp / cp
289
 
290
  # compute_btn.click.function
291
- def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, act_func,
292
  dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
293
  # data type trans
294
  if is_group_query == "True":
295
  group_query_num = int(group_query_num)
 
 
296
 
297
  # check input
298
  [result, Error_message] = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
@@ -301,13 +312,13 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
301
 
302
  # get model states
303
  numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
304
- ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
305
 
306
  # get activation memory
307
  activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
308
 
309
  # get model parameters
310
- numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
311
  # get gpu number
312
  gpu_num = dp * tp * pp * cp
313
 
@@ -349,9 +360,10 @@ def generate_csv(record_df):
349
  # formula string
350
  formula = r"""
351
  > **Note**🔑: In this formula, we assume LLM training with FP8 training.
352
- > 1. Interleaved pipeline.
353
- > 2. bias = False.
354
- > 3. SP = True.
 
355
 
356
  <div align="center">
357
  <img src=file/T1.jpg width=50%/>
@@ -359,7 +371,7 @@ formula = r"""
359
 
360
  $$
361
  {Total\ Model\ parameters} =
362
- HV + HS + (4H^2 + 3H \times FFN + 2H) \times L + 2H + HV
363
  $$
364
 
365
  ***
@@ -371,7 +383,7 @@ formula = r"""
371
  $$
372
  {Model\ states} =
373
  (6 + \frac{12}{dp \times cp}) \times
374
- (\frac{(\frac{4H^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp} + HS)
375
  $$
376
 
377
  $$
@@ -477,26 +489,27 @@ with gr.Blocks() as demo:
477
  # Input 1.[Model Parameters]
478
  gr.Markdown(
479
  """
480
- <h1>Model Parameters:</h1>
481
  """
482
  )
483
  with gr.Accordion("Model Parameters"):
484
  # with gr.Row():
485
  act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
486
  with gr.Row():
487
- vocab_size = gr.Number(label="Vocab size", value=32000)
488
- layer_num = gr.Number(label="Layer number", value=32)
489
  with gr.Row():
490
- hidden_size = gr.Number(label="Hidden size", value=4096)
491
- ffn_size = gr.Number(label="FFN Hidden size", value=11008)
492
  with gr.Row():
493
- sequence_len = gr.Number(label="Sequence length", value=2048)
494
- head_num = gr.Number(label="Number of Attention Heads", value=32)
495
  with gr.Row():
496
  is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
497
- group_query_num = gr.Textbox(label="Number of Query Groups", max_lines=1, value=None, interactive=False)
498
- is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
499
-
 
500
  # change editable function
501
  def toggle_textbox_editable(radio_value):
502
  # 根据 radio_value 的值来决定 textbox 是否可编辑
@@ -511,30 +524,30 @@ with gr.Blocks() as demo:
511
  # Input 2.[Parallelism]
512
  gr.Markdown(
513
  """
514
- <h1>Parallelism config:</h1>
515
  """
516
  )
517
  with gr.Accordion("Parallelism config"):
518
  # with gr.Row():
519
- dp = gr.Number(label="Data parallelism", value=2)
520
- tp = gr.Number(label="Tensor parallelism", value=2)
521
- pp = gr.Number(label="Pipeline parallelism", value=2)
522
- cp = gr.Number(label="Context parallelism", value=2)
523
  # with gr.Row():
524
  is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
525
- vp = gr.Number(label="Virtual Pipeline Size")
526
  is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
527
 
528
  with gr.Column():
529
  # Input 3.[Training Settings]
530
  gr.Markdown(
531
  """
532
- <h1>Training Config:</h1>
533
  """
534
  )
535
  with gr.Accordion("Training Config"):
536
  # with gr.Row():
537
- b = gr.Number(label="Micro Batch size", value=4)
538
  b_global = gr.Number(label="Global Batch size", value=64)
539
  # with gr.Row():
540
  gr.Checkbox(label="True", value=True, info="BF16 Training")
@@ -547,11 +560,11 @@ with gr.Blocks() as demo:
547
  compute_btn = gr.Button("Compute")
548
  with gr.Tab("Output"):
549
  with gr.Column():
550
- gr.Markdown(
551
- """
552
- <h1>Output Data:</h1>
553
- """
554
- )
555
  output_text = gr.Textbox(
556
  label="Compute result",
557
  interactive=False,
@@ -565,7 +578,7 @@ with gr.Blocks() as demo:
565
  , latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
566
  )
567
 
568
- gr.Markdown(abbr)
569
 
570
  record_df = gr.Dataframe(
571
  label="Record Table",
@@ -576,7 +589,7 @@ with gr.Blocks() as demo:
576
  count = gr.Number(label="Row count", value=1, visible=False)
577
  compute_btn.click(
578
  fn=Compute_ALL_Model_memory,
579
- inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, act_func,
580
  dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
581
  outputs=[output_text, record_df, count]
582
  )
 
5
  # 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
6
 
7
  col=['L', 'H', 'FFN', 'S', 'A', 'G',
8
+ 'dp', 'tp', 'pp', 'cp', 'Number of GPUs', 'B', 'FP8', 'Model parameters (B)', 'Model states (GB)', 'Activation (GB)', 'Total (GB)']
9
 
10
  abbr = """
11
  <div align="center">
 
35
 
36
  return num_parameters_word_embedding + num_parameters_position_embedding
37
 
38
def Compute_Parameters_output(hidden_size, vocab_size, is_tie_word_embedding, act_func, tp):
    """Count parameters in the output head: the final normalization plus the LM head.

    The final norm is RMSNorm (scale only, h params) for LLaMA-style models and
    LayerNorm (scale + bias, 2h params) otherwise. The output embedding costs
    nothing when tied to the input word embedding; untied, it is an H x V
    projection sharded across tensor-parallel ranks.
    """
    # layernorm: h (RMSNorm) or 2h (LayerNorm)
    norm_params = hidden_size if act_func == "LLaMA" else 2 * hidden_size

    if is_tie_word_embedding == "True":
        head_params = 0  # shared with the input word embedding
    else:
        head_params = hidden_size * vocab_size / tp

    return norm_params + head_params
51
 
52
  def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp):
 
88
 
89
  return num_parameters_mlp
90
 
91
+ def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp):
92
  if is_group_query == "False":
93
  group_query_num = head_num
94
  kv_hidden_size = hidden_size / head_num * group_query_num
 
103
  num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
104
 
105
  # output part
106
+ parameters_output = Compute_Parameters_output(hidden_size, vocab_size, is_tie_word_embedding, act_func, tp)
107
 
108
  if pp == 1:
109
  num_parameters_total = (
 
155
 
156
  return master_weight_memory
157
 
158
+ def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
159
  dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
160
+ numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp)
161
 
162
  weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
163
  gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
 
169
 
170
  # activation memory:
171
  def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
172
+ # LN 2bsh
173
  activation_mem_attn_ln = seq_length * b * hidden_size * 2
174
  if is_sp == "False":
175
  activation_mem_attn_ln *= tp
 
297
  return activation_memory / tp / cp
298
 
299
  # compute_btn.click.function
300
+ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
301
  dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
302
  # data type trans
303
  if is_group_query == "True":
304
  group_query_num = int(group_query_num)
305
+ else:
306
+ group_query_num = head_num
307
 
308
  # check input
309
  [result, Error_message] = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
 
312
 
313
  # get model states
314
  numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
315
+ ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
316
 
317
  # get activation memory
318
  activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
319
 
320
  # get model parameters
321
+ numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, 1, 1)
322
  # get gpu number
323
  gpu_num = dp * tp * pp * cp
324
 
 
360
  # formula string
361
  formula = r"""
362
  > **Note**🔑: In this formula, we assume LLM training with FP8 training.
363
+ > 1. LlaMA-family Model.
364
+ > 2. Interleaved pipeline.
365
+ > 3. bias = False.
366
+ > 4. SP = True.
367
 
368
  <div align="center">
369
  <img src=file/T1.jpg width=50%/>
 
371
 
372
  $$
373
  {Total\ Model\ parameters} =
374
+ HV + (4H^2 + 3H \times FFN + 2H) \times L + H
375
  $$
376
 
377
  ***
 
383
  $$
384
  {Model\ states} =
385
  (6 + \frac{12}{dp \times cp}) \times
386
+ (\frac{(\frac{4H^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp})
387
  $$
388
 
389
  $$
 
489
  # Input 1.[Model Parameters]
490
  gr.Markdown(
491
  """
492
+ <h2>Model Parameters:</h2>
493
  """
494
  )
495
  with gr.Accordion("Model Parameters"):
496
  # with gr.Row():
497
  act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
498
  with gr.Row():
499
+ vocab_size = gr.Number(label="Vocab size (V)", value=32000)
500
+ layer_num = gr.Number(label="Layer number (L)", value=32)
501
  with gr.Row():
502
+ hidden_size = gr.Number(label="Hidden size (H)", value=4096)
503
+ ffn_size = gr.Number(label="FFN Hidden size (FFN)", value=11008)
504
  with gr.Row():
505
+ sequence_len = gr.Number(label="Sequence length (S)", value=2048)
506
+ head_num = gr.Number(label="Number of Attention Heads (A)", value=32)
507
  with gr.Row():
508
  is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
509
+ group_query_num = gr.Textbox(label="Number of Query Groups (G)", max_lines=1, value=None, interactive=False)
510
+ with gr.Row():
511
+ is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
512
+ is_tie_word_embedding = gr.Radio(["True", "False"], value="False", label="Tie word embeddings")
513
  # change editable function
514
  def toggle_textbox_editable(radio_value):
515
  # 根据 radio_value 的值来决定 textbox 是否可编辑
 
524
  # Input 2.[Parallelism]
525
  gr.Markdown(
526
  """
527
+ <h2>Parallelism config:</h2>
528
  """
529
  )
530
  with gr.Accordion("Parallelism config"):
531
  # with gr.Row():
532
+ dp = gr.Number(label="Data parallelism (dp)", value=2)
533
+ tp = gr.Number(label="Tensor parallelism (tp)", value=2)
534
+ pp = gr.Number(label="Pipeline parallelism (pp)", value=2)
535
+ cp = gr.Number(label="Context parallelism (cp)", value=1)
536
  # with gr.Row():
537
  is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
538
+ vp = gr.Number(label="Virtual Pipeline Size (vp)")
539
  is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
540
 
541
  with gr.Column():
542
  # Input 3.[Training Settings]
543
  gr.Markdown(
544
  """
545
+ <h2>Training Config:</h2>
546
  """
547
  )
548
  with gr.Accordion("Training Config"):
549
  # with gr.Row():
550
+ b = gr.Number(label="Micro Batch size (B)", value=4)
551
  b_global = gr.Number(label="Global Batch size", value=64)
552
  # with gr.Row():
553
  gr.Checkbox(label="True", value=True, info="BF16 Training")
 
560
  compute_btn = gr.Button("Compute")
561
  with gr.Tab("Output"):
562
  with gr.Column():
563
+ # gr.Markdown(
564
+ # """
565
+ # <h1>Output Data:</h1>
566
+ # """
567
+ # )
568
  output_text = gr.Textbox(
569
  label="Compute result",
570
  interactive=False,
 
578
  , latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
579
  )
580
 
581
+ # gr.Markdown(abbr)
582
 
583
  record_df = gr.Dataframe(
584
  label="Record Table",
 
589
  count = gr.Number(label="Row count", value=1, visible=False)
590
  compute_btn.click(
591
  fn=Compute_ALL_Model_memory,
592
+ inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
593
  dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
594
  outputs=[output_text, record_df, count]
595
  )