xxyux committed
Commit aed6a9f · verified · 1 parent: 9f3e0f7

Update app.py

Files changed (1): app.py (+36 −26)
app.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 # 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
 
 col=['L', 'H', 'FFN', 'S', 'A', 'G',
-     'dp', 'tp', 'pp', 'cp', 'Number of GPUs', 'B', 'FP8', 'Model parameters (B)', 'Model states (GB)', 'Activation (GB)', 'Total (GB)']
+     'DP', 'TP', 'PP', 'CP', 'GPUs', 'B', 'FP8', 'Model parameters (B)', 'Model states (GB)', 'Activation (GB)', 'Total (GB)']
 
 abbr = """
 <div align="center">
@@ -31,7 +31,7 @@ def Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp):
     if act_func == "LLaMA":
         num_parameters_position_embedding = 0
     else:
-        num_parameters_position_embedding = seq_length * hidden_size
+        num_parameters_position_embedding = seq_length * hidden_size / tp
 
     return num_parameters_word_embedding + num_parameters_position_embedding
 
@@ -119,13 +119,15 @@ def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size,
 
     return num_parameters_total
 
-def Compute_Weight(numParametersTotal, is_fp8, is_fp8_init):
-    if is_fp8 == "False":
-        weight_memory = 2 * numParametersTotal
-    elif is_fp8_init == "False":
+def Compute_Weight(numParametersTotal, precision, is_fp8, is_fp8_init):
+    weight_memory = 0
+    if precision == "FP32":
         weight_memory = 4 * numParametersTotal
     else:
-        weight_memory = 2 * numParametersTotal
+        weight_memory = 2 * numParametersTotal
+
+    if is_fp8 == "True" and is_fp8_init == "False":
+        weight_memory += 2 * numParametersTotal
 
     return weight_memory
 
@@ -137,7 +139,7 @@ def Compute_Gradient(numParametersTotal, g_ty):
 
     return gradient_memory
 
-def Compute_Optimizer_states(numParametersTotal, o_ty, is_dist_opt, dp, cp):
+def Compute_Optimizer_states(numParametersTotal, opt_func, o_ty, is_dist_opt, dp, cp):
     if o_ty == "FP32":
         optimizer_memory = 4 * 2 * numParametersTotal
     elif o_ty =="BF16":
@@ -146,23 +148,30 @@ def Compute_Optimizer_states(numParametersTotal, o_ty, is_dist_opt, dp, cp):
     if is_dist_opt == "True":
         optimizer_memory = optimizer_memory / (dp * cp)
 
+    # for SGD, we have no optimizer states
+    if opt_func == "SGD":
+        optimizer_memory = 0
+
     return optimizer_memory
 
-def Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp):
-    master_weight_memory = 4 * numParametersTotal
+def Compute_Master_weight(numParametersTotal, precision, is_dist_opt, dp, cp):
+    if precision == "BF16":
+        master_weight_memory = 4 * numParametersTotal
+    else:
+        master_weight_memory = 0
     if is_dist_opt == "True":
         master_weight_memory = master_weight_memory / (dp * cp)
 
     return master_weight_memory
 
 def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
-                         dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
+                         dp, tp, pp, cp, is_dist_opt, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty):
     numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, tp, pp)
 
-    weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
+    weight_memory = Compute_Weight(numParametersTotal, precision, is_fp8, is_fp8_init)
     gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
-    optimizer_memory = Compute_Optimizer_states(numParametersTotal, o_ty, is_dist_opt, dp, cp)
-    master_weight_memory = Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp)
+    optimizer_memory = Compute_Optimizer_states(numParametersTotal, opt_func, o_ty, is_dist_opt, dp, cp)
+    master_weight_memory = Compute_Master_weight(numParametersTotal, precision, is_dist_opt, dp, cp)
 
     return numParametersTotal, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, \
         weight_memory + gradient_memory + optimizer_memory + master_weight_memory
@@ -298,7 +307,7 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
 
 # compute_btn.click.function
 def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
-                             dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
+                             dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty, record_df, count):
     # data type trans
     if is_group_query == "True":
         group_query_num = int(group_query_num)
@@ -312,7 +321,7 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
 
     # get model states
     numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
-        ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
+        ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, dp, tp, pp, cp, is_dist_opt, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty)
 
     # get activation memory
     activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
@@ -344,7 +353,7 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
     GPU numbers = {str(gpu_num)}, \n
     Model parameters = {str(numParametersTotal)} B, \n
     Model parameters on each device = {str(numParameters)} B, \n
-    Model_states = {str(model_states_memory)} GB, \n
+    Model_states = Weight + Gradient + Optimizer = {str(model_states_memory)} GB, \n
     Activation = {str(activation_memory)} GB, \n
     Total memory consumption = {str(Total)} GB \n
     """, record_df, count
@@ -389,7 +398,7 @@ formula = r"""
 $$
 {Activation} =
 (1 + \frac{pp-1}{pp \times vp}) \times
-\frac{(8BS + BSH) \times pp + 15BSH + 5BS \times FFN}{tp \times cp}
+\frac{(8BS + BSH) \times pp + (15BSH + 5BS \times FFN) \times L}{tp \times cp}
 $$
 
 ***
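The formula fix multiplies the per-layer activation terms (15BSH + 5BS·FFN) by the layer count L, where previously only a single layer's worth was counted. A worked evaluation under the corrected formula, with hypothetical LLaMA-7B-style inputs and treating the result as bytes (the app converts to GB):

```python
# Corrected activation formula, evaluated once. Inputs are illustrative
# (LLaMA-7B-like); the number is not output taken from app.py.
def activation_bytes(B, S, H, FFN, L, pp, vp, tp, cp):
    pipeline_factor = 1 + (pp - 1) / (pp * vp)
    numerator = (8 * B * S + B * S * H) * pp + (15 * B * S * H + 5 * B * S * FFN) * L
    return pipeline_factor * numerator / (tp * cp)

gb = activation_bytes(B=4, S=4096, H=4096, FFN=11008, L=32,
                      pp=1, vp=1, tp=8, cp=1) / 1024**3
print(f"{gb:.1f} GB")  # 7.1 GB for this configuration
```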
@@ -494,7 +503,7 @@ with gr.Blocks() as demo:
     )
     with gr.Accordion("Model Parameters"):
         # with gr.Row():
-        act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
+        act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type", info="e.g. LLaMA: SwiGLU, RoPE, RMSNorm") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
         with gr.Row():
             vocab_size = gr.Number(label="Vocab size (V)", value=32000)
             layer_num = gr.Number(label="Layer number (L)", value=32)
@@ -549,13 +558,14 @@ with gr.Blocks() as demo:
         # with gr.Row():
         b = gr.Number(label="Micro Batch size (B)", value=4)
         b_global = gr.Number(label="Global Batch size", value=64)
-        # with gr.Row():
-        gr.Checkbox(label="True", value=True, info="BF16 Training")
-        is_fp8 = gr.Radio(["True", "False"], value="True", label="FP8 Training")
-        is_fp8_init = gr.Radio(["True", "False"], value="True", label="FP8 Initialization(will reduce memory)")
-        # with gr.Row():
+        precision = gr.Dropdown(["FP32", "BF16"], value="BF16", label="Training precision")
+        with gr.Row():
+            is_fp8 = gr.Radio(["True", "False"], value="True", label="FP8 Training")
+            is_fp8_init = gr.Radio(["True", "False"], value="True", label="FP8 Initialization (will reduce memory)")
         g_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Gradients Dtype")
-        o_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Optimizer State Dtype")
+        with gr.Row():
+            opt_func = gr.Radio(["Adam", "SGD"], value="Adam", label="Optimizer function")
+            o_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Optimizer State Dtype")
 
         compute_btn = gr.Button("Compute")
     with gr.Tab("Output"):
@@ -590,7 +600,7 @@ with gr.Blocks() as demo:
     compute_btn.click(
         fn=Compute_ALL_Model_memory,
         inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func,
-                dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
+                dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty, record_df, count],
         outputs=[output_text, record_df, count]
     )
 
 