Update app.py
app.py CHANGED
@@ -177,21 +177,21 @@ def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_siz
         weight_memory + gradient_memory + optimizer_memory + master_weight_memory
 
 # activation memory:
-def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
+def compute_activation_memory_attention(training_dtype, gemm_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
     # LN 2bsh
-    activation_mem_attn_ln = seq_length * b * hidden_size * activation_dtype
+    activation_mem_attn_ln = seq_length * b * hidden_size * training_dtype
     if is_sp == "False":
         activation_mem_attn_ln *= tp
     # attention input X, qkv 2bsh/1bsh
-    activation_mem_attn_qkv = seq_length * b * hidden_size * activation_dtype
+    activation_mem_attn_qkv = seq_length * b * hidden_size * gemm_dtype
     if is_sp == "False":
         activation_mem_attn_qkv *= tp
     # attention q 2bsh
-    activation_mem_attn_q = seq_length * b * hidden_size * activation_dtype
+    activation_mem_attn_q = seq_length * b * hidden_size * training_dtype
     # attention k and v 4bsh
-    activation_mem_attn_kv = seq_length * b * kv_hidden_size * activation_dtype * 2
+    activation_mem_attn_kv = seq_length * b * kv_hidden_size * training_dtype * 2
     # attention proj input 2bsh/1bsh
-    activation_mem_attn_proj = seq_length * b * hidden_size * activation_dtype
+    activation_mem_attn_proj = seq_length * b * hidden_size * gemm_dtype
     # dropout bsh
     activation_mem_attn_dropout = seq_length * b * hidden_size
     if is_sp == "False":
@@ -208,22 +208,22 @@ def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_
     )
     return activation_memory_attn
 
-def compute_activation_memory_mlp(activation_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp):
+def compute_activation_memory_mlp(training_dtype, gemm_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp):
     # LN 2bsh
-    activation_mem_mlp_ln = seq_length * b * hidden_size * activation_dtype
+    activation_mem_mlp_ln = seq_length * b * hidden_size * training_dtype
     if is_sp == "False":
         activation_mem_mlp_ln *= tp
     # FC1 2bsh/1bsh
-    activation_mem_mlp_fc1 = seq_length * b * hidden_size * activation_dtype
+    activation_mem_mlp_fc1 = seq_length * b * hidden_size * gemm_dtype
     if is_sp == "False":
         activation_mem_mlp_fc1 *= tp
     # Act 8bsh
     if act_func == "LLaMA":
-        activation_mem_mlp_act = seq_length * b * ffn_size * activation_dtype * 2
+        activation_mem_mlp_act = seq_length * b * ffn_size * training_dtype * 2
     else:
-        activation_mem_mlp_act = seq_length * b * ffn_size * activation_dtype
+        activation_mem_mlp_act = seq_length * b * ffn_size * training_dtype
     # FC2 8bsh/4bsh
-    activation_mem_mlp_fc2 = seq_length * b * ffn_size * activation_dtype
+    activation_mem_mlp_fc2 = seq_length * b * ffn_size * gemm_dtype
     # dropout bsh
     activation_mem_mlp_dropout = seq_length * b * hidden_size
     if is_sp == "False":
@@ -261,25 +261,33 @@ def compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches):
 
     return activation_memory
 
-def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp):
+def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, precision, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp):
     # Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf.
     # We are trying to compute the maximum activation footprint, so all calculations in this function
     # are for the first pipeline stage.
 
-    # activation dataType
-    if is_fp8 == "False":
-        activation_dtype = 2
-    else:
-        activation_dtype = 1
+    # activation dataType for Training
+    if precision == "FP32":
+        training_dtype = 4
+    else:
+        training_dtype = 2
+
+    # activation dataType for GEMM
+    if precision == "FP32":
+        gemm_dtype = 4
+    elif is_fp8 == "False":
+        gemm_dtype = 2
+    else:
+        gemm_dtype = 1
 
     # kv_hidden_size
     if is_group_query == "False":
         group_query_num = head_num
     kv_hidden_size = hidden_size / head_num * group_query_num
 
-    activation_memory_attn = compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp)
+    activation_memory_attn = compute_activation_memory_attention(training_dtype, gemm_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp)
 
-    activation_memory_mlp = compute_activation_memory_mlp(activation_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp)
+    activation_memory_mlp = compute_activation_memory_mlp(training_dtype, gemm_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp)
 
     activation_memory = activation_memory_attn + activation_memory_mlp
 
@@ -324,7 +332,7 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
         ffn_size, head_num, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, dp, tp, pp, cp, is_dist_opt, precision, is_fp8, is_fp8_init, g_ty, opt_func, o_ty)
 
     # get activation memory
-    activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
+    activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, precision, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
 
     # get model parameters
     numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, is_tie_word_embedding, act_func, head_num, 1, 1)
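To sanity-check the new accounting outside the Space, here is a minimal standalone sketch (not part of the commit) of the two byte-width selections this change introduces and of the attention terms visible in the first hunk. Function names, the plain-bool is_sp/is_fp8 arguments, and the example figures are assumptions for illustration; the attention-score/softmax terms that fall between the first two hunks, and whatever follows the trailing "if is_sp == "False":", are not shown in the diff and are omitted here.

# Hypothetical sketch mirroring the new dtype handling and the visible attention terms.
def select_activation_dtypes(precision: str, is_fp8: bool):
    """Bytes per element for training-precision activations vs. GEMM inputs."""
    training_dtype = 4 if precision == "FP32" else 2   # FP32 -> 4 B, BF16/FP16 -> 2 B
    if precision == "FP32":
        gemm_dtype = 4
    elif not is_fp8:
        gemm_dtype = 2
    else:
        gemm_dtype = 1                                 # FP8 GEMM inputs -> 1 B
    return training_dtype, gemm_dtype

def attention_activation_bytes(seq_length, b, hidden_size, kv_hidden_size,
                               training_dtype, gemm_dtype, is_sp, tp):
    ln = seq_length * b * hidden_size * training_dtype          # LN output
    qkv = seq_length * b * hidden_size * gemm_dtype             # QKV GEMM input
    if not is_sp:                        # the diff scales these two terms by tp
        ln *= tp                         # when sequence parallelism is off
        qkv *= tp
    q = seq_length * b * hidden_size * training_dtype           # Q
    kv = seq_length * b * kv_hidden_size * training_dtype * 2   # K and V
    proj = seq_length * b * hidden_size * gemm_dtype             # proj GEMM input
    dropout = seq_length * b * hidden_size                       # dropout mask, 1 B/elem
    return ln + qkv + q + kv + proj + dropout

# Example (assumed figures): 4k sequence, micro-batch 1, hidden 4096, no GQA, tp=1, SP on.
t_b, g_b = select_activation_dtypes("BF16", is_fp8=True)
print(attention_activation_bytes(4096, 1, 4096, 4096, t_b, g_b, is_sp=True, tp=1) / 2**20, "MiB")

Splitting the old single activation_dtype into training_dtype and gemm_dtype is what lets FP8 shrink only the cached GEMM inputs (QKV and proj above), while LN outputs, Q, K/V and the dropout mask stay at the 2-byte (or 4-byte FP32) training width.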
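The MLP side follows the same pattern; below is a matching sketch of the terms visible in the second hunk, again with a hypothetical plain-bool is_sp argument (the Space itself passes the strings "True"/"False"). As with attention, the tail of the function after the final "if is_sp == "False":" lies outside the hunk and is not reproduced.

# Hypothetical sketch of the MLP activation terms from the second hunk.
def mlp_activation_bytes(seq_length, b, hidden_size, ffn_size, act_func,
                         training_dtype, gemm_dtype, is_sp, tp):
    ln = seq_length * b * hidden_size * training_dtype   # LN output
    fc1 = seq_length * b * hidden_size * gemm_dtype      # FC1 GEMM input
    if not is_sp:                        # the diff scales these two terms by tp
        ln *= tp                         # when sequence parallelism is off
        fc1 *= tp
    if act_func == "LLaMA":              # gated LLaMA-style MLP caches two ffn_size-wide tensors
        act = seq_length * b * ffn_size * training_dtype * 2
    else:
        act = seq_length * b * ffn_size * training_dtype
    fc2 = seq_length * b * ffn_size * gemm_dtype         # FC2 GEMM input
    dropout = seq_length * b * hidden_size               # dropout mask, 1 B/elem
    return ln + fc1 + act + fc2 + dropout

# Example (assumed figures): same layer as above with an 11008-wide LLaMA MLP,
# training_dtype=2 (BF16), gemm_dtype=1 (FP8).
print(mlp_activation_bytes(4096, 1, 4096, 11008, "LLaMA", 2, 1, is_sp=True, tp=1) / 2**20, "MiB")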
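The third hunk also shows how grouped-query attention shrinks the K/V term before the attention call: kv_hidden_size = hidden_size / head_num * group_query_num. A worked example with assumed figures:

# GQA example (assumed figures: hidden 4096, 32 query heads, 8 KV groups).
hidden_size, head_num, group_query_num = 4096, 32, 8
kv_hidden_size = hidden_size / head_num * group_query_num   # 4096 / 32 * 8 = 1024.0
print(kv_hidden_size)  # the K/V activation term above shrinks by 4x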