xxyux committed on
Commit
ac04d4c
·
verified ·
1 Parent(s): 0b0eb0d

Update app.py. [RoPE, RMSNorm]

Browse files
Files changed (1) hide show
  1. app.py +26 -15
app.py CHANGED
@@ -25,9 +25,14 @@ def Get_BillionParameter(parameter):
25
  return parameter / 1000**3
26
 
27
  # model states:
28
- def Compute_Parameters_input(hidden_size, vocab_size, tp):
29
  num_parameters_word_embedding = hidden_size * vocab_size / tp
30
- num_parameters_position_embedding = 0 #args.hidden_size * args.seq_length
 
 
 
 
 
31
  return num_parameters_word_embedding + num_parameters_position_embedding
32
 
33
  def Compute_Parameters_output(hidden_size, vocab_size, tp):
@@ -35,10 +40,13 @@ def Compute_Parameters_output(hidden_size, vocab_size, tp):
35
  num_parameters_output_embedding = 0 # due to sharedWordEmbedding
36
  return num_parameters_output_layernorm + num_parameters_output_embedding
37
 
38
- def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp):
39
  # attention:
40
- # layernorm: 2h
41
- num_parameters_attention = 2 * hidden_size
 
 
 
42
  # QKV weight: 3h*h/tp, bias: 3h/tp
43
  # output linear weight: h*h/tp, bias: h
44
  num_parameters_attention_Q_weight = hidden_size * hidden_size / tp
@@ -51,10 +59,13 @@ def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp):
51
 
52
  return num_parameters_attention
53
 
54
- def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
55
  # MLP:
56
- # layernorm: 2h
57
- num_parameters_mlp = 2 * hidden_size
 
 
 
58
  # mlp1 weight: h*ffn/tp, bias: ffn/tp
59
  # mlp2 weight: ffn*h/tp, bias: h
60
  if act_func == "LLaMA":
@@ -68,16 +79,16 @@ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
68
 
69
  return num_parameters_mlp
70
 
71
- def Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp):
72
  if is_group_query == "False":
73
  group_query_num = head_num
74
  kv_hidden_size = hidden_size / head_num * group_query_num
75
 
76
  # input part
77
- num_parameters_input = Compute_Parameters_input(hidden_size, vocab_size, tp)
78
 
79
  # middle layers part
80
- num_parameters_attention = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp)
81
  num_parameters_mlp = Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp)
82
  num_parameters_in_single_layer = num_parameters_attention + num_parameters_mlp
83
  num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
@@ -135,9 +146,9 @@ def Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp):
135
 
136
  return master_weight_memory
137
 
138
- def Compute_Model_states(vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func,
139
  dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
140
- numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp)
141
 
142
  weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
143
  gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
@@ -289,14 +300,14 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
289
  return Error_message, record_df, count
290
 
291
  # get model states
292
- numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(vocab_size, layer_num, hidden_size,
293
  ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
294
 
295
  # get activation memory
296
  activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
297
 
298
  # get model parameters
299
- numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
300
  # get gpu number
301
  gpu_num = dp * tp * pp * cp
302
 
 
25
  return parameter / 1000**3
26
 
27
  # model states:
28
def Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp):
    """Parameter count of the model's input (embedding) layers.

    The word-embedding table (hidden_size * vocab_size) is sharded across
    the tensor-parallel group of size `tp`. LLaMA-style models use rotary
    position embeddings (RoPE, per this commit), which are not learned, so
    they contribute zero position-embedding parameters; every other model
    type carries a learned seq_length * hidden_size position table.
    """
    word_embedding_params = hidden_size * vocab_size / tp
    # RoPE has no learned table; classic absolute embeddings do.
    position_embedding_params = 0 if act_func == "LLaMA" else seq_length * hidden_size
    return word_embedding_params + position_embedding_params
37
 
38
  def Compute_Parameters_output(hidden_size, vocab_size, tp):
 
40
  num_parameters_output_embedding = 0 # due to sharedWordEmbedding
41
  return num_parameters_output_layernorm + num_parameters_output_embedding
42
 
43
+ def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp):
44
  # attention:
45
+ # layernorm: h/2h
46
+ if act_func == "LLaMA":
47
# NOTE(review): the pre-attention norm count is assigned to a variable named
# num_parameters_mlp inside the *attention* function — presumably it should be
# the attention layernorm term; verify it is folded into
# num_parameters_attention further down (the middle of this function is not
# visible in this diff hunk). RMSNorm has one gain vector of size h, hence h
# parameters vs LayerNorm's 2h (gain + bias).
+ num_parameters_mlp = hidden_size # RMSNorm
48
+ else:
49
+ num_parameters_mlp = 2 * hidden_size # LayerNorm
50
  # QKV weight: 3h*h/tp, bias: 3h/tp
51
  # output linear weight: h*h/tp, bias: h
52
  num_parameters_attention_Q_weight = hidden_size * hidden_size / tp
 
59
 
60
  return num_parameters_attention
61
 
62
+ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
63
  # MLP:
64
+ # layernorm: h/2h
65
+ if act_func == "LLaMA":
66
+ num_parameters_mlp = hidden_size # RMSNorm
67
+ else:
68
+ num_parameters_mlp = 2 * hidden_size # LayerNorm
69
  # mlp1 weight: h*ffn/tp, bias: ffn/tp
70
  # mlp2 weight: ffn*h/tp, bias: h
71
  if act_func == "LLaMA":
 
79
 
80
  return num_parameters_mlp
81
 
82
+ def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp):
83
  if is_group_query == "False":
84
  group_query_num = head_num
85
  kv_hidden_size = hidden_size / head_num * group_query_num
86

87
  # input part
88
# NOTE(review): Compute_Parameters_input's new signature in this same commit is
# (seq_length, hidden_size, vocab_size, act_func, tp), but this call passes
# only four arguments — act_func is missing, which would raise TypeError at
# runtime. Confirm the call site and pass act_func through.
+ num_parameters_input = Compute_Parameters_input(seq_length, hidden_size, vocab_size, tp)
89

90
  # middle layers part
91
+ num_parameters_attention = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp)
92
  num_parameters_mlp = Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp)
93
  num_parameters_in_single_layer = num_parameters_attention + num_parameters_mlp
94
  num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
 
146
 
147
  return master_weight_memory
148
 
149
+ def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func,
150
  dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
151
+ numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp)
152
 
153
  weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
154
  gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
 
300
  return Error_message, record_df, count
301
 
302
  # get model states
303
+ numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
304
  ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
305
 
306
  # get activation memory
307
  activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
308
 
309
  # get model parameters
310
+ numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
311
  # get gpu number
312
  gpu_num = dp * tp * pp * cp
313