Update app.py. [RoPE, RMSNorm]
Browse files
app.py
CHANGED
@@ -25,9 +25,14 @@ def Get_BillionParameter(parameter):
|
|
25 |
return parameter / 1000**3
|
26 |
|
27 |
# model states:
|
28 |
-
def Compute_Parameters_input(hidden_size, vocab_size, tp):
|
29 |
num_parameters_word_embedding = hidden_size * vocab_size / tp
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
31 |
return num_parameters_word_embedding + num_parameters_position_embedding
|
32 |
|
33 |
def Compute_Parameters_output(hidden_size, vocab_size, tp):
|
@@ -35,10 +40,13 @@ def Compute_Parameters_output(hidden_size, vocab_size, tp):
|
|
35 |
num_parameters_output_embedding = 0 # due to sharedWordEmbedding
|
36 |
return num_parameters_output_layernorm + num_parameters_output_embedding
|
37 |
|
38 |
-
def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp):
|
39 |
# attention:
|
40 |
-
# layernorm: 2h
|
41 |
-
|
|
|
|
|
|
|
42 |
# QKV weight: 3h*h/tp, bias: 3h/tp
|
43 |
# output linear weight: h*h/tp, bias: h
|
44 |
num_parameters_attention_Q_weight = hidden_size * hidden_size / tp
|
@@ -51,10 +59,13 @@ def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp):
|
|
51 |
|
52 |
return num_parameters_attention
|
53 |
|
54 |
-
def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func,
|
55 |
# MLP:
|
56 |
-
# layernorm: 2h
|
57 |
-
|
|
|
|
|
|
|
58 |
# mlp1 weight: h*ffn/tp, bias: ffn/tp
|
59 |
# mlp2 weight: ffn*h/tp, bias: h
|
60 |
if act_func == "LLaMA":
|
@@ -68,16 +79,16 @@ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
|
|
68 |
|
69 |
return num_parameters_mlp
|
70 |
|
71 |
-
def Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp):
|
72 |
if is_group_query == "False":
|
73 |
group_query_num = head_num
|
74 |
kv_hidden_size = hidden_size / head_num * group_query_num
|
75 |
|
76 |
# input part
|
77 |
-
num_parameters_input = Compute_Parameters_input(hidden_size, vocab_size, tp)
|
78 |
|
79 |
# middle layers part
|
80 |
-
num_parameters_attention = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp)
|
81 |
num_parameters_mlp = Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp)
|
82 |
num_parameters_in_single_layer = num_parameters_attention + num_parameters_mlp
|
83 |
num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
|
@@ -135,9 +146,9 @@ def Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp):
|
|
135 |
|
136 |
return master_weight_memory
|
137 |
|
138 |
-
def Compute_Model_states(vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func,
|
139 |
dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
|
140 |
-
numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp)
|
141 |
|
142 |
weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
|
143 |
gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
|
@@ -289,14 +300,14 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
|
|
289 |
return Error_message, record_df, count
|
290 |
|
291 |
# get model states
|
292 |
-
numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(vocab_size, layer_num, hidden_size,
|
293 |
ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
|
294 |
|
295 |
# get activation memory
|
296 |
activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
|
297 |
|
298 |
# get model parameters
|
299 |
-
numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
|
300 |
# get gpu number
|
301 |
gpu_num = dp * tp * pp * cp
|
302 |
|
|
|
25 |
return parameter / 1000**3
|
26 |
|
27 |
# model states:
|
28 |
+
def Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp):
|
29 |
num_parameters_word_embedding = hidden_size * vocab_size / tp
|
30 |
+
# position embedding
|
31 |
+
if act_func == "LLaMA":
|
32 |
+
num_parameters_position_embedding = 0
|
33 |
+
else:
|
34 |
+
num_parameters_position_embedding = seq_length * hidden_size
|
35 |
+
|
36 |
return num_parameters_word_embedding + num_parameters_position_embedding
|
37 |
|
38 |
def Compute_Parameters_output(hidden_size, vocab_size, tp):
|
|
|
40 |
num_parameters_output_embedding = 0 # due to sharedWordEmbedding
|
41 |
return num_parameters_output_layernorm + num_parameters_output_embedding
|
42 |
|
43 |
+
def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp):
|
44 |
# attention:
|
45 |
+
# layernorm: h/2h
|
46 |
+
if act_func == "LLaMA":
|
47 |
+
num_parameters_mlp = hidden_size # RMSNorm
|
48 |
+
else:
|
49 |
+
num_parameters_mlp = 2 * hidden_size # LayerNorm
|
50 |
# QKV weight: 3h*h/tp, bias: 3h/tp
|
51 |
# output linear weight: h*h/tp, bias: h
|
52 |
num_parameters_attention_Q_weight = hidden_size * hidden_size / tp
|
|
|
59 |
|
60 |
return num_parameters_attention
|
61 |
|
62 |
+
def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
|
63 |
# MLP:
|
64 |
+
# layernorm: h/2h
|
65 |
+
if act_func == "LLaMA":
|
66 |
+
num_parameters_mlp = hidden_size # RMSNorm
|
67 |
+
else:
|
68 |
+
num_parameters_mlp = 2 * hidden_size # LayerNorm
|
69 |
# mlp1 weight: h*ffn/tp, bias: ffn/tp
|
70 |
# mlp2 weight: ffn*h/tp, bias: h
|
71 |
if act_func == "LLaMA":
|
|
|
79 |
|
80 |
return num_parameters_mlp
|
81 |
|
82 |
+
def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp):
|
83 |
if is_group_query == "False":
|
84 |
group_query_num = head_num
|
85 |
kv_hidden_size = hidden_size / head_num * group_query_num
|
86 |
|
87 |
# input part
|
88 |
+
num_parameters_input = Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp)
|
89 |
|
90 |
# middle layers part
|
91 |
+
num_parameters_attention = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp)
|
92 |
num_parameters_mlp = Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp)
|
93 |
num_parameters_in_single_layer = num_parameters_attention + num_parameters_mlp
|
94 |
num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp
|
|
|
146 |
|
147 |
return master_weight_memory
|
148 |
|
149 |
+
def Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func,
|
150 |
dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
|
151 |
+
numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, tp, pp)
|
152 |
|
153 |
weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
|
154 |
gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
|
|
|
300 |
return Error_message, record_df, count
|
301 |
|
302 |
# get model states
|
303 |
+
numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(seq_length, vocab_size, layer_num, hidden_size,
|
304 |
ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
|
305 |
|
306 |
# get activation memory
|
307 |
activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
|
308 |
|
309 |
# get model parameters
|
310 |
+
numParametersTotal = Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
|
311 |
# get gpu number
|
312 |
gpu_num = dp * tp * pp * cp
|
313 |
|