Update modeling_densebackward_olmoe0125.py
modeling_densebackward_olmoe0125.py
CHANGED
@@ -80,7 +80,13 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
 
         # ---------- Dense estimation part ----------
         # Compute the dense outputs of all experts for all tokens, shape: (B*seq_len, num_experts, hidden_dim)
-
+        # Create an all-zero tensor and fill in only the activated experts' outputs
+        all_expert_outputs = torch.zeros((flat_hidden.size(0), self.num_experts, hidden_dim),
+                                         dtype=flat_hidden.dtype, device=flat_hidden.device)
+        # Fill in the activated experts' outputs
+        for i in range(flat_hidden.size(0)):
+            for expert_idx in activated_outputs[i].keys():
+                all_expert_outputs[i, expert_idx] = activated_outputs[i][expert_idx]
         # Convert selected_experts to a list: the activated experts for each token
         all_routing = selected_experts.tolist()  # length (B*seq_len)
 
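The new code fills `all_expert_outputs` token by token from `activated_outputs`, which (per the loop above) is a per-token dict mapping expert index to that expert's output vector. As a minimal sketch under that assumption, the same tensor can be built with one advanced-indexing write instead of per-element assignments; the helper name `fill_expert_outputs` and the toy values are hypothetical, not part of the file.

```python
import torch

def fill_expert_outputs(activated_outputs, num_tokens, num_experts, hidden_dim,
                        dtype=torch.float32, device="cpu"):
    """Hypothetical helper: build the (num_tokens, num_experts, hidden_dim) tensor,
    leaving zeros for experts that were not activated for a token."""
    all_expert_outputs = torch.zeros((num_tokens, num_experts, hidden_dim),
                                     dtype=dtype, device=device)
    # Gather (token, expert) index pairs and their vectors, then write them
    # in a single advanced-indexing assignment rather than one write per entry.
    token_idx, expert_idx, values = [], [], []
    for t, outputs in enumerate(activated_outputs):
        for e, vec in outputs.items():
            token_idx.append(t)
            expert_idx.append(e)
            values.append(vec)
    if values:
        all_expert_outputs[torch.tensor(token_idx, device=device),
                           torch.tensor(expert_idx, device=device)] = torch.stack(values)
    return all_expert_outputs

# Toy usage: 2 tokens, 4 experts, hidden size 3
acts = [{0: torch.ones(3), 2: torch.full((3,), 2.0)}, {1: torch.full((3,), 3.0)}]
dense = fill_expert_outputs(acts, num_tokens=2, num_experts=4, hidden_dim=3)
```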
@@ -130,9 +136,21 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
                     indices.append(idx)
             if indices:
                 selected_outputs = all_expert_outputs[indices, i, :]  # (n, hidden_dim)
-
+                # Average only the non-zero entries
+                mask = (selected_outputs.sum(dim=-1) != 0).to(selected_outputs.dtype).unsqueeze(-1)
+                if mask.sum() > 0:
+                    estimated = (selected_outputs * mask).sum(dim=0) / mask.sum()
+                else:
+                    # If everything is zero, return a zero vector
+                    estimated = torch.zeros_like(selected_outputs[0])
             else:
-
+                all_outputs = all_expert_outputs[:, i, :]
+                mask = (all_outputs.sum(dim=-1) != 0).to(all_outputs.dtype).unsqueeze(-1)
+                if mask.sum() > 0:
+                    estimated = (all_outputs * mask).sum(dim=0) / mask.sum()
+                else:
+                    # If everything is zero, return a zero vector
+                    estimated = torch.zeros_like(all_outputs[0])
             dense_parts[i] = estimated
         # Weighted sum of the expert outputs by gate_prob
         estimated_dense = 0
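Both branches of this hunk apply the same rule: average only the rows that are not all-zero, and fall back to a zero vector if nothing was filled in. A minimal standalone version of that rule, using only the operations that appear in the diff, could look like the sketch below; the function name `masked_mean` is not in the original file.

```python
import torch

def masked_mean(outputs: torch.Tensor) -> torch.Tensor:
    """Average the rows of `outputs` (shape (n, hidden_dim)) that are not all-zero;
    if every row is zero, return a zero vector, as in both branches above."""
    mask = (outputs.sum(dim=-1) != 0).to(outputs.dtype).unsqueeze(-1)  # (n, 1)
    if mask.sum() > 0:
        return (outputs * mask).sum(dim=0) / mask.sum()
    return torch.zeros_like(outputs[0])
```

One caveat of this test, inherited from the diff itself: a row whose elements happen to sum to exactly zero is treated as inactive, so the mask is a proxy for "this expert was not filled in" rather than an exact activation check.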
@@ -152,20 +170,48 @@ class DenseBackwardOLMoEForCausalLM(OlmoeForCausalLM):
     base_model_prefix = "olmoe"
 
     def __init__(self, config):
+        # Call the parent class initializer first
         super().__init__(config)
+
+        # Do not try to reassign self; instead, load the pretrained model and update the current model
+        pretrained_model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0125")
+
+        # Copy the pretrained model's state into the current model
+        self.config = pretrained_model.config
+        self.model = pretrained_model.model
+        self.vocab_size = pretrained_model.vocab_size
+        self.router_aux_loss_coef = pretrained_model.router_aux_loss_coef
+        self.num_experts = pretrained_model.num_experts
+        self.lm_head = pretrained_model.lm_head
+
         # Iterate over all decoder layers in the model and replace each OlmoeSparseMoeBlock with the DenseBackward version
         # This assumes the official model organizes its decoder layers in self.model.layers,
         # and that each layer's mlp module has a sparse_moe_block attribute.
         for layer in self.model.layers:
-            if hasattr(layer.mlp, "
-
+            if hasattr(layer.mlp, "gate"):
+                print("111")
+                orig_block = layer.mlp
                 # Create the new block by directly copying the original block's attributes
                 new_block = DenseBackwardOlmoeSparseMoeBlock(config)  # or other appropriate parameters
                 # Then manually copy the attributes that need to be shared:
                 new_block.gate = orig_block.gate
                 new_block.experts = orig_block.experts
-                new_block.router = orig_block.router
                 new_block.num_experts = orig_block.num_experts
                 new_block.top_k = orig_block.top_k
                 new_block.norm_topk_prob = orig_block.norm_topk_prob
-            layer.mlp
+                layer.mlp = new_block
+                print(type(layer.mlp))
+
+
+def main():
+    config = DenseBackwardOLMoEConfig(  # official model parameters
+        model_marker="DenseBackward_olmoe_marker",
+    )
+    # Create an instance of the custom model
+    model = DenseBackwardOLMoEForCausalLM(config)
+    print(type(model))
+    print(type(model.model))
+    print(type(model.model.layers[0]))
+    print(type(model.model.layers[0].mlp))
+    print(type(model.model.layers[0].mlp.experts))
+if __name__ == "__main__":
+    main()
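Since `__init__` now loads the pretrained weights and then swaps each qualifying `mlp` in place, a quick check that the swap took effect is to instantiate the model and inspect the layer types, roughly as `main()` does with its prints. The sketch below assumes it runs in this same module (so the class names resolve), that the checkpoint download in `__init__` succeeds, and that every layer's `mlp` exposes a `gate` attribute as the `hasattr` test expects.

```python
# Sketch: confirm every decoder layer now carries the DenseBackward block
config = DenseBackwardOLMoEConfig(model_marker="DenseBackward_olmoe_marker")
model = DenseBackwardOLMoEForCausalLM(config)
assert all(isinstance(layer.mlp, DenseBackwardOlmoeSparseMoeBlock)
           for layer in model.model.layers)
```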