autoprogrammer committed · verified
Commit f23ed8f · 1 Parent(s): b8b0146

Update modeling_densebackward_olmoe0125.py

Files changed (1): modeling_densebackward_olmoe0125.py (+53 -7)
modeling_densebackward_olmoe0125.py CHANGED
@@ -80,7 +80,13 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
 
         # ---------- Dense estimation ----------
         # Compute every expert's dense output for every token, shape: (B*seq_len, num_experts, hidden_dim)
-        all_expert_outputs = torch.stack([expert(flat_hidden) for expert in self.experts], dim=1)
+        # Create an all-zeros tensor and fill in only the activated experts' outputs
+        all_expert_outputs = torch.zeros((flat_hidden.size(0), self.num_experts, hidden_dim),
+                                         dtype=flat_hidden.dtype, device=flat_hidden.device)
+        # Fill in the activated experts' outputs
+        for i in range(flat_hidden.size(0)):
+            for expert_idx in activated_outputs[i].keys():
+                all_expert_outputs[i, expert_idx] = activated_outputs[i][expert_idx]
         # Convert selected_experts to a list: each token's list of activated experts
         all_routing = selected_experts.tolist()  # length (B*seq_len)
 
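The commit replaces the dense torch.stack over all experts with a zeros tensor filled token by token in Python. A minimal vectorized sketch of the same fill, assuming a hypothetical tensor topk_outputs of shape (num_tokens, top_k, hidden_dim) holding the activated experts' outputs for the routing in selected_experts:

import torch

num_tokens, num_experts, top_k, hidden_dim = 8, 4, 2, 16
topk_outputs = torch.randn(num_tokens, top_k, hidden_dim)              # stand-in for activated_outputs
selected_experts = torch.randint(0, num_experts, (num_tokens, top_k))  # stand-in routing

all_expert_outputs = torch.zeros(num_tokens, num_experts, hidden_dim)
# Scatter each top-k output into its expert's slot in a single call
index = selected_experts.unsqueeze(-1).expand(-1, -1, hidden_dim)
all_expert_outputs.scatter_(1, index, topk_outputs)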
 
@@ -130,9 +136,21 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
                 indices.append(idx)
             if indices:
                 selected_outputs = all_expert_outputs[indices, i, :]  # (n, hidden_dim)
-                estimated = selected_outputs.mean(dim=0)
+                # Average only the non-zero rows
+                mask = (selected_outputs.sum(dim=-1) != 0).to(selected_outputs.dtype).unsqueeze(-1)
+                if mask.sum() > 0:
+                    estimated = (selected_outputs * mask).sum(dim=0) / mask.sum()
+                else:
+                    # If everything is zero, return a zero vector
+                    estimated = torch.zeros_like(selected_outputs[0])
             else:
-                estimated = all_expert_outputs[:, i, :].mean(dim=0)
+                all_outputs = all_expert_outputs[:, i, :]
+                mask = (all_outputs.sum(dim=-1) != 0).to(all_outputs.dtype).unsqueeze(-1)
+                if mask.sum() > 0:
+                    estimated = (all_outputs * mask).sum(dim=0) / mask.sum()
+                else:
+                    # If everything is zero, return a zero vector
+                    estimated = torch.zeros_like(all_outputs[0])
             dense_parts[i] = estimated
         # Weight each expert's output by gate_prob and sum
         estimated_dense = 0
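The masked mean above averages only rows that are not exactly zero, i.e. experts whose outputs were actually filled in. A standalone sketch of that logic; note that a genuine output summing to zero (e.g. [1., -1.]) would also be masked out, so testing .abs().sum(dim=-1) > 0 would be a safer criterion:

import torch

outputs = torch.tensor([[1.0, 3.0],
                        [0.0, 0.0],   # never-computed expert: an all-zero row
                        [5.0, 7.0]])
mask = (outputs.sum(dim=-1) != 0).to(outputs.dtype).unsqueeze(-1)  # (3, 1)
mean_nonzero = (outputs * mask).sum(dim=0) / mask.sum()
print(mean_nonzero)  # tensor([3., 5.]) -- the zero row is ignored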
@@ -152,20 +170,48 @@ class DenseBackwardOLMoEForCausalLM(OlmoeForCausalLM):
     base_model_prefix = "olmoe"
 
     def __init__(self, config):
+        # First, call the parent-class initializer
         super().__init__(config)
+
+        # Don't try to reassign self; instead, load a pretrained model and update the current model
+        pretrained_model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0125")
+
+        # Copy the pretrained model's state into the current model
+        self.config = pretrained_model.config
+        self.model = pretrained_model.model
+        self.vocab_size = pretrained_model.vocab_size
+        self.router_aux_loss_coef = pretrained_model.router_aux_loss_coef
+        self.num_experts = pretrained_model.num_experts
+        self.lm_head = pretrained_model.lm_head
+
         # Walk all decoder layers in the model, replacing each OlmoeSparseMoeBlock with the DenseBackward version.
         # This assumes the official model organizes its decoder layers in self.model.layers,
         # and that each layer's mlp module contains the attribute sparse_moe_block.
         for layer in self.model.layers:
-            if hasattr(layer.mlp, "sparse_moe_block"):
-                orig_block = layer.mlp.sparse_moe_block
+            if hasattr(layer.mlp, "gate"):
+                print("111")
+                orig_block = layer.mlp
                 # Create the new block by directly copying the original block's attributes
                 new_block = DenseBackwardOlmoeSparseMoeBlock(config)  # or other appropriate arguments
                 # Then manually copy the attributes that need to be shared:
                 new_block.gate = orig_block.gate
                 new_block.experts = orig_block.experts
-                new_block.router = orig_block.router
                 new_block.num_experts = orig_block.num_experts
                 new_block.top_k = orig_block.top_k
                 new_block.norm_topk_prob = orig_block.norm_topk_prob
-                layer.mlp.sparse_moe_block = new_block
+                layer.mlp = new_block
+                print(type(layer.mlp))
+
+def main():
+    config = DenseBackwardOLMoEConfig(  # official model parameters
+        model_marker="DenseBackward_olmoe_marker",
+    )
+    # Create an instance of the custom model
+    model = DenseBackwardOLMoEForCausalLM(config)
+    print(type(model))
+    print(type(model.model))
+    print(type(model.model.layers[0]))
+    print(type(model.model.layers[0].mlp))
+    print(type(model.model.layers[0].mlp.experts))
+
+if __name__ == "__main__":
+    main()
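A hypothetical smoke test for this commit (class names are taken from this file; the module import path is an assumption): build the model, check that every layer's MLP was swapped for the DenseBackward block, and run one forward pass:

import torch
from modeling_densebackward_olmoe0125 import (   # assumed import path
    DenseBackwardOLMoEConfig,
    DenseBackwardOLMoEForCausalLM,
    DenseBackwardOlmoeSparseMoeBlock,
)

config = DenseBackwardOLMoEConfig(model_marker="DenseBackward_olmoe_marker")
model = DenseBackwardOLMoEForCausalLM(config)
assert all(isinstance(layer.mlp, DenseBackwardOlmoeSparseMoeBlock)
           for layer in model.model.layers)

input_ids = torch.randint(0, model.config.vocab_size, (1, 8))
with torch.no_grad():
    logits = model(input_ids).logits
print(logits.shape)  # expected: (1, 8, vocab_size)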