Update modeling_densebackward_olmoe0125.py
modeling_densebackward_olmoe0125.py
CHANGED
@@ -30,13 +30,13 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
 
         # Compute router logits
         router_logits = self.gate(flat_hidden)  # (B*seq_len, num_experts)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=flat_hidden.dtype)  # (B*seq_len, num_experts)
 
         # Select top-k experts
         routing_weights_topk, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
         if self.norm_topk_prob:
             routing_weights_topk = routing_weights_topk / routing_weights_topk.sum(dim=-1, keepdim=True)
-
+        # No dtype conversion is needed here: routing_weights already uses flat_hidden.dtype
 
         # ---------- Actually compute all expert outputs (dense computation) ----------
         all_expert_outputs = torch.zeros((N_tokens, self.num_experts, hidden_dim),
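For reference, below is a minimal standalone sketch of the routing path after this change. The toy nn.Linear gate, the random activations, and the plain variables num_experts / top_k / norm_topk_prob / hidden_dim are illustrative assumptions standing in for the block's real attributes; only the softmax / top-k logic mirrors the lines in this diff.

# Minimal sketch of the updated routing step (assumed toy sizes and modules,
# not the repo's actual configuration).
import torch
import torch.nn.functional as F

torch.manual_seed(0)

num_experts, top_k, hidden_dim = 8, 2, 16    # assumed toy sizes
norm_topk_prob = True

gate = torch.nn.Linear(hidden_dim, num_experts, bias=False)   # stands in for self.gate
flat_hidden = torch.randn(4, hidden_dim)                      # (B*seq_len, hidden_dim)

router_logits = gate(flat_hidden)                             # (B*seq_len, num_experts)
# Softmax computed directly in the activations' dtype, as in the updated line
routing_weights = F.softmax(router_logits, dim=1, dtype=flat_hidden.dtype)

routing_weights_topk, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
if norm_topk_prob:
    routing_weights_topk = routing_weights_topk / routing_weights_topk.sum(dim=-1, keepdim=True)

# No cast back is needed: the top-k weights already share flat_hidden's dtype
assert routing_weights_topk.dtype == flat_hidden.dtype

# Dense part (sketch only): one output slot per (token, expert), matching the
# allocation where this hunk ends.
N_tokens = flat_hidden.shape[0]
all_expert_outputs = torch.zeros((N_tokens, num_experts, hidden_dim), dtype=flat_hidden.dtype)
print(selected_experts.shape, all_expert_outputs.shape)       # (4, 2) and (4, 8, 16)

The practical effect of the change is that routing_weights is born in the activations' dtype, so the separate cast back that the comment added in this diff refers to can simply be dropped.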