autoprogrammer
/

olmoe_densebackward

Safetensors

olmoe

custom_code

Model card | Files and versions | Community

autoprogrammer committed on Mar 17

Commit

370ac60

verified ·

1 Parent(s): 56d7a8b

Update modeling_densebackward_olmoe.py

Browse files

Files changed (1) hide show

modeling_densebackward_olmoe.py +80 -78

modeling_densebackward_olmoe.py CHANGED Viewed

@@ -10,42 +10,11 @@ from .configuration_densebackward_olmoe import DenseBackwardOLMoEConfig
 class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
-    """
-    继承自官方 OlmoeSparseMoeBlock，实现 dense backward 功能：
-    前向输出依旧保持与官方相同（即稀疏计算结果），
-    但在反向传播时，通过直通梯度让 dense 计算的梯度传递回来，
-    dense 输出通过对每个专家在所有 token 上进行计算，并利用全 routing 权重加权获得。
-    输入：
-        hidden_states: Tensor, shape (batch_size, sequence_length, hidden_dim)
-    输出：
-        final_output: Tensor, shape (batch_size, sequence_length, hidden_dim)
-        router_logits: Tensor, shape (batch_size * sequence_length, num_experts)
-    """
     def forward(self, hidden_states: torch.Tensor):
-        """
-        输入:
-        hidden_states: Tensor, shape (batch_size, sequence_length, hidden_dim)
-        输出:
-            final_output: Tensor, shape (batch_size, sequence_length, hidden_dim)
-            router_logits: Tensor, shape (batch_size * sequence_length, num_experts)
-        实现思路：
-          1. 将输入展平为 (B*seq_len, hidden_dim)，通过 self.gate 得到 router_logits，
-             并计算全专家的 routing 权重（softmax 后）。
-          2. 对 routing 权重取 top-k，得到 routing_weights_topk 与 selected_experts；
-             如配置要求，归一化 top-k 概率。
-          3. 稀疏计算部分：仅计算每个 token 对于 top-k 专家的输出，
-             并累加得到 sparse_output（保留原版计算流程，同时记录激活专家的实际输出）。
-          4. Dense 估计部分：先计算所有专家对所有 token 的输出（all_expert_outputs），
-             再逐 token 调用 estimate_dense_output 得到 dense 输出（dense_estimated）。
-          5. 使用直通梯度技巧：前向输出用 sparse_output，但梯度来源于 dense_estimated。
-          6. 最后 reshape 为 (batch_size, sequence_length, hidden_dim) 并返回 final_output 及 router_logits.
-        """
-        #determine the shape of hidden_states
         batch_size, seq_length, hidden_dim = hidden_states.shape
         flat_hidden = hidden_states.view(-1, hidden_dim)  # (B*seq_len, hidden_dim)
-        # 计算路由 logits 和全专家 routing 权重
         router_logits = self.gate(flat_hidden)  # (B*seq_len, num_experts)
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)  # (B*seq_len, num_experts)
@@ -56,14 +25,20 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
         routing_weights_topk = routing_weights_topk.to(flat_hidden.dtype)
         # ---------- 稀疏计算部分 ----------
-        # 初始化稀疏输出，shape: (B*seq_len, hidden_dim)
         sparse_output = torch.zeros((flat_hidden.size(0), hidden_dim), dtype=flat_hidden.dtype, device=flat_hidden.device)
-        # 用于记录每个 token 对激活专家的实际输出
-        activated_outputs = [{} for _ in range(flat_hidden.size(0))]
-        # one-hot 编码 top-k 专家，shape: (B*seq_len, top_k, num_experts)
         expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts)  # (B*seq_len, top_k, num_experts)
         expert_mask = expert_mask.permute(2, 1, 0)  # (num_experts, top_k, B*seq_len)
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
@@ -73,71 +48,98 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
                 weight = routing_weights_topk[top_x, idx].unsqueeze(-1)  # (n, 1)
                 weighted_output = current_output * weight
                 sparse_output.index_add_(0, top_x, weighted_output.to(flat_hidden.dtype))
-                # 保存当前 token 对该专家的实际输出
                 for pos, token_idx in enumerate(top_x.tolist()):
-                    activated_outputs[token_idx][expert_idx] = current_output[pos]
         # ---------- 稀疏计算结束 ----------
         # ---------- Dense估计部分 ----------
-        # 计算所有专家对所有 token 的 dense 输出，shape: (B*seq_len, num_experts, hidden_dim)
-        all_expert_outputs = torch.stack([expert(flat_hidden) for expert in self.experts], dim=1)
-        # 将 selected_experts 转换为 list，每个 token 的激活专家列表
         all_routing = selected_experts.tolist()  # 长度为 (B*seq_len)
         dense_outputs = []
-        for i in range(flat_hidden.size(0)):
-            dense_est = self.estimate_dense_output(
-                token_idx=i,
-                activated=all_routing[i],              # 当前 token 激活的专家列表，例如 [a, b]
-                gate_prob=routing_weights[i],            # 当前 token 的完整 routing 权重 (num_experts,)
-                activated_outputs=activated_outputs[i],  # 当前 token 对激活专家的实际输出
-                all_routing=all_routing,                 # 全 batch 每个 token 的激活专家列表（list of lists）
-                all_expert_outputs=all_expert_outputs      # (B*seq_len, num_experts, hidden_dim)
             )
             dense_outputs.append(dense_est.unsqueeze(0))
         dense_outputs = torch.cat(dense_outputs, dim=0)  # (B*seq_len, hidden_dim)
         # ---------- Dense估计结束 ----------
-        # 使用直通梯度：前向输出用稀疏结果，但反向传播时梯度来源于 dense 估计
         final_flat = sparse_output.detach() + (dense_outputs - dense_outputs.detach())
         final_output = final_flat.view(batch_size, seq_length, hidden_dim)
         return final_output, router_logits
-    def estimate_dense_output(self, token_idx, activated, gate_prob, activated_outputs, all_routing, all_expert_outputs):
         """
-        对于当前 token，根据 mini-batch 中的信息估计 dense 输出。
-        参数：
-            token_idx: 当前 token 的索引（标量）
-            activated: 当前 token 激活的专家列表，例如 [1, 3]
-            gate_prob: 当前 token 的 routing 权重，形状 (num_experts,)
-            activated_outputs: dict，当前 token 对激活专家的实际输出，形状 (hidden_dim,)
-            all_routing: list，每个 token 的激活专家列表（长度为 N，每个元素为 list）
-            all_expert_outputs: Tensor, (N, num_experts, hidden_dim)
-        返回：
-            estimated_dense: Tensor, (hidden_dim,)
         """
         num_experts = gate_prob.size(0)
         dense_parts = {}
-        # 对于激活的专家，直接使用其实际输出
-        for idx in activated:
-            dense_parts[idx] = activated_outputs[idx]
-        # 对于未激活的专家，使用 mini-batch 中其他 token 的输出估计
         non_activated = [i for i in range(num_experts) if i not in activated]
-        for i in non_activated:
-            indices = []
-            for idx, r_dec in enumerate(all_routing):
-                if (i in r_dec) and (len(set(r_dec) & set(activated)) > 0):
-                    indices.append(idx)
-            if indices:
-                selected_outputs = all_expert_outputs[indices, i, :]  # (n, hidden_dim)
-                estimated = selected_outputs.mean(dim=0)
             else:
-                estimated = all_expert_outputs[:, i, :].mean(dim=0)
-            dense_parts[i] = estimated
-        # 按 gate_prob 加权求和各专家输出
         estimated_dense = 0
-        for i in range(num_experts):
-            estimated_dense += gate_prob[i] * dense_parts[i]
         return estimated_dense

 class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
     def forward(self, hidden_states: torch.Tensor):
         batch_size, seq_length, hidden_dim = hidden_states.shape
         flat_hidden = hidden_states.view(-1, hidden_dim)  # (B*seq_len, hidden_dim)
+        # 计算路由 logits 和 routing 权重
         router_logits = self.gate(flat_hidden)  # (B*seq_len, num_experts)
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)  # (B*seq_len, num_experts)
         routing_weights_topk = routing_weights_topk.to(flat_hidden.dtype)
         # ---------- 稀疏计算部分 ----------
+        # 初始化稀疏输出
         sparse_output = torch.zeros((flat_hidden.size(0), hidden_dim), dtype=flat_hidden.dtype, device=flat_hidden.device)
+        # 存储所有激活信息的数据结构
+        num_tokens = flat_hidden.size(0)
+        all_activated_outputs = {}  # {expert_idx: {token_idx: output_tensor}}
+        all_routing_indices = {}    # {expert_idx: [token_indices]}
+        token_activated_experts = {}  # {token_idx: [activated_expert_indices]}
+        # one-hot 编码 top-k 专家
         expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts)  # (B*seq_len, top_k, num_experts)
         expert_mask = expert_mask.permute(2, 1, 0)  # (num_experts, top_k, B*seq_len)
+        # 稀疏计算，同时记录激活情况
         for expert_idx in range(self.num_experts):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
                 weight = routing_weights_topk[top_x, idx].unsqueeze(-1)  # (n, 1)
                 weighted_output = current_output * weight
                 sparse_output.index_add_(0, top_x, weighted_output.to(flat_hidden.dtype))
+                # 记录该专家激活的token和对应输出
+                all_activated_outputs[expert_idx] = {}
+                all_routing_indices[expert_idx] = top_x.tolist()
                 for pos, token_idx in enumerate(top_x.tolist()):
+                    # 记录该专家对该token的输出
+                    all_activated_outputs[expert_idx][token_idx] = current_output[pos]
+                    # 记录该token激活的专家
+                    if token_idx not in token_activated_experts:
+                        token_activated_experts[token_idx] = []
+                    token_activated_experts[token_idx].append(expert_idx)
         # ---------- 稀疏计算结束 ----------
         # ---------- Dense估计部分 ----------
+        # 将activated_experts 转换为list格式，与路由权重匹配
         all_routing = selected_experts.tolist()  # 长度为 (B*seq_len)
+        # 使用已激活信息估计dense输出
         dense_outputs = []
+        for token_idx in range(num_tokens):
+            # 获取当前token的激活专家列表
+            activated = all_routing[token_idx] if token_idx in token_activated_experts else []
+            # 估计dense输出（只使用已经计算过的专家输出）
+            dense_est = self.estimate_dense_output_efficient(
+                token_idx=token_idx,
+                activated=activated,
+                gate_prob=routing_weights[token_idx],
+                all_activated_outputs=all_activated_outputs,
+                all_routing_indices=all_routing_indices,
+                token_activated_experts=token_activated_experts
             )
             dense_outputs.append(dense_est.unsqueeze(0))
         dense_outputs = torch.cat(dense_outputs, dim=0)  # (B*seq_len, hidden_dim)
         # ---------- Dense估计结束 ----------
+        # 使用直通梯度技巧
         final_flat = sparse_output.detach() + (dense_outputs - dense_outputs.detach())
         final_output = final_flat.view(batch_size, seq_length, hidden_dim)
         return final_output, router_logits
+    def estimate_dense_output_efficient(self, token_idx, activated, gate_prob,
+                                       all_activated_outputs, all_routing_indices, token_activated_experts):
         """
+        优化版本的dense输出估计，只使用已计算的专家输出
         """
         num_experts = gate_prob.size(0)
         dense_parts = {}
+        # 对于激活的专家，直接使用其输出
+        for expert_idx in activated:
+            if expert_idx in all_activated_outputs and token_idx in all_activated_outputs[expert_idx]:
+                dense_parts[expert_idx] = all_activated_outputs[expert_idx][token_idx]
+        # 对于未激活的专家，使用其他token的激活输出估计
         non_activated = [i for i in range(num_experts) if i not in activated]
+        for expert_idx in non_activated:
+            # 如果该专家没有被任何token激活，跳过
+            if expert_idx not in all_routing_indices or not all_routing_indices[expert_idx]:
+                # 使用零向量或平均值作为估计
+                dense_parts[expert_idx] = torch.zeros_like(next(iter(dense_parts.values()))) if dense_parts else 0
+                continue
+            # 找出激活了该专家的token，并且这些token也激活了当前token激活的某些专家
+            candidate_tokens = []
+            for other_token in all_routing_indices[expert_idx]:
+                # 检查other_token是否与当前token共享某些激活专家
+                if other_token in token_activated_experts:
+                    common_experts = set(activated) & set(token_activated_experts[other_token])
+                    if common_experts:
+                        candidate_tokens.append(other_token)
+            # 如果找到了候选token，使用它们的输出平均值
+            if candidate_tokens:
+                expert_outputs = [all_activated_outputs[expert_idx][t] for t in candidate_tokens]
+                estimated = torch.stack(expert_outputs).mean(dim=0)
             else:
+                # 找不到合适的候选，使用所有激活了该专家的token
+                expert_outputs = [all_activated_outputs[expert_idx][t] for t in all_routing_indices[expert_idx]]
+                estimated = torch.stack(expert_outputs).mean(dim=0)
+            dense_parts[expert_idx] = estimated
+        # 按路由权重加权求和
         estimated_dense = 0
+        for expert_idx in range(num_experts):
+            if expert_idx in dense_parts:
+                estimated_dense += gate_prob[expert_idx] * dense_parts[expert_idx]
         return estimated_dense