Update modeling_densebackward_olmoe0125.py
modeling_densebackward_olmoe0125.py
CHANGED
@@ -10,6 +10,7 @@ from .configuration_densebackward_olmoe0125 import DenseBackwardOLMoEConfig
 
 
 class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
+
     """
     Inherits from the official OlmoeSparseMoeBlock and implements dense-backward behavior:
     the forward output is still identical to the official (sparse) computation,
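The docstring pins down the core contract: the forward value is the ordinary sparse MoE result, while the gradient flows through a dense estimate. A minimal, self-contained sketch (toy tensors, not from this repository) of the straight-through combination used at the end of forward():

```python
import torch

# Forward value equals sparse_output; gradients flow only through dense_outputs.
sparse_output = torch.randn(4, 8, requires_grad=True)
dense_outputs = torch.randn(4, 8, requires_grad=True)

final_flat = sparse_output.detach() + (dense_outputs - dense_outputs.detach())

assert torch.equal(final_flat, sparse_output.detach())  # value is the sparse result
final_flat.sum().backward()
assert sparse_output.grad is None                # the sparse path carries no gradient
assert torch.all(dense_outputs.grad == 1.0)      # the dense path carries all of it
```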
@@ -23,48 +24,27 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
         router_logits: Tensor, shape (batch_size * sequence_length, num_experts)
     """
     def forward(self, hidden_states: torch.Tensor):
-        """
-        Input:
-            hidden_states: Tensor, shape (batch_size, sequence_length, hidden_dim)
-        Output:
-            final_output: Tensor, shape (batch_size, sequence_length, hidden_dim)
-            router_logits: Tensor, shape (batch_size * sequence_length, num_experts)
-        Approach:
-        1. Flatten the input to (B*seq_len, hidden_dim), pass it through self.gate to get
-           router_logits, and compute routing weights over all experts (after softmax).
-        2. Take the top-k of the routing weights to obtain routing_weights_topk and
-           selected_experts; normalize the top-k probabilities if the config requires it.
-        3. Sparse part: compute each token's output only for its top-k experts and
-           accumulate into sparse_output (keeps the original computation flow while also
-           recording the activated experts' actual outputs).
-        4. Dense-estimation part: first compute every expert's output for every token
-           (all_expert_outputs), then call estimate_dense_output per token to obtain the
-           dense output (dense_estimated).
-        5. Straight-through trick: the forward output is sparse_output, but the gradient
-           comes from dense_estimated.
-        6. Finally reshape to (batch_size, sequence_length, hidden_dim) and return
-           final_output and router_logits.
-        """
-        # determine the shape of hidden_states
         batch_size, seq_length, hidden_dim = hidden_states.shape
         flat_hidden = hidden_states.view(-1, hidden_dim)  # (B*seq_len, hidden_dim)
 
-        # compute the router logits and the routing weights over all experts
         router_logits = self.gate(flat_hidden)  # (B*seq_len, num_experts)
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)  # (B*seq_len, num_experts)
 
-        # top-k selection
         routing_weights_topk, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
         if self.norm_topk_prob:
             routing_weights_topk = routing_weights_topk / routing_weights_topk.sum(dim=-1, keepdim=True)
         routing_weights_topk = routing_weights_topk.to(flat_hidden.dtype)
 
         # ---------- sparse computation ----------
-        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts)
+        sparse_output = torch.zeros((flat_hidden.size(0), hidden_dim),
+                                    dtype=flat_hidden.dtype, device=flat_hidden.device)
+        # tensor storage for each token's output per expert: (B*seq_len, num_experts, hidden_dim)
+        activated_outputs_tensor = torch.zeros((flat_hidden.size(0), self.num_experts, hidden_dim),
+                                               dtype=flat_hidden.dtype, device=flat_hidden.device)
+        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts)
         expert_mask = expert_mask.permute(2, 1, 0)  # (num_experts, top_k, B*seq_len)
 
-        for expert_idx in range(self.num_experts):
+        for expert_idx in tqdm(range(self.num_experts), desc="modified version: expert loop"):
             expert_layer = self.experts[expert_idx]
             idx, top_x = torch.where(expert_mask[expert_idx])
             if top_x.numel() > 0:
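As a quick sanity check on the gating block above, here is the same softmax → top-k → renormalize sequence on hypothetical logits (3 tokens, 4 experts, with top_k = 2 and norm_topk_prob assumed true):

```python
import torch
import torch.nn.functional as F

router_logits = torch.tensor([[2.0, 1.0, 0.5, 0.1],
                              [0.1, 3.0, 0.2, 2.5],
                              [0.3, 0.2, 1.5, 0.9]])
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)

routing_weights_topk, selected_experts = torch.topk(routing_weights, 2, dim=-1)
# norm_topk_prob=True: the kept probabilities are rescaled to sum to 1 per token
routing_weights_topk = routing_weights_topk / routing_weights_topk.sum(dim=-1, keepdim=True)

print(selected_experts)                  # tensor([[0, 1], [1, 3], [2, 3]])
print(routing_weights_topk.sum(dim=-1))  # tensor([1., 1., 1.])
```

Only the renormalized top-k weights enter sparse_output; the full softmax (routing_weights) is reused later as the gate probabilities for the dense estimate.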
@@ -73,75 +53,77 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
                 weight = routing_weights_topk[top_x, idx].unsqueeze(-1)  # (n, 1)
                 weighted_output = current_output * weight
                 sparse_output.index_add_(0, top_x, weighted_output.to(flat_hidden.dtype))
-                activated_outputs[token_idx][expert_idx] = current_output[pos]
+                # store directly into the tensor: the activated tokens' outputs for this expert
+                activated_outputs_tensor[top_x, expert_idx, :] = current_output
         # ---------- end of sparse computation ----------
 
-        # ---------- dense estimation ----------
-        all_expert_outputs = torch.zeros((flat_hidden.size(0), self.num_experts, hidden_dim),
-                                         dtype=flat_hidden.dtype, device=flat_hidden.device)
-        # fill in the activated experts' outputs
-        for i in range(flat_hidden.size(0)):
-            for expert_idx in activated_outputs[i].keys():
-                all_expert_outputs[i, expert_idx] = activated_outputs[i][expert_idx]
-        # convert selected_experts into a list of activated experts per token
-        all_routing = selected_experts.tolist()  # length (B*seq_len)
+        # ---------- dense estimation (vectorized; activated experts use their real outputs) ----------
+        all_expert_outputs = activated_outputs_tensor  # (B*seq_len, num_experts, hidden_dim)
+        all_routing = selected_experts.tolist()  # list of activated experts per token
 
+        N_tokens = flat_hidden.size(0)
+        num_experts = self.num_experts
+
+        # turn selected_experts into a one-hot binary matrix R: (N_tokens, num_experts)
+        R = F.one_hot(selected_experts, num_classes=num_experts).float()  # (N_tokens, top_k, num_experts)
+        R = R.sum(dim=1)  # (N_tokens, num_experts); entries > 0 mark activated experts
+
+        # shared-activation matrix S: (N_tokens, N_tokens)
+        S = torch.matmul(R, R.t())  # S[i, j] > 0 means tokens i and j share at least one activated expert
+        S = S * (1 - torch.eye(N_tokens, device=S.device))  # remove self-pairs
+
+        # build the candidate mask M: (N_tokens, N_tokens, num_experts)
+        # M[i, j, e] = 1 means token j activated expert e and token i shares at least one activated expert with token j
+        R_expanded = R.unsqueeze(0).expand(N_tokens, -1, -1)  # (N_tokens, N_tokens, num_experts)
+        S_expanded = S.unsqueeze(-1)  # (N_tokens, N_tokens, 1)
+        candidate_mask = ((R_expanded > 0) & (S_expanded > 0)).float()  # (N_tokens, N_tokens, num_experts)
+
+        # for numerical safety, exclude the token itself (zero the diagonal)
+        candidate_mask = candidate_mask * (1 - torch.eye(N_tokens, device=candidate_mask.device).unsqueeze(-1))
+
+        # expand the mask and all_expert_outputs for batched aggregation
+        # all_expert_outputs: (N_tokens, num_experts, hidden_dim)
+        candidate_mask_exp = candidate_mask.unsqueeze(-1)  # (N_tokens, N_tokens, num_experts, 1)
+        all_expert_outputs_exp = all_expert_outputs.unsqueeze(0)  # (1, N_tokens, num_experts, hidden_dim)
+
+        # for each token i and expert e, aggregate the candidate tokens' outputs
+        sum_outputs = (candidate_mask_exp * all_expert_outputs_exp).sum(dim=1)  # (N_tokens, num_experts, hidden_dim)
+        count_outputs = candidate_mask.sum(dim=1).unsqueeze(-1)  # (N_tokens, num_experts, 1)
+        estimated_dense_all = torch.where(count_outputs > 0, sum_outputs / count_outputs,
+                                          torch.zeros_like(sum_outputs))  # (N_tokens, num_experts, hidden_dim)
+
+        # for activated experts, use the current token's own output directly;
+        # R > 0 marks activation, expanded to (N_tokens, num_experts, 1) to align with activated_outputs_tensor
+        activated_mask = (R > 0).unsqueeze(-1)
+        estimated_dense_all = torch.where(activated_mask, activated_outputs_tensor, estimated_dense_all)
+
+        # aggregate all expert outputs weighted by the gate probabilities
+        gate_prob_exp = routing_weights.to(estimated_dense_all.dtype).unsqueeze(-1)  # (N_tokens, num_experts, 1)
+        dense_outputs = (gate_prob_exp * estimated_dense_all).sum(dim=1)  # (N_tokens, hidden_dim)
+        # ---------- end of dense estimation (vectorized) ----------
+
         final_flat = sparse_output.detach() + (dense_outputs - dense_outputs.detach())
         final_output = final_flat.view(batch_size, seq_length, hidden_dim)
         return final_output, router_logits
 
     def estimate_dense_output(self, token_idx, activated, gate_prob, activated_outputs, all_routing, all_expert_outputs):
-        """
-        Estimate the dense output for the current token from information available in the mini-batch.
-        Args:
-            token_idx: index of the current token (scalar)
-            activated: list of experts activated by the current token, e.g. [1, 3]
-            gate_prob: routing weights of the current token, shape (num_experts,)
-            activated_outputs: dict of the token's actual outputs for its activated experts, each of shape (hidden_dim,)
-            all_routing: list of activated-expert lists, one per token (length N, each element a list)
-            all_expert_outputs: Tensor, (N, num_experts, hidden_dim)
-        Returns:
-            estimated_dense: Tensor, (hidden_dim,)
-        """
         num_experts = gate_prob.size(0)
         dense_parts = {}
+        # for activated experts, use the corresponding rows of the tensor directly
         for idx in activated:
             dense_parts[idx] = activated_outputs[idx]
-        # for non-activated experts, estimate from the outputs of other tokens in the mini-batch
         non_activated = [i for i in range(num_experts) if i not in activated]
-        for i in non_activated:
+        for i in tqdm(non_activated, desc=f"modified version: token {token_idx} inactive-expert estimation"):
             indices = []
             for idx, r_dec in enumerate(all_routing):
                 if (i in r_dec) and (len(set(r_dec) & set(activated)) > 0):
                     indices.append(idx)
             if indices:
                 selected_outputs = all_expert_outputs[indices, i, :]  # (n, hidden_dim)
-                # average only the non-zero entries
                 mask = (selected_outputs.sum(dim=-1) != 0).to(selected_outputs.dtype).unsqueeze(-1)
                 if mask.sum() > 0:
                     estimated = (selected_outputs * mask).sum(dim=0) / mask.sum()
                 else:
-                    # if everything is zero, return a zero vector
                     estimated = torch.zeros_like(selected_outputs[0])
             else:
                 all_outputs = all_expert_outputs[:, i, :]
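The R/S/candidate_mask construction is the heart of the vectorized rewrite, so a small standalone check of its logic may help (sizes are arbitrary; this mirrors the tensor algebra above but is not part of the repository):

```python
import torch
import torch.nn.functional as F

# 4 tokens, 3 experts, top_k = 1 for readability.
selected_experts = torch.tensor([[0], [0], [1], [2]])
N_tokens, num_experts = 4, 3

R = F.one_hot(selected_experts, num_classes=num_experts).float().sum(dim=1)  # (4, 3)
S = R @ R.t()                          # S[i, j] > 0: tokens i and j co-activate an expert
S = S * (1 - torch.eye(N_tokens))      # a token is not its own candidate

candidate_mask = ((R.unsqueeze(0).expand(N_tokens, -1, -1) > 0)
                  & (S.unsqueeze(-1) > 0)).float()
candidate_mask = candidate_mask * (1 - torch.eye(N_tokens).unsqueeze(-1))

# Tokens 0 and 1 both picked expert 0, so each is the other's candidate for it;
# tokens 2 and 3 share no expert with anyone and end up with no candidates.
print(candidate_mask[0, :, 0])  # tensor([0., 1., 0., 0.])
print(candidate_mask[2].sum())  # tensor(0.)
```

For such isolated tokens count_outputs is zero and the torch.where above falls back to a zero estimate, matching the zero-vector branch of the per-token estimate_dense_output loop. Note that candidate_mask has shape (N_tokens, N_tokens, num_experts), so the vectorization trades memory quadratic in the token count for the removal of the Python loop.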
@@ -149,16 +131,13 @@ class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
                 if mask.sum() > 0:
                     estimated = (all_outputs * mask).sum(dim=0) / mask.sum()
                 else:
-                    # if everything is zero, return a zero vector
                     estimated = torch.zeros_like(all_outputs[0])
             dense_parts[i] = estimated
-        # weighted sum of the expert outputs by gate_prob
         estimated_dense = 0
         for i in range(num_experts):
             estimated_dense += gate_prob[i] * dense_parts[i]
         return estimated_dense
 
-
 class DenseBackwardOLMoEForCausalLM(OlmoeForCausalLM):
     """
     A custom Olmoe ForCausalLM model that replaces the original MoE modules with the new DenseBackwardOlmoeSparseMoeBlock,
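The truncated docstring says the wrapper swaps the original MoE modules for DenseBackwardOlmoeSparseMoeBlock. The constructor is not shown in this diff; a hypothetical sketch of such a swap, assuming the layer layout of the upstream OLMoE implementation (each decoder layer holding its sparse MoE block as layer.mlp), might look like:

```python
# Hypothetical sketch only: the real constructor is not part of this diff,
# and the attribute path `layer.mlp` is assumed from the upstream OLMoE code.
class DenseBackwardOLMoEForCausalLM(OlmoeForCausalLM):
    config_class = DenseBackwardOLMoEConfig

    def __init__(self, config):
        super().__init__(config)
        for layer in self.model.layers:
            if isinstance(layer.mlp, OlmoeSparseMoeBlock):
                dense_block = DenseBackwardOlmoeSparseMoeBlock(config)
                dense_block.load_state_dict(layer.mlp.state_dict())  # keep the pretrained weights
                layer.mlp = dense_block
```

Because DenseBackwardOlmoeSparseMoeBlock subclasses OlmoeSparseMoeBlock, the state dicts line up and only the forward/backward behavior changes.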