Update modeling_densebackward_olmoe0125.py
modeling_densebackward_olmoe0125.py CHANGED
@@ -152,21 +152,48 @@ class DenseBackwardOLMoEForCausalLM(OlmoeForCausalLM):
     base_model_prefix = "olmoe"
 
     def __init__(self, config):
+        # Call the parent class's __init__ first
         super().__init__(config)
+
+        # Don't try to reassign self; load a pretrained model and update the current model instead
+        pretrained_model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0125")
+
+        # Copy the pretrained model's state into the current model
+        self.config = pretrained_model.config
+        self.model = pretrained_model.model
+        self.vocab_size = pretrained_model.vocab_size
+        self.router_aux_loss_coef = pretrained_model.router_aux_loss_coef
+        self.num_experts = pretrained_model.num_experts
+        self.lm_head = pretrained_model.lm_head
+
         # Walk all decoder layers in the model and replace each OlmoeSparseMoeBlock with the DenseBackward version.
         # This assumes the official model organizes its decoder layers in self.model.layers,
         # and that each layer's mlp module contains a sparse_moe_block attribute.
         for layer in self.model.layers:
-            if hasattr(layer.mlp, "sparse_moe_block"):
-                orig_block = layer.mlp.sparse_moe_block
+            if hasattr(layer.mlp, "gate"):
+                print("111")
+                orig_block = layer.mlp
                 # Create the new block by directly copying the original block's attributes
                 new_block = DenseBackwardOlmoeSparseMoeBlock(config)  # or other appropriate arguments
                 # Then manually copy the attributes that need to be shared:
                 new_block.gate = orig_block.gate
                 new_block.experts = orig_block.experts
-                new_block.router = orig_block.router
                 new_block.num_experts = orig_block.num_experts
                 new_block.top_k = orig_block.top_k
                 new_block.norm_topk_prob = orig_block.norm_topk_prob
-                layer.mlp.sparse_moe_block = new_block
+                layer.mlp = new_block
+                print(type(layer.mlp))
 
+def main():
+    config = DenseBackwardOLMoEConfig(  # official model parameters
+        model_marker="DenseBackward_olmoe_marker",
+    )
+    # Create an instance of the custom model
+    model = DenseBackwardOLMoEForCausalLM(config)
+    print(type(model))
+    print(type(model.model))
+    print(type(model.model.layers[0]))
+    print(type(model.model.layers[0].mlp))
+    print(type(model.model.layers[0].mlp.experts))
+if __name__ == "__main__":
+    main()
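The hunk above constructs DenseBackwardOlmoeSparseMoeBlock(config) and then shares gate, experts, num_experts, top_k, and norm_topk_prob with the pretrained block, which only works if the custom block keeps the same interface as the upstream OlmoeSparseMoeBlock. Below is a minimal sketch of a compatible class, assuming it subclasses the upstream block; this is illustrative only, not the file's actual implementation, and the dense-backward forward logic (which this commit does not show) is stubbed out.

import torch
from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock

class DenseBackwardOlmoeSparseMoeBlock(OlmoeSparseMoeBlock):
    """Hypothetical sketch: a drop-in replacement for OlmoeSparseMoeBlock."""

    def forward(self, hidden_states: torch.Tensor):
        # A real implementation would route sparsely on the forward pass while
        # letting gradients reach all experts on the backward pass; that logic
        # is not part of this hunk, so we simply defer to the parent block,
        # which returns (final_hidden_states, router_logits).
        return super().forward(hidden_states)

Because the parent __init__ already builds gate, experts, and the routing hyperparameters from the config, the attribute assignments in the diff then overwrite those freshly initialized submodules with the pretrained ones, which is what makes the per-layer swap weight-preserving.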