zpn committed · Commit f169ad7 · 1 Parent(s): 9f26eba

fix: changes

Files changed (1):
  1. modeling_hf_nomic_bert.py +9 -2
modeling_hf_nomic_bert.py CHANGED
@@ -1616,6 +1616,7 @@ class NomicBertBlock(NomicBertPreTrainedModel):
             if config.activation_function == "glu"
             else (F.silu if config.activation_function == "swiglu" else F.gelu)
         )
+        self.moe = moe
         if moe:
             if dmoe is not None:
                 megablocks_args = Arguments(
@@ -1702,7 +1703,10 @@ class NomicBertBlock(NomicBertPreTrainedModel):
             dropped = self.dropout2(hidden_states)
             residual = (dropped + residual) if residual is not None else dropped
             hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
-            hidden_states = self.mlp(hidden_states)
+            if self.moe:
+                hidden_states = self.mlp(hidden_states, attention_mask)
+            else:
+                hidden_states = self.mlp(hidden_states)
 
             return hidden_states, None, residual
         else:
@@ -1716,7 +1720,10 @@ class NomicBertBlock(NomicBertPreTrainedModel):
                 rope=rope,
             )
             hidden_states = self.norm1((self.dropout1(attn_outputs) + hidden_states).to(dtype=self.norm1.weight.dtype))
-            mlp_out = self.mlp(hidden_states)
+            if self.moe:
+                mlp_out = self.mlp(hidden_states, attention_mask)
+            else:
+                mlp_out = self.mlp(hidden_states)
 
             hidden_states = self.norm2((self.dropout2(mlp_out) + hidden_states).to(dtype=self.norm2.weight.dtype))
             return hidden_states, None, None
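
In effect, the commit stores the `moe` flag on the block (`self.moe = moe`) and, in both forward paths, passes `attention_mask` to the MLP only when that flag is set, presumably because the mixture-of-experts MLP's forward takes the mask while the dense MLP's does not. Below is a minimal, self-contained sketch of that dispatch pattern; `DenseMLP`, `MoEMLP`, and `Block` are illustrative stand-ins, not the repository's actual classes.

```python
# Sketch of the dispatch pattern introduced by this commit: the block remembers
# whether it was built with an MoE MLP and, if so, forwards the attention mask.
# All class/module names here are toy placeholders, not NomicBertBlock itself.
import torch
import torch.nn as nn


class DenseMLP(nn.Module):
    """Plain feed-forward MLP; its forward takes hidden states only."""

    def __init__(self, dim: int):
        super().__init__()
        self.fc = nn.Linear(dim, dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.fc(hidden_states)


class MoEMLP(nn.Module):
    """Toy MoE-style MLP whose forward also accepts the attention mask."""

    def __init__(self, dim: int, num_experts: int = 2):
        super().__init__()
        self.experts = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_experts)])
        self.router = nn.Linear(dim, num_experts)

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # Weight each token's expert outputs by the router, then zero out padding.
        weights = self.router(hidden_states).softmax(dim=-1)                        # (B, T, E)
        expert_out = torch.stack([e(hidden_states) for e in self.experts], dim=-1)  # (B, T, D, E)
        out = (expert_out * weights.unsqueeze(-2)).sum(dim=-1)                      # (B, T, D)
        return out * attention_mask.unsqueeze(-1)


class Block(nn.Module):
    def __init__(self, dim: int, moe: bool):
        super().__init__()
        self.moe = moe  # remembered flag, as added in this commit
        self.mlp = MoEMLP(dim) if moe else DenseMLP(dim)

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # Same branching as the patched forward: only the MoE MLP gets the mask.
        if self.moe:
            return self.mlp(hidden_states, attention_mask)
        return self.mlp(hidden_states)


if __name__ == "__main__":
    x = torch.randn(2, 4, 8)
    mask = torch.ones(2, 4)
    print(Block(8, moe=True)(x, mask).shape)   # torch.Size([2, 4, 8])
    print(Block(8, moe=False)(x, mask).shape)  # torch.Size([2, 4, 8])
```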