Commit a5b83ec
Parent(s): 1da34bf
Update modeling_transnormer.py

modeling_transnormer.py  (+0 -40)  CHANGED
@@ -734,43 +734,6 @@ class TransnormerModel(TransnormerPreTrainedModel):
             slope_rate = slope_rates[idx]
             slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
             mask = linear_attn_mask
-
-            # if self.gradient_checkpointing and self.training:
-
-            #     def create_custom_forward(module):
-            #         def custom_forward(*inputs):
-            #             # None for past_key_value
-            #             return module(*inputs, output_attentions, None)
-
-            #         return custom_forward
-
-            #     # layer_outputs = torch.utils.checkpoint.checkpoint(
-            #     #     create_custom_forward(layer),
-            #     #     hidden_states,
-            #     #     mask,
-            #     #     linear_attn_padding_mask,
-            #     #     None,
-            #     # )
-            #     layer_outputs = torch.utils.checkpoint.checkpoint(
-            #         create_custom_forward(layer),
-            #         hidden_states,
-            #         mask,
-            #         linear_attn_padding_mask,
-            #         None,
-            #         output_attentions,
-            #         use_cache,
-            #         slope_rate,
-            #     )
-            # else:
-            #     layer_outputs = layer(
-            #         hidden_states,
-            #         attn_mask=mask,
-            #         attn_padding_mask=linear_attn_padding_mask,
-            #         past_key_value=past_key_value,
-            #         output_attentions=output_attentions,
-            #         use_cache=use_cache,
-            #         slope_rate=slope_rate,
-            #     )
 
             layer_outputs = layer(
                 hidden_states,
@@ -789,9 +752,6 @@ class TransnormerModel(TransnormerPreTrainedModel):
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
-
-            # if idx == 0:
-            #     break
 
         hidden_states = self.final_norm(hidden_states)
 
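For reference on the kept context line, slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5) scales the per-layer slope down linearly with depth. As a small worked illustration (assuming, say, a 4-layer model, which is not implied by this commit): idx = 0 keeps roughly the full slope (factor about 1.0), idx = 1 and idx = 2 scale it to about 0.667 and 0.333, and the last layer at idx = 3 is left with only the 1e-5 floor.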
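The block removed in the first hunk is a commented-out gradient-checkpointing path. For context, below is a minimal, self-contained sketch of that general pattern, assuming PyTorch; TinyLayer, the tensor shapes, and the argument list are placeholders and not the Transnormer layer interface from this repository. The idea matches the removed comments: wrap the layer call in a closure and hand it to torch.utils.checkpoint.checkpoint, which discards the layer's intermediate activations in the forward pass and recomputes them during backward.

# Hedged sketch only: TinyLayer and the shapes below are placeholders,
# not the Transnormer layer API from this repository.
import torch
import torch.nn as nn
import torch.utils.checkpoint


class TinyLayer(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, hidden_states, attn_mask=None):
        # Stand-in for a real decoder layer forward pass.
        return torch.relu(self.proj(hidden_states))


def create_custom_forward(module):
    # checkpoint() passes positional tensors through, so anything else
    # (output_attentions, past_key_value, ... in the removed comments)
    # has to be bound inside the closure.
    def custom_forward(*inputs):
        return module(*inputs)

    return custom_forward


layer = TinyLayer(dim=16)
layer.train()  # checkpointing only matters when gradients are needed
hidden_states = torch.randn(2, 8, 16, requires_grad=True)

if layer.training:
    # Activations inside the layer are recomputed during backward,
    # trading extra compute for lower memory use.
    layer_outputs = torch.utils.checkpoint.checkpoint(
        create_custom_forward(layer), hidden_states, use_reentrant=False
    )
else:
    layer_outputs = layer(hidden_states)

layer_outputs.sum().backward()

After this commit, only the direct layer(...) call path remains in the model's forward loop, as shown in the kept context lines of the diff.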