Update modeling_quiet.py
modeling_quiet.py  CHANGED  +33 -9
@@ -44,7 +44,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask,
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask,
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (

@@ -134,6 +134,34 @@ def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_
         previous_text = current_text
     c.showPage()
     c.save()
+
+def _prepare_4d_causal_attention_mask_for_sdpa(
+    attn_mask: Optional[torch.Tensor],
+    shape: Tuple[int, int],
+    inputs_embeds: Optional[torch.Tensor] = None,
+    past_key_values_length: int = 0,
+) -> torch.Tensor:
+    batch_size, seq_len = shape
+    if attn_mask is None:
+        attn_mask = torch.ones((batch_size, seq_len), dtype=torch.bool, device=inputs_embeds.device)
+    else:
+        attn_mask = attn_mask.bool()
+
+    # Extend the attention mask to account for past key/value states
+    if past_key_values_length > 0:
+        extended_attn_mask = torch.cat(
+            [
+                attn_mask.new_zeros(batch_size, seq_len, past_key_values_length),
+                attn_mask.unsqueeze(2),
+            ],
+            dim=2,
+        )
+        attn_mask = extended_attn_mask
+
+    attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
+    causal_mask = torch.tril(torch.ones(seq_len, seq_len + past_key_values_length, device=attn_mask.device)).bool()
+    attn_mask = attn_mask & causal_mask.unsqueeze(0).unsqueeze(0)
+    return attn_mask
 
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
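The added `_prepare_4d_causal_attention_mask_for_sdpa` shadows the transformers helper of the same name and returns a boolean 4D mask. Below is a minimal sketch of calling it in the no-cache case; it assumes modeling_quiet.py is importable from the working directory and is illustrative only, not part of the commit:

import torch

# Illustrative usage only; assumes modeling_quiet.py is on the Python path.
from modeling_quiet import _prepare_4d_causal_attention_mask_for_sdpa

batch_size, seq_len = 2, 5
padding_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
padding_mask[1, -1] = 0  # mark the last position of the second sequence as padding

# With past_key_values_length=0 the helper ANDs the broadcast padding mask with a
# lower-triangular causal mask and returns a boolean mask of shape (batch, 1, seq, seq).
mask_4d = _prepare_4d_causal_attention_mask_for_sdpa(
    padding_mask,
    (batch_size, seq_len),
    inputs_embeds=None,
    past_key_values_length=0,
)
print(mask_4d.shape)  # torch.Size([2, 1, 5, 5])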
@@ -1070,10 +1098,11 @@ class QuietModel(QuietPreTrainedModel):
                     " this may lead to unexpected behaviour for Flash Attention version of Quiet. Make sure to "
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
-
+
+        if self._attn_implementation == "flash_attention_2":
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation ==
+        elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
             attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(

@@ -1082,12 +1111,7 @@ class QuietModel(QuietPreTrainedModel):
                 inputs_embeds,
                 past_key_values_length,
             )
-
-            # Check the shape of the attention mask
-            if attention_mask is not None and attention_mask.dim() == 2:
-                # Reshape the attention mask to 4D
-                attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
-
+        elif attention_mask is None or attention_mask.dim() == 2:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
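After this change, a missing or 2D attention mask reaches the standard `_prepare_4d_causal_attention_mask` path, since the trailing `and False` in the SDPA condition keeps that branch from executing. The sketch below shows what that default branch produces; it is illustrative only and assumes a transformers version (such as the 4.36/4.37 line) that still exposes this private helper with this signature:

import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch_size, seq_len, hidden_size = 2, 5, 8
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
attention_mask[1, -1] = 0  # one padding position
inputs_embeds = torch.zeros(batch_size, seq_len, hidden_size)  # used only for dtype/device

# Expands the 2D padding mask into an additive float mask of shape
# (batch, 1, seq_len, seq_len) with large negative values at masked positions.
mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_len),
    inputs_embeds,
    0,  # past_key_values_length
)
print(mask_4d.shape)  # torch.Size([2, 1, 5, 5])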