Crystalcareai committed on
Commit d33b844 · verified · 1 Parent(s): 21d94a3

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +20 -44
modeling_quiet.py CHANGED
@@ -1071,27 +1071,18 @@ class QuietModel(QuietPreTrainedModel):
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )

-        if self._attn_implementation == "flash_attention_2":
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
-            # output_attentions=True can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
-        elif attention_mask is None or attention_mask.dim() == 2:
-            # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-                sliding_window=self.config.sliding_window,
-            )
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, seq_len), dtype=torch.bool, device=input_ids.device)
+
+        if attention_mask.dim() == 2:
+            attention_mask = attention_mask.view(batch_size, 1, 1, seq_len)
+            attention_mask = attention_mask.expand(batch_size, 1, seq_len, seq_len)
+        elif attention_mask.dim() == 3:
+            attention_mask = attention_mask.unsqueeze(1)
+        elif attention_mask.dim() != 4:
+            raise ValueError(f"Attention mask should be of shape (batch_size, 1, seq_len, seq_len) or (batch_size, 1, 1, seq_len), but got {attention_mask.shape}")
+
+        attention_mask = attention_mask.to(dtype=torch.bool, device=input_ids.device)

        hidden_states = inputs_embeds
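For reference, a minimal standalone sketch of the mask normalization this hunk introduces in QuietModel.forward: a missing mask defaults to all ones, and any 2D or 3D mask is broadcast to a boolean mask of shape (batch_size, 1, seq_len, seq_len). The helper name normalize_attention_mask and the example call are illustrative only, not part of the model code.

import torch

def normalize_attention_mask(attention_mask, batch_size, seq_len, device):
    # Mirror of the hunk above: default to an all-ones mask, then broadcast
    # a 2D (batch, seq) padding mask to 4D (batch, 1, seq, seq) boolean form.
    if attention_mask is None:
        attention_mask = torch.ones((batch_size, seq_len), dtype=torch.bool, device=device)
    if attention_mask.dim() == 2:
        attention_mask = attention_mask.view(batch_size, 1, 1, seq_len)
        attention_mask = attention_mask.expand(batch_size, 1, seq_len, seq_len)
    elif attention_mask.dim() == 3:
        attention_mask = attention_mask.unsqueeze(1)
    elif attention_mask.dim() != 4:
        raise ValueError(f"Unexpected attention mask shape: {attention_mask.shape}")
    return attention_mask.to(dtype=torch.bool, device=device)

# Example: one left-padded position in a batch of one.
mask = torch.tensor([[0, 1, 1, 1]])                       # (batch_size=1, seq_len=4)
print(normalize_attention_mask(mask, 1, 4, "cpu").shape)  # torch.Size([1, 1, 4, 4])

Note that this broadcast only repeats the padding pattern along the query dimension; unlike the removed _prepare_4d_causal_attention_mask path, it does not itself encode causality or the sliding window.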
 
@@ -1883,29 +1874,14 @@ class QuietForCausalLM(QuietPreTrainedModel):
                inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
                inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)

-                if len(attention_mask.shape) == 2:
-                    breakpoint()
-                else:
-                    original_attention = attention_mask[..., :attention_mask.shape[-2]]
-                    if self.use_upper_triangular:
-                        new_attention = original_attention
-                    else:
-                        original_attention = original_attention == attention_mask.max()
-                        # because eye isn't implemented for BF16, we need to handle the case
-                        if not attention_mask.dtype == torch.bfloat16:
-                            new_attention = torch.eye(
-                                seq_len, dtype=attention_mask.dtype, device=attention_mask.device
-                            )
-                        else:
-                            new_attention = torch.eye(
-                                seq_len, dtype=torch.float32, device=attention_mask.device
-                            ).to(attention_mask.dtype)
-
-                        new_attention = new_attention.view(1, 1, seq_len, seq_len).repeat(input_ids.shape[0], 1, 1, 1)
-                        new_attention = new_attention * original_attention
-                        new_attention[new_attention == 0] = attention_mask.min()
-                        new_attention[new_attention == 1] = attention_mask.max()
-                    attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
+                if attention_mask is not None:
+                    if attention_mask.dim() == 2:
+                        attention_mask = attention_mask.view(batch_size, 1, 1, seq_len)
+                        attention_mask = attention_mask.expand(batch_size, 1, seq_len, seq_len)
+                    elif attention_mask.dim() != 4:
+                        raise ValueError(f"Attention mask should be of shape (batch_size, 1, seq_len, seq_len), but got {attention_mask.shape}")
+
+                    attention_mask = attention_mask.to(dtype=torch.bool, device=input_ids.device)
                past_key_values = outputs.past_key_values
                position_ids = position_ids + 1
 
 
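Similarly, a minimal sketch of the guarded variant in the QuietForCausalLM hunk above, where the mask may legitimately be None and 3D masks are not accepted. The helper name maybe_normalize_mask is illustrative, not from the model code.

import torch

def maybe_normalize_mask(attention_mask, batch_size, seq_len, device):
    # Mirror of the QuietForCausalLM hunk: skip normalization when no mask is
    # given, otherwise broadcast 2D (batch, seq) to 4D (batch, 1, seq, seq) bool.
    if attention_mask is None:
        return None
    if attention_mask.dim() == 2:
        attention_mask = attention_mask.view(batch_size, 1, 1, seq_len)
        attention_mask = attention_mask.expand(batch_size, 1, seq_len, seq_len)
    elif attention_mask.dim() != 4:
        raise ValueError(f"Unexpected attention mask shape: {attention_mask.shape}")
    return attention_mask.to(dtype=torch.bool, device=device)

print(maybe_normalize_mask(None, 2, 8, "cpu"))                     # None
print(maybe_normalize_mask(torch.ones(2, 8), 2, 8, "cpu").shape)   # torch.Size([2, 1, 8, 8])
try:
    maybe_normalize_mask(torch.ones(2, 8, 8), 2, 8, "cpu")         # 3D masks are rejected in
except ValueError as err:                                          # this path, unlike QuietModel
    print(err)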