Update modeling_quiet.py
modeling_quiet.py  +53 -76  CHANGED
@@ -23,7 +23,6 @@ import math
 import copy
 import os
 import time
-import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import wandb
@@ -69,73 +68,6 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "QuietConfig"
 
-from reportlab.pdfgen import canvas
-from reportlab.lib.pagesizes import letter
-from reportlab.lib.colors import HexColor
-
-def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_file="text.pdf", eps=0.2, eps2=0.5):
-    c = canvas.Canvas(output_file, pagesize=letter)
-    c.setFont("Courier", 8)
-    x, y = 50, 750
-    previous_text = ""
-    current_text = ""
-    for token_idx, reward in enumerate(token_rewards):
-        current_text = tokenizer.decode(input_ids[: token_idx + 1])
-        if current_text != previous_text:
-            diff_text = current_text[len(previous_text) :]
-            if "\n" in diff_text:
-                lines = diff_text.split("\n")
-                for line_idx, line in enumerate(lines):
-                    if line_idx > 0:
-                        x = 50
-                        y -= 12
-                    if abs(reward) < eps:
-                        opacity = 0
-                    elif abs(reward) > eps2:
-                        opacity = 0.8
-                    else:
-                        opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
-                    text_width = c.stringWidth(line)
-                    if reward > 0:
-                        highlight_color = HexColor("#4CCD99")
-                    else:
-                        highlight_color = HexColor("#FFC700")
-                    highlight_color.alpha = opacity
-                    c.setFillColor(highlight_color)
-                    c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
-                    c.setFillColor(HexColor("#000000"))
-                    c.drawString(x, y, line)
-                    x += text_width
-            else:
-                if abs(reward) < eps:
-                    opacity = 0
-                elif abs(reward) > eps2:
-                    opacity = 0.8
-                else:
-                    opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
-                text_width = c.stringWidth(diff_text)
-                if reward > 0:
-                    highlight_color = HexColor("#4CCD99")
-                else:
-                    highlight_color = HexColor("#FFC700")
-                highlight_color.alpha = opacity
-                c.setFillColor(highlight_color)
-                c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
-                c.setFillColor(HexColor("#000000"))
-                c.drawString(x, y, diff_text)
-                x += text_width
-            if x > 550:
-                x = 50
-                y -= 12
-                if y < 50:
-                    c.showPage()
-                    y = 750
-                    x = 50
-            previous_text = current_text
-    c.showPage()
-    c.save()
-
-
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
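
The removed helper mapped each token's reward magnitude to a highlight opacity with a thresholded linear ramp: below `eps` the highlight is invisible, above `eps2` it saturates at 0.8, and in between it scales linearly. A minimal standalone sketch of that mapping, with a hypothetical function name and no reportlab dependency:

```python
def reward_to_opacity(reward: float, eps: float = 0.2, eps2: float = 0.5) -> float:
    """Same ramp the removed PDF helper used: 0 below eps, 0.8 above eps2, linear in between."""
    magnitude = abs(reward)
    if magnitude < eps:
        return 0.0
    if magnitude > eps2:
        return 0.8
    return 0.8 * (magnitude - eps) / (eps2 - eps)


# Rewards near the eps threshold are barely highlighted; large ones saturate.
print([round(reward_to_opacity(r), 3) for r in (0.1, 0.3, 0.45, 0.9)])
# [0.0, 0.267, 0.667, 0.8]
```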
@@ -325,12 +257,22 @@ class QuietAttention(nn.Module):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+
+        if past_key_value is not None:
+            expected_attention_mask_size = (bsz, 1, q_len, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
+            if attention_mask.size() != expected_attention_mask_size:
+                # Assuming the attention mask is larger than expected, slice it to match the expected size
+                attention_mask = attention_mask[:, :, :, -expected_attention_mask_size[-1]:]
+
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
             )
         bsz, q_len, _ = hidden_states.size()
 
+        query_states = query_states.to(attention_mask.dtype)
+        key_states = key_states.to(attention_mask.dtype)
+        value_states = value_states.to(attention_mask.dtype)
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
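
The new guard in `QuietAttention.forward` trims an oversized 4D mask down to the last `q_len + cached` key positions, where the cached length comes from `past_key_value.get_usable_length(...)`. A rough standalone illustration of that slicing, with invented shapes and `expected_kv_len` standing in for the expected key length:

```python
import torch

bsz, q_len, cached_len = 2, 3, 5
expected_kv_len = q_len + cached_len  # stands in for q_len + past_key_value.get_usable_length(...)

# A mask that is wider than expected, e.g. built for the full padded sequence.
attention_mask = torch.zeros(bsz, 1, q_len, 12)

if attention_mask.size(-1) != expected_kv_len:
    # Keep only the trailing key positions, mirroring the sliced indexing in the diff.
    attention_mask = attention_mask[:, :, :, -expected_kv_len:]

print(attention_mask.shape)  # torch.Size([2, 1, 3, 8])
```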
@@ -368,11 +310,16 @@ class QuietAttention(nn.Module):
         )
 
         if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            if attention_mask.dim() == 3:
+                attention_mask = attention_mask.unsqueeze(1)
+            elif attention_mask.dim() == 2:
+                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            if attention_mask.size(0) != bsz or attention_mask.size(-1) != kv_seq_len:
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    f"Attention mask should be of size ({bsz}, 1, q_len, {kv_seq_len}), but is {attention_mask.size()}"
                 )
-
+
             attn_weights = attn_weights + attention_mask
 
         # upcast attention to fp32
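
Both attention classes now accept 2D `(bsz, kv_seq_len)` or 3D `(bsz, q_len, kv_seq_len)` masks and unsqueeze them into a 4D layout that broadcasts against the `(bsz, num_heads, q_len, kv_seq_len)` attention scores. A small sketch of that broadcasting, with made-up sizes and an additive-style mask:

```python
import torch

bsz, num_heads, q_len, kv_seq_len = 2, 4, 3, 3
attn_weights = torch.randn(bsz, num_heads, q_len, kv_seq_len)

# 2D additive padding mask: 0 keeps a position, a large negative value masks it out.
attention_mask = torch.zeros(bsz, kv_seq_len)
attention_mask[:, -1] = torch.finfo(attn_weights.dtype).min

if attention_mask.dim() == 3:
    attention_mask = attention_mask.unsqueeze(1)                # (bsz, 1, q_len, kv_seq_len)
elif attention_mask.dim() == 2:
    attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)   # (bsz, 1, 1, kv_seq_len)

attn_weights = attn_weights + attention_mask  # broadcasts over heads (and queries for a 2D mask)
print(attn_weights.shape)  # torch.Size([2, 4, 3, 3])
```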
@@ -749,11 +696,21 @@ class QuietSdpaAttention(QuietAttention):
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
         if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            if attention_mask.dim() == 3:
+                attention_mask = attention_mask.unsqueeze(1)
+            elif attention_mask.dim() == 2:
+                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        if attention_mask is not None:
+            if attention_mask.dim() == 3:
+                attention_mask = attention_mask.unsqueeze(1)
+            elif attention_mask.dim() == 2:
+                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            if attention_mask.size(0) != bsz or attention_mask.size(-1) != kv_seq_len:
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    f"Attention mask should be of size ({bsz}, 1, q_len, {kv_seq_len}), but is {attention_mask.size()}"
                 )
-
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
         if query_states.device.type == "cuda" and attention_mask is not None:
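
The context lines above lead into the upstream torch 2.1.2 workaround: the transformers SDPA attention classes make query/key/value contiguous on CUDA when a custom mask is passed. A toy sketch of that pattern with invented shapes, not the model's own tensors:

```python
import torch
import torch.nn.functional as F

bsz, num_heads, q_len, head_dim = 2, 4, 3, 8
# transpose() yields non-contiguous views, like the (bsz, seq, heads, dim) -> (bsz, heads, seq, dim) reshape in attention.
query = torch.randn(bsz, q_len, num_heads, head_dim).transpose(1, 2)
key = torch.randn(bsz, q_len, num_heads, head_dim).transpose(1, 2)
value = torch.randn(bsz, q_len, num_heads, head_dim).transpose(1, 2)
attn_mask = torch.zeros(bsz, 1, q_len, q_len)

if query.device.type == "cuda" and attn_mask is not None:
    # Work around https://github.com/pytorch/pytorch/issues/112577 (torch==2.1.2).
    query, key, value = query.contiguous(), key.contiguous(), value.contiguous()

out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)
print(out.shape)  # torch.Size([2, 4, 3, 8])
```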
@@ -1327,7 +1284,27 @@ class QuietForCausalLM(QuietPreTrainedModel):
         # Generate the continuation
         continuation_length = self.n_ahead - 2
         new_key_values = past_key_values
-
+
+        if self.n_ahead != 1 or self.n_ahead_talk != 1 or self.comparison_mode:
+            if attention_mask is None:
+                base_attention_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=0).to(input_ids.device)
+                base_attention_mask = base_attention_mask.view(1, 1, seq_len, seq_len)
+                base_attention_mask = base_attention_mask.repeat(input_ids.shape[0], 1, 1, 1)
+                attention_mask = base_attention_mask
+            elif attention_mask.dim() == 2:
+                if seq_len + past_key_values_length != attention_mask.shape[-1]:
+                    attention_mask = torch.cat(
+                        [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
+                        dim=-1
+                    )
+                attention_mask = _prepare_4d_causal_attention_mask(
+                    attention_mask,
+                    (batch_size, seq_len),
+                    inputs_embeds,
+                    past_key_values_length,
+                    sliding_window=self.config.sliding_window,
+                )
+
         start_time = time.time()
         for continuation_idx in range(continuation_length):
             outputs = self.model(
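
When the supplied mask is 2D and shorter than `seq_len + past_key_values_length`, the new code prepends ones for the cached positions before handing it to `_prepare_4d_causal_attention_mask`. A toy illustration of just that padding step, with invented lengths:

```python
import torch

batch_size, seq_len, past_key_values_length = 2, 3, 4

# 2D mask that only covers the current tokens, not the cached ones.
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)

if seq_len + past_key_values_length != attention_mask.shape[-1]:
    # Cached positions are assumed attendable, so they are padded with ones on the left.
    attention_mask = torch.cat(
        [
            torch.ones(
                (attention_mask.shape[0], past_key_values_length),
                dtype=attention_mask.dtype,
                device=attention_mask.device,
            ),
            attention_mask,
        ],
        dim=-1,
    )

print(attention_mask.shape)  # torch.Size([2, 7])
```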
@@ -2376,4 +2353,4 @@ class QuietForSequenceClassification(QuietPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )