Crystalcareai
/

Quiet-Star-Custom

Text Generation

Transformers

Safetensors

quiet

custom_code

Model card Files Files and versions Community

Crystalcareai committed on Apr 2, 2024

Commit

7fb20bf

verified ·

1 Parent(s): f5c1913

Update modeling_quiet.py

Browse files

Files changed (1) hide show

modeling_quiet.py +110 -194

modeling_quiet.py CHANGED Viewed

@@ -44,7 +44,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
@@ -73,67 +73,63 @@ from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter
 from reportlab.lib.colors import HexColor
-def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_file="text.pdf", eps=0.2, eps2=0.5):
-    c = canvas.Canvas(output_file, pagesize=letter)
-    c.setFont("Courier", 8)
-    x, y = 50, 750
-    previous_text = ""
-    current_text = ""
-    for token_idx, reward in enumerate(token_rewards):
-        current_text = tokenizer.decode(input_ids[: token_idx + 1])
-        if current_text != previous_text:
-            diff_text = current_text[len(previous_text) :]
-            if "\n" in diff_text:
-                lines = diff_text.split("\n")
-                for line_idx, line in enumerate(lines):
-                    if line_idx > 0:
-                        x = 50
-                        y -= 12
-                    if abs(reward) < eps:
-                        opacity = 0
-                    elif abs(reward) > eps2:
-                        opacity = 0.8
-                    else:
-                        opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
-                    text_width = c.stringWidth(line)
-                    if reward > 0:
-                        highlight_color = HexColor("#4CCD99")
-                    else:
-                        highlight_color = HexColor("#FFC700")
-                    highlight_color.alpha = opacity
-                    c.setFillColor(highlight_color)
-                    c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
-                    c.setFillColor(HexColor("#000000"))
-                    c.drawString(x, y, line)
-                    x += text_width
-            else:
-                if abs(reward) < eps:
-                    opacity = 0
-                elif abs(reward) > eps2:
-                    opacity = 0.8
-                else:
-                    opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
-                text_width = c.stringWidth(diff_text)
-                if reward > 0:
-                    highlight_color = HexColor("#4CCD99")
-                else:
-                    highlight_color = HexColor("#FFC700")
-                highlight_color.alpha = opacity
-                c.setFillColor(highlight_color)
-                c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
-                c.setFillColor(HexColor("#000000"))
-                c.drawString(x, y, diff_text)
-                x += text_width
-            if x > 550:
-                x = 50
-                y -= 12
-            if y < 50:
-                c.showPage()
-                y = 750
-                x = 50
-            previous_text = current_text
-    c.showPage()
-    c.save()
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -1178,6 +1174,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
         self.model = QuietModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.max_thoughts = config.max_thoughts
         self.merged_lm_and_talk_heads = config.merged_lm_and_talk_heads
         self.use_concat_talk_head = config.use_concat_talk_head
@@ -1240,10 +1239,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
         self.tokenized_thought_prefix = None
         self.log_dict = defaultdict(int)
         self.eval_log_dict = defaultdict(int)
-        self.print_final_only = True
         self.loss_mean = loss_mean
-        self.all_rewards = []
-        self.all_unreduced_losses = []
         self.start_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
         self.end_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
@@ -1252,6 +1248,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
         self.embedding_scale = 1e2
         self.reinforce_temperature = 3
         self.base_loss_beta = 1
         # Not used in the paper:
         self.use_thought_prefix = False
@@ -1259,7 +1258,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
         self.use_upper_triangular = False
         self.subtract_mean_reward = False
         self.comparison_mode = False
-        self.gumbel_detach = True
         # For visualization
         self.eval_mode = False
@@ -1358,6 +1357,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
             # Apply Gumbel-Softmax to the logits
             next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
             next_token_id = torch.argmax(next_token_logits, dim=-1)
             # Append the generated token to the input sequence
@@ -1436,6 +1436,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
@@ -1459,14 +1460,27 @@ class QuietForCausalLM(QuietPreTrainedModel):
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
-        log_dict = self.log_dict if self.training else self.eval_log_dict
         if not self.training:
             n_ahead_talk_to_restore = self.n_ahead_talk
             n_passes_to_restore = self.n_passes
             self.n_ahead_talk = 1
             self.n_passes = 1
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1547,6 +1561,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
             else:
                 # convert to identity transform
                 def lambda_transform(cur_head):
                     if cur_head.weight.data.shape[0] != cur_head.weight.data.shape[1]:
                         return torch.cat([
                         torch.eye(
@@ -1679,6 +1694,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 use_cache=use_cache,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
             )
@@ -1793,8 +1809,10 @@ class QuietForCausalLM(QuietPreTrainedModel):
                         shift_labels = labels[..., 1 + shift_amount:].contiguous()
                         # Flatten the tokens
                         loss_fct = CrossEntropyLoss(reduction="none")
                         shift_logits = shift_logits.view(-1, self.config.vocab_size)
                         shift_labels = shift_labels.view(-1).clone()
                         # Enable model parallelism
                         shift_labels[shift_labels == self.tokenizer.pad_token_id] = -100
                         shift_labels = shift_labels.to(shift_logits.device)
@@ -1886,6 +1904,22 @@ class QuietForCausalLM(QuietPreTrainedModel):
                         inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
                         inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
                 inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
                 if len(attention_mask.shape) == 2:
                     breakpoint()
@@ -1933,7 +1967,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
                     # if shift_labels.min() == self.tokenizer.pad_token_id:
                     shift_labels = torch.where(shift_labels == self.tokenizer.pad_token_id, -100, shift_labels)
                     unreduced_loss = loss_fct(shift_logits, shift_labels)
                     if torch.any(unreduced_loss != unreduced_loss):
                         raise ValueError("NaN loss")
                     unreduced_loss = unreduced_loss.reshape(logits.shape[0], -1)
                     loss_list.append(unreduced_loss)
@@ -1992,78 +2028,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
                             else:
                                 added_reward = original_dqn_reward
                             policy_reward += added_reward
-                    if self.use_policy_loss and ahead_idx == self.n_ahead + self.n_ahead_talk - 2:
-                        # only compute during the thinking phase
-                        if self.use_reparam_for_thought_embeddings and (self.use_start_thought_token or self.use_end_thought_token):
-                            # sampled_start, sampled_end
-                            # calculate the log likelihood of the start and end embeddings sampled from a multivariate normal distribution
-                            # with mean start_embedding[0] and standard deviation start_embedding[1]
-                            if self.use_start_thought_token:
-                                exp_start_std = torch.exp(start_embedding[1])
-                                start_loglikelihood = -0.5 * (sampled_start.detach() - start_embedding[0]) ** 2 / exp_start_std ** 2 - start_embedding[1] - 0.5 * math.log(2 * math.pi)
-                                start_loglikelihood = start_loglikelihood.mean(dim=-1)
-                            if self.use_end_thought_token:
-                                exp_end_std = torch.exp(end_embedding[1])
-                                end_loglikelihood = -0.5 * (sampled_end.detach() - end_embedding[0]) ** 2 / exp_end_std ** 2 - end_embedding[1] - 0.5 * math.log(2 * math.pi)
-                                end_loglikelihood = end_loglikelihood.mean(dim=-1)
-                            # we use the mean instead of the sum to prevent dependence on the dimensionality of the embeddings
-                            if self.use_end_thought_token and self.use_policy_loss_for_end_thought:
-                                action_loglikelihoods_list.append(end_loglikelihood)
-                            if self.use_start_thought_token:
-                                action_loglikelihoods_list.append(start_loglikelihood)
-                        if ahead_idx == self.n_ahead + self.n_ahead_talk - 2 and self.eval_mode:
-                            with torch.no_grad():
-                                # calculate the 0.75 quantile of the rewards
-                                filtered_tokens = input_ids[:, :policy_reward.shape[-1]].cpu().detach().numpy().flatten()
-                                filtered_tokens_mask = filtered_tokens != self.tokenizer.pad_token_id
-                                filtered_tokens = filtered_tokens[filtered_tokens_mask]
-                                filtered_rewards = policy_reward.float().cpu().detach().numpy()[:, :seq_len - self.n_ahead_talk].flatten()
-                                filtered_rewards = filtered_rewards[filtered_tokens_mask]
-                                abs_reward_list = np.abs(policy_reward.float().cpu().detach().numpy()[:, :seq_len - self.n_ahead_talk].flatten())
-                                abs_reward_list = abs_reward_list[filtered_tokens_mask]
-                                medium_quantile = np.quantile(abs_reward_list, 0.5)
-                                upper_quantile = np.quantile(abs_reward_list, 0.95)
-                                save_tokens_with_rewards_to_pdf(
-                                    filtered_tokens,
-                                    [0] + filtered_rewards.tolist(),
-                                    self.tokenizer,
-                                    output_file=f"texts/rewards_talk_{self.n_ahead_talk}_{self.training_steps}.pdf",
-                                    eps=medium_quantile,
-                                    eps2=upper_quantile,
-                                )
-                                def plot_kde(data, losses):
-                                    sns.set(style="whitegrid")
-                                    # Create the KDE plot
-                                    sns.kdeplot(data, fill=True)
-                                    # Set the plot title and labels
-                                    plt.title("KDE Plot")
-                                    plt.xlabel("Value")
-                                    plt.ylabel("Density")
-                                    # Save the plot
-                                    plt.savefig(f"texts/kde_talk_{self.n_ahead_talk}_{self.training_steps}.pdf")
-                                    # Close the plot
-                                    plt.close()
-                                    # Step 1: Create a base color palette
-                                    base_colors = sns.color_palette("light:#5A9", n_colors=256)  # More colors for a smoother gradient
-                                    base_cmap = LinearSegmentedColormap.from_list("log_light", base_colors)
-                                    log_norm = LogNorm(vmin=1e-3, vmax=10)
-                                    sns.kdeplot(x=data, y=losses, fill=True, levels=20, norm=log_norm, cut=0, linewidths=0)
-                                    # limit y to 0 to 25 and x to -1 to 1
-                                    plt.xlim(-1, 1)
-                                    plt.ylim(0, 25)
-                                    plt.savefig(f"texts/jointer_talk_{self.n_ahead_talk}_{self.training_steps}.pdf")
-                                    plt.close()
-                                self.all_rewards.extend(filtered_rewards)
-                                self.all_unreduced_losses.extend(unreduced_loss[:, :-1].flatten()[filtered_tokens_mask].float().flatten().cpu().detach().numpy())
-                                plot_kde(self.all_rewards, self.all_unreduced_losses)
                         for action_loglikelihoods_2d in action_loglikelihoods_list:
                             train_policy_reward = policy_reward
@@ -2112,6 +2076,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
                     else:
                         loss = cur_loss
                 loss = loss / len(loss_list)
             loss = loss * self.base_loss_beta
@@ -2133,64 +2098,15 @@ class QuietForCausalLM(QuietPreTrainedModel):
         if loss is not None:
             base_log_dict["loss_train"] = loss.item()
-        for loss_key, loss_val in base_log_dict.items():
-            log_dict[loss_key] += loss_val / self.n_tokens_print
-        if self.use_policy_loss and policy_reward is not None:
-            log_dict["policy_loss"] += dqn_loss / self.n_tokens_print
-            log_dict["policy_reward"] += policy_reward.mean() / self.n_tokens_print
-        if not loss_list:
-            if loss is not None:
-                log_dict["loss_0"] += loss / self.n_tokens_print
-        else:
-            log_dict["loss_final"] += nonzero_mean(loss_list[-1]) / self.n_tokens_print
-            log_dict["loss_talk"] += sum(nonzero_mean(cur_loss_item) for cur_loss_item in loss_list[-self.n_ahead_talk:]) / self.n_ahead_talk / self.n_tokens_print
-        # also log relative losses to loss_0
-        if loss_list:
-            for i in range(len(loss_list)):
-                talk_idx = min(max(i - (self.n_ahead - 1), 0), len(talk_loss_list) - 1)
-                if not talk_loss_list:
-                    cur_talk_loss = nonzero_mean(loss_list[0])
-                else:
-                    cur_talk_loss = talk_loss_list[talk_idx]
-                log_dict[f"rel_loss_{i}"] += (nonzero_mean(loss_list[i]) - cur_talk_loss) / self.n_tokens_print
-        if self.training:
-            self.training_steps += 1
-        try:
-            # if self.training_steps % (self.gradient_accumulation_steps * 256) == 0:
-            if self.wandb_enabled:
-                if self.training_steps % (self.n_tokens_print) == 0 or not self.training:# and "0" in str(loss.device):
-                    if not self.training:
-                        new_log_dict = {}
-                        for key in list(log_dict.keys()):
-                            new_log_dict["eval_" + key] = log_dict[key]
-                        log_dict = new_log_dict
-                    log_dict["training_steps"] = self.training_steps
-                    log_dict["batch_size"] = batch_size
-                    log_dict["example_steps"] = self.training_steps * batch_size * self.gradient_accumulation_steps
-                    if self.n_ahead > 1:
-                        log_dict["compute_steps"] = self.training_steps * batch_size * (self.n_ahead + self.n_ahead_talk - 1) * self.gradient_accumulation_steps
-                    else: # There's no overhead for talk tokens if there's no thinking
-                        log_dict["compute_steps"] = self.training_steps * batch_size * self.gradient_accumulation_steps
-                    # remove all nans
-                    for key in list(log_dict.keys()):
-                        if log_dict[key] != log_dict[key]:
-                            del log_dict[key]
-                    if self.training:
-                        wandb.log(log_dict)
-                    if self.training:
-                        self.log_dict = defaultdict(int)
-                    else:
-                        self.eval_log_dict = defaultdict(int)
-        except Exception as e:
-            pass
         if not self.training:
             self.n_ahead_talk = n_ahead_talk_to_restore
             self.n_passes = n_passes_to_restore
         return CausalLMOutputWithPast(
             loss=loss if loss is not None else None,
             logits=(rm_logits if self.n_ahead > 1 else logits) if not self.output_logits_at_the_end else logits,

 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
 from reportlab.lib.pagesizes import letter
 from reportlab.lib.colors import HexColor
+def _prepare_4d_causal_attention_mask_for_sdpa(attention_mask, input_shape, inputs_embeds, past_key_values_length):
+    # Turn a 0/1 keep-mask into an additive bfloat16 bias: 0.0 where the mask is 1, nominally -10000.0 where it is 0.
+    bsz, tgt_len = input_shape
+    # `attention_mask` may be None or a 2D, 3D or 4D tensor; 2D/3D masks only gain singleton axes (no expand/repeat).
+    # NOTE(review): despite the function name, no causal (lower-triangular) component is added anywhere in this body;
+    # presumably callers build causality into the mask they pass in - confirm before relying on this for decoding.
+    combined_attention_mask = None
+    if attention_mask is not None:
+        # 4D mask: used as-is, with no reshaping.
+        # presumably already laid out as (batch_size, 1, tgt_len, src_len) - TODO confirm against callers.
+        if attention_mask.dim() == 4:
+            combined_attention_mask = attention_mask
+        # 3D mask: a singleton axis is inserted at position 1, i.e. (d0, d1, d2) -> (d0, 1, d1, d2).
+        # presumably (batch_size, tgt_len, src_len) -> (batch_size, 1, tgt_len, src_len) - TODO confirm.
+        elif attention_mask.dim() == 3:
+            expanded_attn_mask = attention_mask[:, None, :, :]
+            combined_attention_mask = expanded_attn_mask
+        # 2D mask: two singleton axes are inserted, i.e. (d0, d1) -> (d0, 1, 1, d1); presumably (batch_size, seq_len).
+        # Any rank other than 2, 3 or 4 falls through to the ValueError below.
+        elif attention_mask.dim() == 2:
+            # With a non-empty cache (past_key_values_length > 0) the mask is cast to long and its first
+            # `past_key_values_length` columns are dropped, so only the trailing columns are kept.
+            # NOTE(review): if the incoming mask spans past + new tokens, this discards the entries for cached positions - verify intent.
+            if past_key_values_length > 0:
+                attention_mask = attention_mask.to(dtype=torch.long)
+                attention_mask = attention_mask[:, past_key_values_length:]
+            expanded_attn_mask = attention_mask[:, None, None, :]
+            combined_attention_mask = expanded_attn_mask
+        else:
+            raise ValueError(
+                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+                    input_shape, attention_mask.shape
+                )
+            )
+    # Convert the keep-mask into an additive bias: entries equal to 1 become 0.0 and entries
+    # equal to 0 become -10000.0 (as rounded by bfloat16). Presumably the result is added to the
+    # raw attention scores before the softmax - confirm at the call site.
+    # If no mask was supplied, an all-zero (bsz, 1, tgt_len, tgt_len) tensor on inputs_embeds.device is returned instead;
+    # NOTE(review): that fallback shape does not account for past_key_values_length, and the dtype is always bfloat16.
+    if combined_attention_mask is not None:
+        # Clamp to [0, 1] so the bias computed below stays within [-10000.0, 0.0]
+        combined_attention_mask = combined_attention_mask.clamp(min=0, max=1)
+        # Cast to bfloat16 (hard-coded here; not taken from inputs_embeds.dtype)
+        combined_attention_mask = combined_attention_mask.to(torch.bfloat16)
+        # Invert and scale: 1 -> 0.0 (position kept), 0 -> -10000.0 (position suppressed)
+        combined_attention_mask = (1.0 - combined_attention_mask) * -10000.0
+    else:
+        combined_attention_mask = torch.zeros(
+            (bsz, 1, tgt_len, tgt_len), dtype=torch.bfloat16, device=inputs_embeds.device
+        )
+    return combined_attention_mask
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
         self.model = QuietModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.num_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
         self.max_thoughts = config.max_thoughts
         self.merged_lm_and_talk_heads = config.merged_lm_and_talk_heads
         self.use_concat_talk_head = config.use_concat_talk_head
         self.tokenized_thought_prefix = None
         self.log_dict = defaultdict(int)
         self.eval_log_dict = defaultdict(int)
         self.loss_mean = loss_mean
         self.start_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
         self.end_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
         self.embedding_scale = 1e2
         self.reinforce_temperature = 3
         self.base_loss_beta = 1
+        self.thinking_usefulness_head = nn.Linear(self.model.config.hidden_size, 1)
+        self.thinking_threshold = 0.5
+        self.thinking_usefulness_loss_weight = 1e-2
         # Not used in the paper:
         self.use_thought_prefix = False
         self.use_upper_triangular = False
         self.subtract_mean_reward = False
         self.comparison_mode = False
+        self.gumbel_detach = False
         # For visualization
         self.eval_mode = False
             # Apply Gumbel-Softmax to the logits
             next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
+            print("Next token logits:", next_token_logits)
             next_token_id = torch.argmax(next_token_logits, dim=-1)
             # Append the generated token to the input sequence
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
+        # output_router_logits: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
         if not self.training:
             n_ahead_talk_to_restore = self.n_ahead_talk
             n_passes_to_restore = self.n_passes
             self.n_ahead_talk = 1
             self.n_passes = 1
+        # aux_loss = None
+        # output_router_logits = output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        # if output_router_logits:
+        #     router_logits = outputs.router_logits if return_dict else outputs[-1]
+        #     if router_logits is not None:
+        #         aux_loss = load_balancing_loss_func(
+        #             router_logits,
+        #             self.num_experts,
+        #             self.num_experts_per_tok,
+        #             attention_mask,
+        #         )
+        #         if labels is not None:
+        #             loss += self.router_aux_loss_coef * aux_loss.to(loss.device)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
             else:
                 # convert to identity transform
                 def lambda_transform(cur_head):
+                    # pdb.set_trace()
                     if cur_head.weight.data.shape[0] != cur_head.weight.data.shape[1]:
                         return torch.cat([
                         torch.eye(
                 use_cache=use_cache,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
+                # output_router_logits=output_router_logits,
                 return_dict=return_dict,
             )
                         shift_labels = labels[..., 1 + shift_amount:].contiguous()
                         # Flatten the tokens
                         loss_fct = CrossEntropyLoss(reduction="none")
+                        print("Shift logits before:", shift_logits)
                         shift_logits = shift_logits.view(-1, self.config.vocab_size)
                         shift_labels = shift_labels.view(-1).clone()
+                        print("shift logits after:", shift_logits)
                         # Enable model parallelism
                         shift_labels[shift_labels == self.tokenizer.pad_token_id] = -100
                         shift_labels = shift_labels.to(shift_logits.device)
                         inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
                         inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
                 inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
+                # Predict the usefulness of thinking at each token position
+                thinking_usefulness = self.thinking_usefulness_head(hidden_states).squeeze(-1)
+                # Apply a threshold to decide where to generate thoughts
+                generate_thought_mask = thinking_usefulness > self.thinking_threshold
+                # Compute the regularization loss for thinking usefulness prediction
+                thinking_usefulness_loss = torch.mean(thinking_usefulness * (1 - generate_thought_mask.float()))
+                # Add the regularization loss to the total loss
+                if loss is not None:
+                    loss = loss + self.thinking_usefulness_loss_weight * thinking_usefulness_loss
+                else:
+                    loss = self.thinking_usefulness_loss_weight * thinking_usefulness_loss
                 if len(attention_mask.shape) == 2:
                     breakpoint()
                     # if shift_labels.min() == self.tokenizer.pad_token_id:
                     shift_labels = torch.where(shift_labels == self.tokenizer.pad_token_id, -100, shift_labels)
                     unreduced_loss = loss_fct(shift_logits, shift_labels)
+                    # print("Loss:", unreduced_loss.item())  # Print the loss before checking for NaN values
                     if torch.any(unreduced_loss != unreduced_loss):
+                        # pdb.set_trace()
                         raise ValueError("NaN loss")
                     unreduced_loss = unreduced_loss.reshape(logits.shape[0], -1)
                     loss_list.append(unreduced_loss)
                             else:
                                 added_reward = original_dqn_reward
                             policy_reward += added_reward
                         for action_loglikelihoods_2d in action_loglikelihoods_list:
                             train_policy_reward = policy_reward
                     else:
                         loss = cur_loss
                 loss = loss / len(loss_list)
+                loss = loss + thinking_usefulness_loss
             loss = loss * self.base_loss_beta
         if loss is not None:
             base_log_dict["loss_train"] = loss.item()
         if not self.training:
             self.n_ahead_talk = n_ahead_talk_to_restore
             self.n_passes = n_passes_to_restore
+        del start_embedding
+        del end_embedding
+        torch.cuda.empty_cache()
         return CausalLMOutputWithPast(
             loss=loss if loss is not None else None,
             logits=(rm_logits if self.n_ahead > 1 else logits) if not self.output_logits_at_the_end else logits,