Update modeling_quiet.py
modeling_quiet.py  CHANGED  (+80 -123)
@@ -1022,9 +1022,7 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         seq_len += 1

         # Update the attention mask
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        else:
+        if attention_mask is not None:
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)

         # Generate the continuation
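The same guard-and-extend pattern recurs in the next two hunks: each time a token is appended during thought generation, the attention mask is widened by one column of ones so the new position is attended to. A minimal standalone sketch of that pattern (the shapes and variable names here are illustrative, not taken from the file):

import torch

# Illustrative shapes: a batch of 2 sequences with 5 positions already attended.
batch_size, seq_len = 2, 5
attention_mask = torch.ones((batch_size, seq_len), dtype=torch.long)

# One new token per sequence -> append a single column of ones,
# keeping dtype and device consistent with the existing mask.
if attention_mask is not None:
    new_column = torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device)
    attention_mask = torch.cat([attention_mask, new_column], dim=-1)

print(attention_mask.shape)  # torch.Size([2, 6])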
@@ -1059,11 +1057,12 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         next_token_id = torch.argmax(next_token_logits, dim=-1)

         # Append the generated token to the input sequence
-        input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+        # input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
         seq_len += 1

         # Update the attention mask
-        attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)

         # Append the end thought token to the input sequence
         end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
@@ -1071,7 +1070,8 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         seq_len += 1

         # Update the attention mask
-        attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)

         # Get the hidden states before and after the thought
         outputs_before = self.model(
@@ -1090,7 +1090,7 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         # two new tokens: last continuation token and end thought token
         outputs_after = self.model(
             input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
-            attention_mask=torch.cat([attention_mask[:, -  (removed line truncated in this view)
+            attention_mask=torch.cat([attention_mask[:, -1:], torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1),
             position_ids=position_ids,
             past_key_values=new_key_values,
             inputs_embeds=inputs_embeds,
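For context, the call above re-runs the model on just the two newly appended tokens while reusing the cached key/value states (past_key_values), rather than re-encoding the whole sequence. A generic sketch of that incremental-decoding pattern with a Hugging Face causal LM follows; the checkpoint name and variables are placeholders, not this repository's code:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; any causal LM that supports use_cache behaves the same way.
tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = tok("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    out = model(**prompt, use_cache=True)        # full prompt pass fills the cache
    past = out.past_key_values
    next_id = out.logits[:, -1:].argmax(dim=-1)  # greedy pick of the next token

    # Second pass: only the new token is fed; earlier positions come from the cache.
    out = model(input_ids=next_id, past_key_values=past, use_cache=True)

print(out.logits.shape)  # (1, 1, vocab_size)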
@@ -1110,127 +1110,25 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         # Apply the language model head to get the final logits
         logits = self.lm_head(mixed_hidden_states)
         return logits
-
-    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-    def _generate_no_beam_search(
-        self,
-        input_ids,
-        cur_len,
-        max_length,
-        min_length,
-        do_sample,
-        temperature,
-        top_k,
-        top_p,
-        repetition_penalty,
-        no_repeat_ngram_size,
-        bad_words_ids,
-        pad_token_id,
-        eos_token_id,
-        batch_size,
-        attention_mask,
-        use_cache,
-        model_kwargs,
-    ):
-        finished_generating = torch.zeros(batch_size, dtype=torch.bool, device=input_ids.device)
-        for cur_token_idx in range(max_length):
-            # Sample the next token
-            new_ids = self(
-                input_ids[~finished_generating],
-                attention_mask=attention_mask[~finished_generating]
-            )['logits']
-            # Mask out the start and end thought tokens so we don't accidentally sample them
-            new_ids[:, :, self.tokenizer.vocab_size:] = -float("inf")
-            for list_idx, answer_idx in enumerate((~finished_generating).nonzero(as_tuple=True)[0]):
-                # Find the index of the last token that is not padding
-                base_answer_ids = input_ids[answer_idx]
-                new_answer_ids = new_ids[list_idx]
-                last_token_idx = (base_answer_ids != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0].max()
-
-                new_ids_sampled = torch.multinomial(
-                    torch.nn.functional.softmax(new_answer_ids[last_token_idx] / temperature, dim=-1), 1)
-                # Assign the new id to the last token
-                if last_token_idx + 1 >= len(base_answer_ids):
-                    # Add padding everywhere
-                    new_padding = torch.full((batch_size, 1), self.tokenizer.pad_token_id, dtype=torch.long,
-                                             device=input_ids.device)
-                    input_ids = torch.cat([input_ids, new_padding], dim=-1)
-                    attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
-                attention_mask[answer_idx, last_token_idx + 1] = 1
-                input_ids[answer_idx, last_token_idx + 1] = new_ids_sampled
-                if new_ids_sampled == self.tokenizer.eos_token_id or new_ids_sampled == self.tokenizer.bos_token_id or new_ids_sampled == self.tokenizer.pad_token_id:
-                    finished_generating[answer_idx] = 1
-                # Check if the end token is generated
-                if new_ids_sampled == self.tokenizer.convert_tokens_to_ids("<|/assistant|>"):
-                    finished_generating[answer_idx] = 1
-            if finished_generating.all():
-                break
-        return input_ids
-
+
     @torch.no_grad()
     def generate(
         self,
-        input_ids=
-        ... (remaining old parameter lines not preserved in this view)
-        top_p=None,
-        repetition_penalty=None,
-        bad_words_ids=None,
-        bos_token_id=None,
-        pad_token_id=None,
-        eos_token_id=None,
-        length_penalty=None,
-        no_repeat_ngram_size=None,
-        num_return_sequences=None,
-        attention_mask=None,
-        decoder_start_token_id=None,
-        use_cache=None,
-        **model_kwargs,
-    ):
-        max_length = max_length if max_length is not None else self.config.max_length
-        min_length = min_length if min_length is not None else self.config.min_length
-        do_sample = do_sample if do_sample is not None else self.config.do_sample
-        temperature = temperature if temperature is not None else self.config.temperature
-        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
-        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
-
-        # if input_ids is None:
-        #     raise ValueError("You have to specify either input_ids")
+        input_ids: torch.LongTensor = torch.LongTensor(),
+        attention_mask: Optional[torch.Tensor] = None,
+        max_new_tokens: Optional[int] = None,
+        temperature: float = 1.1,
+        **kwargs,
+    ):
+        if isinstance(input_ids, str):
+            input_ids = self.tokenizer(input_ids, return_tensors="pt").input_ids

-        ... (two removed lines not preserved in this view)
+        if attention_mask is None:
+            # Create a default attention mask if not provided
+            attention_mask = torch.ones_like(input_ids)

-        ... (three removed lines not preserved in this view)
-        return self._generate_no_beam_search(
-            input_ids,
-            cur_len=cur_len,
-            max_length=max_length,
-            min_length=min_length,
-            do_sample=do_sample,
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            bad_words_ids=bad_words_ids,
-            pad_token_id=pad_token_id,
-            eos_token_id=eos_token_id,
-            batch_size=batch_size,
-            attention_mask=attention_mask,
-            use_cache=use_cache,
-            model_kwargs=model_kwargs,
-        )
+        from .generate import custom_generate
+        return custom_generate(self, input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, temperature=temperature, **kwargs)

     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
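With this hunk, the old in-file sampling loop is dropped and generation is delegated to custom_generate from the sibling generate module; the wrapper also accepts a plain string and tokenizes it via self.tokenizer. A hedged usage sketch, assuming a checkpoint that ships this modeling code and that the tokenizer is attached manually (the repository id is a placeholder):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id; substitute the actual checkpoint that bundles modeling_quiet.py.
model = AutoModelForCausalLM.from_pretrained("org/quiet-star-model", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("org/quiet-star-model")
model.tokenizer = tokenizer  # assumed: generate() reads self.tokenizer for string inputs

# Pass token ids with an explicit mask...
enc = tokenizer("What is 6 times 7?", return_tensors="pt")
out = model.generate(enc.input_ids, attention_mask=enc.attention_mask,
                     max_new_tokens=64, temperature=1.1)

# ...or pass the prompt string directly; a default all-ones mask is created inside.
out = model.generate("What is 6 times 7?", max_new_tokens=64)

The exact return value depends on custom_generate in generate.py, which is not shown in this diff.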
@@ -1971,6 +1869,65 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs

     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
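One detail worth noting in the method added above: position_ids are derived from the attention mask so that left-padded batches still receive correct positions. A small standalone illustration of that cumsum/masked_fill_ step (the mask values are made up for the example):

import torch

# Two left-padded sequences: 0 marks padding, 1 marks real tokens.
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

# Same recipe as in prepare_inputs_for_generation above:
position_ids = attention_mask.long().cumsum(-1) - 1  # running count of real tokens, minus one
position_ids.masked_fill_(attention_mask == 0, 1)    # padded positions get a harmless dummy value

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])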