Crystalcareai committed on
Commit 0c70c53 · verified · 1 Parent(s): ed908ff

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +104 -30
modeling_quiet.py CHANGED
@@ -1425,6 +1425,54 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
        logits = self.lm_head(mixed_hidden_states)
        return logits

+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        max_length: Optional[int] = None,
+        min_length: Optional[int] = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[bool] = None,
+        num_beams: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        repetition_penalty: Optional[float] = None,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        length_penalty: Optional[float] = None,
+        no_repeat_ngram_size: Optional[int] = None,
+        bad_words_ids: Optional[Iterable[int]] = None,
+        num_return_sequences: Optional[int] = None,
+        decoder_start_token_id: Optional[int] = None,
+        use_cache: Optional[bool] = None,
+        stopping_criteria: Optional["StoppingCriteriaList"] = None,
+        **model_kwargs,
+    ) -> torch.LongTensor:
+        # Validate stopping criteria
+        stopping_criteria = validate_stopping_criteria(stopping_criteria)
+
+        # Prepare inputs
+        input_ids = inputs["input_ids"] if "input_ids" in inputs else inputs
+        attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
+        position_ids = inputs["position_ids"] if "position_ids" in inputs else None
+        past_key_values = inputs["past_key_values"] if "past_key_values" in inputs else None
+        inputs_embeds = inputs["inputs_embeds"] if "inputs_embeds" in inputs else None
+
+        # Call the infer function
+        logits = self.infer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+        )
+
+        # Return the generated logits
+        return logits
+


    @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
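Note that although `inputs` is annotated as `Optional[torch.Tensor]`, the body of the new `generate` indexes it by string keys, so in practice a mapping such as the tokenizer's `BatchEncoding` is what flows through this branch cleanly. A standalone toy sketch of that unpacking step (the `unpack` helper and the tensors here are illustrative only, not part of the model code):

    import torch

    # Mimics `inputs["input_ids"] if "input_ids" in inputs else inputs` from the diff:
    # a dict-like encoding yields its fields; keys that are absent fall back to None.
    def unpack(inputs):
        input_ids = inputs["input_ids"] if "input_ids" in inputs else inputs
        attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
        return input_ids, attention_mask

    enc = {"input_ids": torch.tensor([[1, 2, 3]]), "attention_mask": torch.tensor([[1, 1, 1]])}
    input_ids, attention_mask = unpack(enc)
    print(input_ids.shape, attention_mask.shape)  # torch.Size([1, 3]) torch.Size([1, 3])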
 
@@ -2159,36 +2207,62 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
        return rare_token_ids


-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
-        attention_mask = kwargs.get("attention_mask", None)
-        position_ids = kwargs.get("position_ids", None)
-        inputs_embeds = kwargs.get("inputs_embeds", None)
-        use_cache = kwargs.get("use_cache", None)
-        output_attentions = kwargs.get("output_attentions", None)
-        output_hidden_states = kwargs.get("output_hidden_states", None)
-        return_dict = kwargs.get("return_dict", None)
-
-        # Call the custom infer method
-        logits = self.infer(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        # Return the prepared inputs for generation
-        return {
-            "input_ids": input_ids,
-            "logits": logits,
-            "past_key_values": past_key_values,
-            "attention_mask": attention_mask,
-            "position_ids": position_ids,
-        }
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
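The reworked `prepare_inputs_for_generation` mirrors the usual Hugging Face pattern of feeding the model only the tokens the KV cache has not yet processed. A standalone sketch of the trimming rule from branch 2 above, assuming a legacy tuple cache where `past_length` is simply the cached sequence length (toy tensors, not a call into the model itself):

    import torch

    # Toy values: a sequence of 5 tokens, 4 of which are already covered by the KV cache.
    input_ids = torch.tensor([[5, 6, 7, 8, 9]])
    past_length = 4

    # Branch 2 of the method: the cache is shorter than input_ids, so only the
    # unprocessed suffix is passed to the next forward call.
    if past_length < input_ids.shape[1]:
        input_ids = input_ids[:, past_length:]

    print(input_ids)  # tensor([[9]]) -- only the newest token is fed forward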