Update ultravox_processing.py
ultravox_processing.py CHANGED (+26 -31)

```diff
@@ -10,7 +10,6 @@ from .ultravox_config import UltravoxConfig
 class UltravoxProcessor(transformers.ProcessorMixin):
     """
     Constructs an Ultravox processor which wraps an audio processor and a tokenizer into a single processor.
-
     Args:
         audio_processor: The audio processor for the audio encoder.
         tokenizer: The tokenizer for the language model.
@@ -100,7 +99,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
         audio processor's [`~Wav2Vec2Processor.__call__`] if `audio` is not `None`. Please refer to the docstring
         of the above two methods for more information.
-
         Args:
             text (`str`, `List[str]`):
                 The sequence to be encoded. Sequence can be a string or (pretokenized string).
@@ -113,15 +111,12 @@ class UltravoxProcessor(transformers.ProcessorMixin):
                 you are doing.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
                 - `'tf'`: Return TensorFlow `tf.constant` objects.
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
                 - `'np'`: Return NumPy `np.ndarray` objects.
                 - `'jax'`: Return JAX `jnp.ndarray` objects.
-
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
@@ -133,7 +128,6 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         """
         # TODO: Add support for multiple audio and text inputs.
         data = {}
-        audio_embed_frames = 0
         if audio is not None and len(audio) > 0:
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
@@ -151,10 +145,10 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
                 return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
             def stack_frame_len(T):
-                T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
-                return
-            nb_encoder_frames =
-            data["audio_token_len"] =
+                T_pad = ((T + self.stack_factor - 1) // self.stack_factor) * self.stack_factor
+                return ((T_pad + self.stack_factor) // self.stack_factor).astype(int)
+            nb_encoder_frames = cnn_out_len(cnn_out_len(data["audio_len"], kernel=3), kernel=3, stride=2)
+            data["audio_token_len"] = stack_frame_len(nb_encoder_frames)
 
         if text is not None:
             assert isinstance(
@@ -162,29 +156,30 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             ), "Text must be a list."
             processed_text = []
             data["audio_token_start_idx"] = []
-            for t in text:
-                assert self.audio_placeholder in t
-                if "audio_token_len" not in data:
-                    raise ValueError(
-                        f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
-                    )
-
-                start_idx = len(
-                    self.tokenizer.encode(
-                        t[: t.index(self.audio_placeholder)],
-                        add_special_tokens=False,
-                    )
-                )
-                data["audio_token_start_idx"].append(start_idx)
-
-                # Replace the audio placeholder with the audio token.
-                # e.g. "Transcribe <|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
-                # where the number of </s> is the number of audio frames.
-                t = t.replace(
-                    self.audio_placeholder,
-                    self.audio_token_replacement * audio_embed_frames,
-                )
-                processed_text.append(t)
+            for i, t in enumerate(text):
+                assert self.audio_placeholder in t
+                if "audio_token_len" not in data:
+                    raise ValueError(
+                        f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
+                    )
+
+                start_idx = len(
+                    self.tokenizer.encode(
+                        t.split(self.audio_placeholder)[0],
+                        add_special_tokens=False,
+                    )
+                )
+                data["audio_token_start_idx"].append(start_idx)
+
+                # Replace the audio placeholder with the audio token.
+                # e.g. "Transcribe <|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
+                # where the number of </s> is the number of audio frames.
+                t = t.replace(
+                    self.audio_placeholder,
+                    self.audio_token_replacement * data["audio_token_len"][i],
+                )
+                processed_text.append(t)
+
 
         # Special tokens like BOS should already have been added by the caller.
         data.update(self.tokenizer(processed_text, add_special_tokens=False, padding='longest', **kwargs))
```
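For context on the new length arithmetic: the hunk above derives `audio_token_len` by pushing the mel-frame count through the encoder's two convolutions and then the stack/downsample step. Below is a minimal standalone sketch of that math, assuming a Whisper-style conv stem (kernel 3, padding 1, strides 1 and 2) and a hypothetical `stack_factor` of 8; the real values come from the model config.

```python
import numpy as np

STACK_FACTOR = 8  # assumption for illustration; the processor reads self.stack_factor

def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
    # Standard Conv1d output-length formula, as in the diff.
    return np.floor((in_len + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)

def stack_frame_len(T):
    # Pad T up to a multiple of STACK_FACTOR, then count stacked frames.
    T_pad = ((T + STACK_FACTOR - 1) // STACK_FACTOR) * STACK_FACTOR
    return ((T_pad + STACK_FACTOR) // STACK_FACTOR).astype(int)

audio_len = np.array([3000.0])  # 30 s of audio at 100 mel frames per second
nb_encoder_frames = cnn_out_len(cnn_out_len(audio_len, kernel=3), kernel=3, stride=2)
print(nb_encoder_frames)                   # [1500.]
print(stack_frame_len(nb_encoder_frames))  # [189]
```

Note that `(T_pad + STACK_FACTOR) // STACK_FACTOR` reserves one frame beyond the plain ceiling division: with `stack_factor = 8`, 1500 encoder frames pad to 1504 and yield 189 tokens rather than 188.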
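The rewritten text loop does two things per prompt: it records `audio_token_start_idx` as the number of tokens in the text before the placeholder, and it expands the placeholder into `data["audio_token_len"][i]` copies of the replacement token. A toy illustration of the same two steps, using GPT-2's tokenizer and its EOS token as stand-ins for Ultravox's actual tokenizer and replacement token:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
audio_placeholder = "<|audio|>"
audio_token_replacement = tokenizer.eos_token      # stand-in filler token
audio_token_len = 8                                # would be data["audio_token_len"][i]

t = f"Transcribe {audio_placeholder}"

# The first audio token lands right after the tokens of the preceding text.
start_idx = len(
    tokenizer.encode(t.split(audio_placeholder)[0], add_special_tokens=False)
)

# Splice in one filler token per audio frame.
t = t.replace(audio_placeholder, audio_token_replacement * audio_token_len)

print(start_idx)  # token count of "Transcribe "
print(t)          # "Transcribe " followed by 8 filler tokens
```

This is also why the loop now raises when no audio was passed: without `audio_token_len` there is no way to know how many filler tokens to splice in.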
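The final context line ("Special tokens like BOS should already have been added by the caller") matters for this bookkeeping: both the `tokenizer.encode` prefix measurement and the final `self.tokenizer(...)` call use `add_special_tokens=False`, so the recorded start index stays aligned with the full sequence. A small sanity check of that invariant, again with GPT-2 as a hypothetical stand-in:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical stand-in
filler = tokenizer.eos_token
prefix = "Transcribe "
processed = prefix + filler * 8  # text after placeholder expansion

start_idx = len(tokenizer.encode(prefix, add_special_tokens=False))
input_ids = tokenizer(processed, add_special_tokens=False)["input_ids"]
filler_id = tokenizer.convert_tokens_to_ids(filler)

# With add_special_tokens=False on both calls, the filler span sits exactly
# at [start_idx, start_idx + 8); a BOS inserted by the tokenizer would shift it.
assert input_ids[start_idx : start_idx + 8] == [filler_id] * 8
```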