Update ultravox_processing.py
ultravox_processing.py  CHANGED  (+26 −23)
@@ -20,6 +20,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         "Wav2Vec2Processor",
         "SeamlessM4TFeatureExtractor",
         "WhisperProcessor",
+        "Wav2Vec2BertProcessor",
     )
     tokenizer_class = (
         "PreTrainedTokenizer",
@@ -128,12 +129,27 @@ class UltravoxProcessor(transformers.ProcessorMixin):
         """
         # TODO: Add support for multiple audio and text inputs.
         data = {}
+        audio_embed_frames = 0
         if audio is not None and len(audio) > 0:
+            if self.audio_padding == "max_length":
+                # 30 seconds is the expected length for Whisper
+                assert sampling_rate is not None, "Sampling rate must be provided."
+                audio_len = [30 * sampling_rate] * len(audio)
+            else:
+                audio_len = [a.shape[-1] for a in audio]
+            # It's guaranteed that the number of frames is less than or equal to this amount.
+            # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
+            # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
+            nb_encoder_frames = [int(round(a / self.encoder_ds_factor + 1e-4)) for a in audio_len]
+            audio_embed_frames = [int(np.ceil(n / self.stack_factor)) for n in nb_encoder_frames]
+            data["audio_token_len"] = audio_embed_frames
+
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
                 audio,
                 sampling_rate=sampling_rate,
                 padding="longest",
+                max_length=max(audio_len),
                 return_attention_mask=True,
                 **kwargs,
             )
@@ -142,21 +158,13 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             else:
                 data["audio_values"] = x.input_values
                 data["audio_len"] = x.attention_mask.sum(-1) - 1
-            def cnn_out_len(in_len, kernel, stride=1, padding=1, dilation=1):
-                return np.floor((in_len + (2*padding) - (dilation * (kernel - 1)) - 1)/stride + 1)
-            def stack_frame_len(T):
-                T_pad = ((T + self.stack_factor - 1) // self.stack_factor) * self.stack_factor
-                return ((T_pad + self.stack_factor) // self.stack_factor).astype(int)
-            nb_encoder_frames = cnn_out_len(cnn_out_len(data["audio_len"], kernel=3), kernel=3, stride=2)
-            data["audio_token_len"] = stack_frame_len(nb_encoder_frames)
 
         if text is not None:
-            assert isinstance(
-                text, str
-            ), "Text must be a string. Batch mode not supported yet."
-            processed_text = []
+            #assert isinstance(
+            #    text, str
+            #), "Text must be a string. Batch mode not supported yet."
             data["audio_token_start_idx"] = []
-            for
+            for t in text:
                 assert self.audio_placeholder in t
                 if "audio_token_len" not in data:
                     raise ValueError(
@@ -165,24 +173,19 @@ class UltravoxProcessor(transformers.ProcessorMixin):
 
                 start_idx = len(
                     self.tokenizer.encode(
-                        t.
+                        t[: t.index(self.audio_placeholder)],
                        add_special_tokens=False,
                     )
                 )
                 data["audio_token_start_idx"].append(start_idx)
 
-
-
-
-
-                    self.audio_placeholder,
-                    self.audio_token_replacement * data["audio_token_len"][i],
-                )
-                processed_text.append(t)
-
+            # Replace the audio placeholder with the audio token.
+            # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
+            # where the number of </s> is the number of audio frames.
+            text = [t.replace(self.audio_placeholder, self.audio_token_replacement * data["audio_token_len"][i]) for i, t in enumerate(text)]
 
         # Special tokens like BOS should already have been added by the caller.
-        data.update(self.tokenizer(
+        data.update(self.tokenizer(text, add_special_tokens=False, padding=True, **kwargs))
 
         return transformers.BatchFeature(data=data, tensor_type=return_tensors)
 
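The added length computation can be checked in isolation: the number of audio tokens is the number of encoder output frames (raw audio samples divided by the encoder downsampling factor), divided by the stack factor and rounded up. Below is a minimal sketch of the same arithmetic, assuming Whisper-style values of encoder_ds_factor = 320 samples per encoder frame, stack_factor = 8, and a 16 kHz sampling rate; none of these values are stated in this diff.

import math

def audio_token_len(num_samples: int, encoder_ds_factor: int = 320, stack_factor: int = 8) -> int:
    # Mirrors the added processor code: samples -> encoder frames -> stacked audio tokens.
    # The defaults are assumed Whisper/Ultravox-style values, not taken from this diff.
    nb_encoder_frames = int(round(num_samples / encoder_ds_factor + 1e-4))
    return int(math.ceil(nb_encoder_frames / stack_factor))

# With audio_padding == "max_length", audio_len is fixed at 30 s of samples per clip:
print(audio_token_len(30 * 16000))  # 480000 samples -> 1500 frames -> 188 tokens

Because the estimate rounds up, it can over-estimate for encoders that downsample slightly differently, which is why the diff's own comment notes that StackAudioFrames pads the audio embeddings so an over-estimation does not cause issues.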
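The text-side change can also be exercised on its own: audio_token_start_idx counts the tokens that precede the placeholder in each prompt, and the placeholder is then expanded into audio_token_len copies of the replacement token. A toy sketch follows, with a whitespace split standing in for self.tokenizer.encode and with "<|audio|>" / "</s>" mirroring self.audio_placeholder and self.audio_token_replacement from the diff's comment; it is illustrative only, not the processor's actual tokenization.

# Toy stand-ins; the real processor uses its tokenizer and configured special tokens.
audio_placeholder = "<|audio|>"
audio_token_replacement = "</s>"
audio_token_len = [3]  # one entry per text, as computed from the audio above

text = ["Transcribe\n<|audio|>"]
# Whitespace split is only a stand-in for counting tokens before the placeholder.
start_idx = [len(t[: t.index(audio_placeholder)].split()) for t in text]
text = [
    t.replace(audio_placeholder, audio_token_replacement * audio_token_len[i])
    for i, t in enumerate(text)
]
print(start_idx)  # [1] with the toy whitespace "tokenizer"
print(text)       # ['Transcribe\n</s></s></s>']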