Update ultravox_processing.py
ultravox_processing.py
CHANGED  +18 -19
@@ -134,15 +134,15 @@ class UltravoxProcessor(transformers.ProcessorMixin):
             if self.audio_padding == "max_length":
                 # 30 seconds is the expected length for Whisper
                 assert sampling_rate is not None, "Sampling rate must be provided."
-                audio_len = 30 * sampling_rate
+                audio_len = [30 * sampling_rate] * len(audio)
             else:
-                audio_len = audio.shape[-1]
+                audio_len = [a.shape[-1] for a in audio]
             # It's guaranteed that the number of frames is less than or equal to this amount.
             # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
             # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
-            nb_encoder_frames = int(round(audio_len / self.encoder_ds_factor + 1e-4))
-            audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
-            data["audio_token_len"] = [audio_embed_frames]
+            nb_encoder_frames = [int(round(a / self.encoder_ds_factor + 1e-4)) for a in audio_len]
+            audio_embed_frames = [int(np.ceil(n / self.stack_factor)) for n in nb_encoder_frames]
+            data["audio_token_len"] = audio_embed_frames
 
             # Main audio processing. The processor is model-specific.
             x = self.audio_processor(
@@ -160,10 +160,12 @@
             data["audio_len"] = x.attention_mask.sum(-1) - 1
 
         if text is not None:
-            assert isinstance(
-                text, str
-            ), "Text must be a string. Batch mode not supported yet."
-            if self.audio_placeholder in text:
+            #assert isinstance(
+            #    text, str
+            #), "Text must be a string. Batch mode not supported yet."
+            data["audio_token_start_idx"] = []
+            for t in text:
+                assert self.audio_placeholder in t
                 if "audio_token_len" not in data:
                     raise ValueError(
                         f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
@@ -171,19 +173,16 @@
 
                 start_idx = len(
                     self.tokenizer.encode(
-                        text[: text.index(self.audio_placeholder)],
+                        t[: t.index(self.audio_placeholder)],
                         add_special_tokens=False,
                     )
                 )
-                data["audio_token_start_idx"] = [start_idx]
-
-                # Replace the audio placeholder with the audio token.
-                # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
-                # where the number of </s> is the number of audio frames.
-                text = text.replace(
-                    self.audio_placeholder,
-                    self.audio_token_replacement * audio_embed_frames,
-                )
+                data["audio_token_start_idx"].append(start_idx)
+
+            # Replace the audio placeholder with the audio token.
+            # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
+            # where the number of </s> is the number of audio frames.
+            text = [t.replace(self.audio_placeholder, self.audio_token_replacement * data["audio_token_len"][i]) for i, t in enumerate(text)]
 
             # Special tokens like BOS should already have been added by the caller.
             data.update(self.tokenizer([text], add_special_tokens=False, **kwargs))
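
For reference, the per-clip token-length arithmetic in the first hunk can be sketched standalone. This is a minimal illustration, not the processor itself: the two factors are assumptions (a Whisper-style 320-sample encoder hop and a stack factor of 8); in the real code they come from the processor's encoder_ds_factor and stack_factor attributes.

import numpy as np

# Assumed values for illustration only; the processor reads these from its config.
ENCODER_DS_FACTOR = 320
STACK_FACTOR = 8

def audio_token_len(num_samples: int) -> int:
    # One encoder frame per ENCODER_DS_FACTOR input samples; the small epsilon
    # guards against float round-off at exact multiples, as in the diff.
    nb_encoder_frames = int(round(num_samples / ENCODER_DS_FACTOR + 1e-4))
    # StackAudioFrames groups STACK_FACTOR encoder frames into one audio
    # embedding, so the token count is the ceiling of that ratio.
    return int(np.ceil(nb_encoder_frames / STACK_FACTOR))

# A 10 s clip at 16 kHz: 160_000 samples -> 500 encoder frames -> 63 tokens.
print(audio_token_len(10 * 16_000))  # 63
# With audio_padding="max_length" every clip counts as 30 s: 1500 frames -> 188 tokens.
print(audio_token_len(30 * 16_000))  # 188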
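
The per-prompt loop in the last two hunks can likewise be sketched in isolation. This is a hedged stand-alone version: gpt2's tokenizer stands in for self.tokenizer, the placeholder and replacement strings mirror the comment in the diff, and the per-clip token counts are made-up values from the length math above.

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
audio_placeholder = "<|audio|>"
audio_token_replacement = "</s>"

text = ["Transcribe\n<|audio|>", "Answer the question in the clip.\n<|audio|>"]
audio_token_len = [63, 44]  # assumed per-clip counts, one per prompt

audio_token_start_idx = []
for t in text:
    assert audio_placeholder in t
    # Number of text tokens before the placeholder = index where the audio embeddings start.
    start_idx = len(
        tokenizer.encode(t[: t.index(audio_placeholder)], add_special_tokens=False)
    )
    audio_token_start_idx.append(start_idx)

# Expand each placeholder into one replacement token per audio embedding frame.
text = [
    t.replace(audio_placeholder, audio_token_replacement * audio_token_len[i])
    for i, t in enumerate(text)
]
print(audio_token_start_idx)  # prompt tokens before each clip, tokenizer-dependent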
|