Update ultravox_model.py
ultravox_model.py  CHANGED  +1 -8
@@ -21,13 +21,10 @@ from .ultravox_config import UltravoxConfig
 class UltravoxModel(transformers.LlamaPreTrainedModel):
     """
     The Ultravox model which consists of an audio encoder and a language model.
-
     Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and
     projected to the language model's embedding space using a few linear layers.
     The text is embedded by the language model as usual and then the audio and text embeddings are merged together.
-
     A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings.
-
     Parameters:
         config: Model configuration class with all the parameters of the model.
     """
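As a reading aid (not part of this change), here is a minimal sketch of the flow the docstring above describes. The attribute names `audio_tower` and `projector` and the tensor layout are illustrative assumptions, not necessarily the model's real attributes; only the merge loop mirrors code shown later in this diff.

def embed_text_and_audio(model, input_ids, audio_values, audio_token_start_idx, audio_token_len):
    # 1. the text is embedded by the language model as usual
    inputs_embeds = model.language_model.get_input_embeddings()(input_ids)
    # 2. audio is encoded, frames are stacked, and the result is projected
    #    into the language model's embedding space
    audio_features = model.audio_tower(audio_values).last_hidden_state
    audio_embeds = model.projector(audio_features)
    # 3. the projected audio overwrites the <|audio|> placeholder span per example
    for i, (audio, start, length) in enumerate(
        zip(audio_embeds, audio_token_start_idx, audio_token_len)
    ):
        inputs_embeds[i, start : start + length] = audio[:length]
    return inputs_embeds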
@@ -159,13 +156,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
     ) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]:
         """
         Forward pass for the Ultravox model.
-
         `input_ids` are the tokenized text input. They are embedded by the language model as usual.
         `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and
         projected to the language model's embedding space using a few linear layers.
         The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start
         of the audio embeddings in the merged embeddings.
-
         Args:
             input_ids: The tokenized text input.
             audio_values: The processed audio values.
@@ -202,6 +197,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 zip(audio_embeds, audio_token_start_idx, audio_token_len)
             ):
                 inputs_embeds[i, start : start + length] = audio[:length]
+
 
         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,
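For orientation, a hedged usage sketch of the forward pass being edited here, assuming an already-instantiated UltravoxModel in `model`. The argument names come from the docstring and the merge loop above; the shapes, and the assumption that these are all keyword arguments of `forward`, are illustrative.

import torch

batch, seq_len, n_mels, n_frames = 1, 32, 80, 3000
outputs = model.forward(
    input_ids=torch.randint(0, 128, (batch, seq_len)),   # tokenized text containing <|audio|> placeholders
    audio_values=torch.randn(batch, n_mels, n_frames),   # processed audio features for the encoder
    audio_token_start_idx=torch.tensor([5]),             # where the <|audio|> span starts per example
    audio_token_len=torch.tensor([10]),                  # how many placeholder tokens the span covers
)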
@@ -453,7 +449,6 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
 class StackAudioFrames(nn.Module):
     """
     Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`.
-
     The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
     NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
     we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
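A minimal sketch, under assumed implementation details, of the pad-and-reshape stacking that yields the `ceil(T / stack_factor) + 1` output frames mentioned above. The class name and internals here are illustrative, not the actual module.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class StackAudioFramesSketch(nn.Module):
    def __init__(self, stack_factor: int = 8):
        super().__init__()
        self.stack_factor = stack_factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, d = x.shape
        # pad the time axis up to a multiple of stack_factor, plus one extra stacked
        # frame, so an over-estimated audio token count never reads past real audio
        pad = (math.ceil(t / self.stack_factor) + 1) * self.stack_factor - t
        x = F.pad(x, (0, 0, 0, pad))
        # concatenate every `stack_factor` consecutive frames along the feature dim
        return x.reshape(b, -1, d * self.stack_factor)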
@@ -514,13 +509,11 @@ class UltravoxProjector(nn.Sequential):
 class ModifiedWhisperEncoder(whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin):
     """
     Encoder portion of OpenAI's Whisper model.
-
     This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes:
     1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder
     2. allow less than 30 second of audio padding to be passed in:
         - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal
         - embed_pos is now sliced to match the length of `inputs_embeds`
-
     Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
     """
 
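To make the two "shorter than 30 seconds" fixes concrete, here is a simplified sketch of the affected part of the encoder's forward. It follows the general shape of HF's Whisper encoder (conv1/conv2 feature extractor plus learned positional embeddings) but is an assumption-laden illustration, not the exact implementation.

import torch
import torch.nn as nn

def encode_prefix(input_features: torch.Tensor,
                  conv1: nn.Conv1d, conv2: nn.Conv1d,
                  embed_positions: nn.Embedding,
                  expected_seq_length: int) -> torch.Tensor:
    # fix 2a: only reject inputs *longer* than 30 s instead of requiring exactly 30 s
    if input_features.shape[-1] > expected_seq_length:
        raise ValueError(
            f"expected at most {expected_seq_length} mel frames, got {input_features.shape[-1]}"
        )
    inputs_embeds = nn.functional.gelu(conv1(input_features))
    inputs_embeds = nn.functional.gelu(conv2(inputs_embeds)).permute(0, 2, 1)
    # fix 2b: slice the positional embeddings to the actual (possibly shorter) length
    embed_pos = embed_positions.weight[: inputs_embeds.shape[1]]
    return inputs_embeds + embed_pos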