Update ultravox_model.py
ultravox_model.py  CHANGED  +1 -8
@@ -21,13 +21,10 @@ from .ultravox_config import UltravoxConfig
 class UltravoxModel(transformers.LlamaPreTrainedModel):
     """
     The Ultravox model which consists of an audio encoder and a language model.
-
     Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and
     projected to the language model's embedding space using a few linear layers.
     The text is embedded by the language model as usual and then the audio and text embeddings are merged together.
-
     A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings.
-
     Parameters:
         config: Model configuration class with all the parameters of the model.
     """
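As a reading aid (not part of this change), here is a minimal sketch of the flow the docstring above describes. The attribute names `audio_tower` and `projector` and the tensor layout are illustrative assumptions, not necessarily the model's real attributes; only the merge loop mirrors code shown later in this diff.

def embed_text_and_audio(model, input_ids, audio_values, audio_token_start_idx, audio_token_len):
    # 1. the text is embedded by the language model as usual
    inputs_embeds = model.language_model.get_input_embeddings()(input_ids)
    # 2. audio is encoded, frames are stacked, and the result is projected
    #    into the language model's embedding space
    audio_features = model.audio_tower(audio_values).last_hidden_state
    audio_embeds = model.projector(audio_features)
    # 3. the projected audio overwrites the <|audio|> placeholder span per example
    for i, (audio, start, length) in enumerate(
        zip(audio_embeds, audio_token_start_idx, audio_token_len)
    ):
        inputs_embeds[i, start : start + length] = audio[:length]
    return inputs_embeds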
@@ -159,13 +156,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
     ) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]:
         """
         Forward pass for the Ultravox model.
-
         `input_ids` are the tokenized text input. They are embedded by the language model as usual.
         `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and
         projected to the language model's embedding space using a few linear layers.
         The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start
         of the audio embeddings in the merged embeddings.
-
         Args:
             input_ids: The tokenized text input.
             audio_values: The processed audio values.
@@ -202,6 +197,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 zip(audio_embeds, audio_token_start_idx, audio_token_len)
             ):
                 inputs_embeds[i, start : start + length] = audio[:length]
+
 
         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,
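For orientation, a hedged usage sketch of the forward pass being edited here, assuming an already-instantiated UltravoxModel in `model`. The argument names come from the docstring and the merge loop above; the shapes, and the assumption that these are all keyword arguments of `forward`, are illustrative.

import torch

batch, seq_len, n_mels, n_frames = 1, 32, 80, 3000
outputs = model.forward(
    input_ids=torch.randint(0, 128, (batch, seq_len)),   # tokenized text containing <|audio|> placeholders
    audio_values=torch.randn(batch, n_mels, n_frames),   # processed audio features for the encoder
    audio_token_start_idx=torch.tensor([5]),             # where the <|audio|> span starts per example
    audio_token_len=torch.tensor([10]),                  # how many placeholder tokens the span covers
)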
@@ -453,7 +449,6 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
 class StackAudioFrames(nn.Module):
     """
     Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`.
-
     The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
     NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
     we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
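A minimal sketch, under assumed implementation details, of the pad-and-reshape stacking that yields the `ceil(T / stack_factor) + 1` output frames mentioned above. The class name and internals here are illustrative, not the actual module.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class StackAudioFramesSketch(nn.Module):
    def __init__(self, stack_factor: int = 8):
        super().__init__()
        self.stack_factor = stack_factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, d = x.shape
        # pad the time axis up to a multiple of stack_factor, plus one extra stacked
        # frame, so an over-estimated audio token count never reads past real audio
        pad = (math.ceil(t / self.stack_factor) + 1) * self.stack_factor - t
        x = F.pad(x, (0, 0, 0, pad))
        # concatenate every `stack_factor` consecutive frames along the feature dim
        return x.reshape(b, -1, d * self.stack_factor)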
@@ -514,13 +509,11 @@ class UltravoxProjector(nn.Sequential):
 class ModifiedWhisperEncoder(whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin):
     """
     Encoder portion of OpenAI's Whisper model.
-
     This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes:
     1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder
     2. allow less than 30 second of audio padding to be passed in:
         - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal
         - embed_pos is now sliced to match the length of `inputs_embeds`
-
     Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
     """
 
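To make the two "shorter than 30 seconds" fixes concrete, here is a simplified sketch of the affected part of the encoder's forward. It follows the general shape of HF's Whisper encoder (conv1/conv2 feature extractor plus learned positional embeddings) but is an assumption-laden illustration, not the exact implementation.

import torch
import torch.nn as nn

def encode_prefix(input_features: torch.Tensor,
                  conv1: nn.Conv1d, conv2: nn.Conv1d,
                  embed_positions: nn.Embedding,
                  expected_seq_length: int) -> torch.Tensor:
    # fix 2a: only reject inputs *longer* than 30 s instead of requiring exactly 30 s
    if input_features.shape[-1] > expected_seq_length:
        raise ValueError(
            f"expected at most {expected_seq_length} mel frames, got {input_features.shape[-1]}"
        )
    inputs_embeds = nn.functional.gelu(conv1(input_features))
    inputs_embeds = nn.functional.gelu(conv2(inputs_embeds)).permute(0, 2, 1)
    # fix 2b: slice the positional embeddings to the actual (possibly shorter) length
    embed_pos = embed_positions.weight[: inputs_embeds.shape[1]]
    return inputs_embeds + embed_pos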