AlexHung29629 committed (verified)
Commit: 921ebab
Parent(s): fc81040

Update ultravox_model.py

Files changed (1)
  1. ultravox_model.py +1 -8
ultravox_model.py CHANGED
@@ -21,13 +21,10 @@ from .ultravox_config import UltravoxConfig
 class UltravoxModel(transformers.LlamaPreTrainedModel):
     """
     The Ultravox model which consists of an audio encoder and a language model.
-
     Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and
     projected to the language model's embedding space using a few linear layers.
     The text is embedded by the language model as usual and then the audio and text embeddings are merged together.
-
     A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings.
-
     Parameters:
         config: Model configuration class with all the parameters of the model.
     """
@@ -159,13 +156,11 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
     ) -> Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]:
         """
         Forward pass for the Ultravox model.
-
         `input_ids` are the tokenized text input. They are embedded by the language model as usual.
         `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and
         projected to the language model's embedding space using a few linear layers.
         The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start
         of the audio embeddings in the merged embeddings.
-
         Args:
             input_ids: The tokenized text input.
             audio_values: The processed audio values.
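
Before the merge, the `<|audio|>` token has to occupy as many positions in `input_ids` as there will be audio embeddings. A small self-contained sketch of how such a placeholder expansion could look (hypothetical helper; in the real pipeline this step is presumably handled by the Ultravox processor rather than the model):

def expand_audio_placeholder(token_ids: list[int], audio_token_id: int,
                             audio_token_len: int) -> tuple[list[int], int]:
    # Replace the single <|audio|> token with `audio_token_len` copies of itself and
    # return the start index that forward() would receive as `audio_token_start_idx`.
    start = token_ids.index(audio_token_id)
    expanded = token_ids[:start] + [audio_token_id] * audio_token_len + token_ids[start + 1:]
    return expanded, start
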
@@ -202,6 +197,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
             zip(audio_embeds, audio_token_start_idx, audio_token_len)
         ):
             inputs_embeds[i, start : start + length] = audio[:length]
+
 
         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,
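
The only added line in this commit is the blank line after the splice above. As a toy illustration of what that splice does (hypothetical shapes, not code from this file):

import torch

# Two sequences of length 10 with hidden size 4; each has its own audio segment.
inputs_embeds = torch.zeros(2, 10, 4)
audio_embeds = [torch.ones(3, 4), 2 * torch.ones(5, 4)]
audio_token_start_idx = [1, 4]
audio_token_len = [3, 5]

for i, (audio, start, length) in enumerate(
    zip(audio_embeds, audio_token_start_idx, audio_token_len)
):
    # `audio[:length]` guards against an over-estimated token count overrunning the placeholder span.
    inputs_embeds[i, start : start + length] = audio[:length]
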
@@ -453,7 +449,6 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
 class StackAudioFrames(nn.Module):
     """
     Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`.
-
     The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
     NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
     we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
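
A minimal sketch of the stacking behaviour that docstring describes, assuming the frames are padded up to a multiple of `stack_factor` plus one extra frame (hypothetical class name, not the implementation in this file):

import math
import torch
from torch import nn

class StackFramesSketch(nn.Module):
    def __init__(self, stack_factor: int = 8):
        super().__init__()
        self.stack_factor = stack_factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, T, channels). Pad T up to (ceil(T / stack_factor) + 1) * stack_factor,
        # then fold every `stack_factor` consecutive frames into one wider frame.
        b, t, c = x.shape
        t_padded = (math.ceil(t / self.stack_factor) + 1) * self.stack_factor
        x = torch.nn.functional.pad(x, (0, 0, 0, t_padded - t))
        return x.reshape(b, t_padded // self.stack_factor, c * self.stack_factor)
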
@@ -514,13 +509,11 @@ class UltravoxProjector(nn.Sequential):
 class ModifiedWhisperEncoder(whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin):
     """
     Encoder portion of OpenAI's Whisper model.
-
     This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes:
     1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder
     2. allow less than 30 second of audio padding to be passed in:
        - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal
        - embed_pos is now sliced to match the length of `inputs_embeds`
-
     Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
     """
 
 
 