farzadab committed on
Commit
5e75763
·
verified ·
1 Parent(s): e3c2ab5

Upload 5 files

Browse files
Files changed (1) hide show
  1. ultravox_config.py +13 -6
ultravox_config.py CHANGED
@@ -32,6 +32,8 @@ class LossFunction(str, Enum):
32
  class LossConfig:
33
  loss_function: LossFunction = LossFunction.CrossEntropy
34
  kl_temperature: float = 2.0
 
 
35
 
36
  @property
37
  def requires_alt_fields(self):
@@ -47,7 +49,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
47
  documentation from [`PretrainedConfig`] for more information.
48
 
49
  Args:
50
- audio_config (`Wav2Vec2Config`, *optional*):
51
  Custom audio config or dict
52
  text_config (`Union[AutoConfig, dict]`, *optional*):
53
  The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
@@ -72,10 +74,10 @@ class UltravoxConfig(transformers.PretrainedConfig):
72
  Example:
73
 
74
  ```python
75
- >>> from transformers import UltravoxForConditionalGeneration, Wav2Vec2Config, UltravoxConfig, LlamaConfig
76
 
77
  >>> # Initializing an audio encoder config
78
- >>> audio_config = Wav2Vec2Config()
79
 
80
  >>> # Initializing a Llama config
81
  >>> text_config = LlamaConfig()
@@ -84,13 +86,13 @@ class UltravoxConfig(transformers.PretrainedConfig):
84
  >>> configuration = UltravoxConfig(audio_config, text_config)
85
 
86
  >>> # Initializing a completely untrained model from the configuration
87
- >>> model = UltravoxForConditionalGeneration(configuration)
88
 
89
  >>> # Accessing the model configuration
90
  >>> configuration = model.config
91
 
92
  >>> # Initialize a model from pretrained checkpoints and random projector weights
93
- >>> config = UltravoxConfig(audio_model_id="facebook/wav2vec2-base-960h", text_model_id="meta-llama/Llama-2-7b-chat-hf")
94
  ```"""
95
 
96
  model_type = "ultravox"
@@ -140,7 +142,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
140
  else:
141
  audio_config = audio_config or {}
142
  self.audio_config = transformers.CONFIG_MAPPING[
143
- audio_config.get("model_type", "wav2vec2")
144
  ](**audio_config)
145
 
146
  self.text_model_lora_config = (
@@ -167,7 +169,12 @@ class UltravoxConfig(transformers.PretrainedConfig):
167
  # remove text_config and audio_config if text_model_id and audio_model_id are present
168
  if self.text_model_id is not None:
169
  diff_dict.pop("text_config", None)
 
 
 
170
  if self.audio_model_id is not None:
171
  diff_dict.pop("audio_config", None)
 
 
172
 
173
  return diff_dict
 
32
  class LossConfig:
33
  loss_function: LossFunction = LossFunction.CrossEntropy
34
  kl_temperature: float = 2.0
35
+ # Number of tokens to ignore from the beginning of the sequence. Only used in LSM
36
+ initial_tokens_to_ignore: int = 0
37
 
38
  @property
39
  def requires_alt_fields(self):
 
49
  documentation from [`PretrainedConfig`] for more information.
50
 
51
  Args:
52
+ audio_config (`WhisperConfig`, *optional*):
53
  Custom audio config or dict
54
  text_config (`Union[AutoConfig, dict]`, *optional*):
55
  The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
 
74
  Example:
75
 
76
  ```python
77
+ >>> from transformers import UltravoxModel, WhisperConfig, UltravoxConfig, LlamaConfig
78
 
79
  >>> # Initializing an audio encoder config
80
+ >>> audio_config = WhisperConfig()
81
 
82
  >>> # Initializing a Llama config
83
  >>> text_config = LlamaConfig()
 
86
  >>> configuration = UltravoxConfig(audio_config, text_config)
87
 
88
  >>> # Initializing a completely untrained model from the configuration
89
+ >>> model = UltravoxModel(configuration)
90
 
91
  >>> # Accessing the model configuration
92
  >>> configuration = model.config
93
 
94
  >>> # Initialize a model from pretrained checkpoints and random projector weights
95
+ >>> config = UltravoxConfig(audio_model_id="openai/whisper-tiny", text_model_id="meta-llama/Llama-2-7b-chat-hf")
96
  ```"""
97
 
98
  model_type = "ultravox"
 
142
  else:
143
  audio_config = audio_config or {}
144
  self.audio_config = transformers.CONFIG_MAPPING[
145
+ audio_config.get("model_type", "whisper")
146
  ](**audio_config)
147
 
148
  self.text_model_lora_config = (
 
169
  # remove text_config and audio_config if text_model_id and audio_model_id are present
170
  if self.text_model_id is not None:
171
  diff_dict.pop("text_config", None)
172
+ elif "text_config" in diff_dict:
173
+ diff_dict["text_config"].pop("_attn_implementation_autoset", None)
174
+
175
  if self.audio_model_id is not None:
176
  diff_dict.pop("audio_config", None)
177
+ elif "audio_config" in diff_dict:
178
+ diff_dict["audio_config"].pop("_attn_implementation_autoset", None)
179
 
180
  return diff_dict