README.md CHANGED
@@ -7,7 +7,7 @@ sdk: gradio
 sdk_version: 5.29.1
 app_file: app.py
 pinned: false
-short_description: ultra-fast video model, LTX 0.9.8 13B distilled
+short_description: ultra-fast video model, LTX 0.9.7 13B distilled
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -24,7 +24,7 @@ from inference import (
 from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline, LTXVideoPipeline
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 
-config_file_path = "configs/ltxv-13b-0.9.8-distilled.yaml"
+config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
 
@@ -141,47 +141,12 @@ def get_duration(prompt, negative_prompt, input_image_filepath, input_video_file
     return 60
 
 @spaces.GPU(duration=get_duration)
-def generate(prompt, negative_prompt, input_image_filepath=None, input_video_filepath=None,
-             height_ui=512, width_ui=704, mode="text-to-video",
-             duration_ui=2.0,
-             ui_frames_to_use=9,
-             seed_ui=42, randomize_seed=True, ui_guidance_scale=3.0, improve_texture_flag=True,
+def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
+             height_ui, width_ui, mode,
+             duration_ui,
+             ui_frames_to_use,
+             seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
              progress=gr.Progress(track_tqdm=True)):
-    """
-    Generate high-quality videos using LTX Video model with support for text-to-video, image-to-video, and video-to-video modes.
-
-    Args:
-        prompt (str): Text description of the desired video content. Required for all modes.
-        negative_prompt (str): Text describing what to avoid in the generated video. Optional, can be empty string.
-        input_image_filepath (str or None): Path to input image file. Required for image-to-video mode, None for other modes.
-        input_video_filepath (str or None): Path to input video file. Required for video-to-video mode, None for other modes.
-        height_ui (int): Height of the output video in pixels, must be divisible by 32. Default: 512.
-        width_ui (int): Width of the output video in pixels, must be divisible by 32. Default: 704.
-        mode (str): Generation mode. Required. One of "text-to-video", "image-to-video", or "video-to-video". Default: "text-to-video".
-        duration_ui (float): Duration of the output video in seconds. Range: 0.3 to 8.5. Default: 2.0.
-        ui_frames_to_use (int): Number of frames to use from input video. Only used in video-to-video mode. Must be N*8+1. Default: 9.
-        seed_ui (int): Random seed for reproducible generation. Range: 0 to 2^32-1. Default: 42.
-        randomize_seed (bool): Whether to use a random seed instead of seed_ui. Default: True.
-        ui_guidance_scale (float): CFG scale controlling prompt influence. Range: 1.0 to 10.0. Higher values = stronger prompt influence. Default: 3.0.
-        improve_texture_flag (bool): Whether to use multi-scale generation for better texture quality. Slower but higher quality. Default: True.
-        progress (gr.Progress): Progress tracker for the generation process. Optional, used for UI updates.
-
-    Returns:
-        tuple: A tuple containing (output_video_path, used_seed) where output_video_path is the path to the generated video file and used_seed is the actual seed used for generation.
-    """
-
-    # Validate mode-specific required parameters
-    if mode == "image-to-video":
-        if not input_image_filepath:
-            raise gr.Error("input_image_filepath is required for image-to-video mode")
-    elif mode == "video-to-video":
-        if not input_video_filepath:
-            raise gr.Error("input_video_filepath is required for video-to-video mode")
-    elif mode == "text-to-video":
-        # No additional file inputs required for text-to-video
-        pass
-    else:
-        raise gr.Error(f"Invalid mode: {mode}. Must be one of: text-to-video, image-to-video, video-to-video")
 
     if randomize_seed:
         seed_ui = random.randint(0, 2**32 - 1)
@@ -374,9 +339,9 @@ css="""
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# LTX Video 0.9.8 13B Distilled")
-    gr.Markdown("Fast high quality video generation.**Update (17/07):** now with the new v0.9.8 for improved prompt understanding and detail generation" )
-    gr.Markdown("[Model](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.8-distilled.safetensors) [GitHub](https://github.com/Lightricks/LTX-Video) [Diffusers](https://huggingface.co/Lightricks/LTX-Video-0.9.8-13B-distilled#diffusers-🧨)")
+    gr.Markdown("# LTX Video 0.9.7 Distilled")
+    gr.Markdown("Fast high quality video generation. [Model](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.7-distilled.safetensors) [GitHub](https://github.com/Lightricks/LTX-Video) [Diffusers](#)")
+
     with gr.Row():
         with gr.Column():
             with gr.Tab("image-to-video") as image_tab:
@@ -404,7 +369,7 @@ with gr.Blocks(css=css) as demo:
                 step=0.1,
                 info=f"Target video duration (0.3s to 8.5s)"
             )
-            improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True,visible=False, info="Uses a two-pass generation for better quality, but is slower. Recommended for final output.")
+            improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True, info="Uses a two-pass generation for better quality, but is slower. Recommended for final output.")
 
         with gr.Column():
             output_video = gr.Video(label="Generated Video", interactive=False)
@@ -416,7 +381,7 @@ with gr.Blocks(css=css) as demo:
             with gr.Row():
                 seed_input = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1)
                 randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=True)
-            with gr.Row(visible=False):
+            with gr.Row():
                 guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
             with gr.Row():
                 height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
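The UI wires several of its defaults straight out of the pipeline YAML; for example, the CFG slider default comes from `first_pass.guidance_scale`. A minimal, standalone sketch of that pattern, assuming the same file layout as the Space:

```python
# Minimal sketch of how app.py loads the distilled pipeline config and derives a
# UI default from it. Paths and keys mirror the diff above; this only runs where
# the config file is actually present.
import yaml

config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
with open(config_file_path, "r") as file:
    PIPELINE_CONFIG_YAML = yaml.safe_load(file)

# Same expression the Guidance Scale slider uses for its default value:
# fall back to 1.0 when the key is missing.
default_cfg = PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0)
print("default CFG:", default_cfg)
```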
configs/ltxv-13b-0.9.8-dev-fp8.yaml DELETED
@@ -1,34 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-dev-fp8.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
-  stg_scale: [0, 0, 4, 4, 4, 2, 1]
-  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
-  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
-  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
-  num_inference_steps: 30
-  skip_final_inference_steps: 3
-  cfg_star_rescale: true
-
-second_pass:
-  guidance_scale: [1]
-  stg_scale: [1]
-  rescaling_scale: [1]
-  guidance_timesteps: [1.0]
-  skip_block_list: [27]
-  num_inference_steps: 30
-  skip_initial_inference_steps: 17
-  cfg_star_rescale: true
configs/ltxv-13b-0.9.8-dev.yaml DELETED
@@ -1,34 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-dev.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "bfloat16"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
-  stg_scale: [0, 0, 4, 4, 4, 2, 1]
-  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
-  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
-  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
-  num_inference_steps: 30
-  skip_final_inference_steps: 3
-  cfg_star_rescale: true
-
-second_pass:
-  guidance_scale: [1]
-  stg_scale: [1]
-  rescaling_scale: [1]
-  guidance_timesteps: [1.0]
-  skip_block_list: [27]
-  num_inference_steps: 30
-  skip_initial_inference_steps: 17
-  cfg_star_rescale: true
configs/ltxv-13b-0.9.8-distilled-fp8.yaml DELETED
@@ -1,29 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-distilled-fp8.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-  tone_map_compression_ratio: 0.6
configs/ltxv-13b-0.9.8-distilled.yaml DELETED
@@ -1,29 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-distilled.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "bfloat16"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-  tone_map_compression_ratio: 0.6
configs/ltxv-2b-0.9.8-distilled-fp8.yaml DELETED
@@ -1,28 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-2b-0.9.8-distilled-fp8.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
configs/ltxv-2b-0.9.8-distilled.yaml DELETED
@@ -1,28 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-2b-0.9.8-distilled.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "bfloat16"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
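The deleted configs above come in two flavors: the dev variants schedule denoising with `num_inference_steps`, while the distilled variants pin a short, explicit `timesteps` list per pass. A small sketch (assuming a local copy of one of these YAML files) that reports which style a config uses:

```python
# Hedged sketch: inspect a pipeline YAML of the shape shown above and report how
# each pass is scheduled. The filename is illustrative; any of the deleted configs
# (or their 0.9.7 counterparts) would work.
import yaml

with open("configs/ltxv-13b-0.9.8-distilled.yaml", "r") as f:
    cfg = yaml.safe_load(f)

for pass_name in ("first_pass", "second_pass"):
    pass_cfg = cfg.get(pass_name, {})
    if "timesteps" in pass_cfg:
        print(f"{pass_name}: {len(pass_cfg['timesteps'])} fixed timesteps (distilled style)")
    else:
        print(f"{pass_name}: num_inference_steps = {pass_cfg.get('num_inference_steps')} (dev style)")
```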
ltx_video/models/autoencoders/causal_video_autoencoder.py CHANGED
@@ -235,7 +235,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
                     "compress_time",
                     "compress_all",
                     "compress_all_res",
-                    "compress_time_res",
+                    "compress_space_res",
                 ]
             ]
         )
@@ -608,7 +608,7 @@ class Decoder(nn.Module):
             block_params = block_params if isinstance(block_params, dict) else {}
             if block_name == "res_x_y":
                 output_channel = output_channel * block_params.get("multiplier", 2)
-            if block_name.startswith("compress"):
+            if block_name == "compress_all":
                 output_channel = output_channel * block_params.get("multiplier", 1)
 
         self.conv_in = make_conv_nd(
@@ -1303,15 +1303,20 @@ def create_video_autoencoder_demo_config(
     encoder_blocks = [
         ("res_x", {"num_layers": 2}),
        ("compress_space_res", {"multiplier": 2}),
+        ("res_x", {"num_layers": 2}),
         ("compress_time_res", {"multiplier": 2}),
+        ("res_x", {"num_layers": 1}),
         ("compress_all_res", {"multiplier": 2}),
+        ("res_x", {"num_layers": 1}),
         ("compress_all_res", {"multiplier": 2}),
         ("res_x", {"num_layers": 1}),
     ]
     decoder_blocks = [
         ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
+        ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
+        ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
         ("res_x", {"num_layers": 2, "inject_noise": False}),
     ]
ltx_video/models/transformers/attention.py CHANGED
@@ -205,6 +205,7 @@ class BasicTransformerBlock(nn.Module):
         timestep: Optional[torch.LongTensor] = None,
         cross_attention_kwargs: Dict[str, Any] = None,
         class_labels: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
         skip_layer_mask: Optional[torch.Tensor] = None,
         skip_layer_strategy: Optional[SkipLayerStrategy] = None,
     ) -> torch.FloatTensor:
ltx_video/models/transformers/transformer3d.py CHANGED
@@ -268,7 +268,7 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             for key, value in state_dict.items()
             if key.startswith("model.diffusion_model.")
         }
-        super().load_state_dict(state_dict, *args, **kwargs)
+        super().load_state_dict(state_dict, **kwargs)
 
     @classmethod
     def from_pretrained(
ltx_video/pipelines/pipeline_ltx_video.py CHANGED
@@ -45,6 +45,11 @@ from ltx_video.models.autoencoders.vae_encode import (
 )
 
 
+try:
+    import torch_xla.distributed.spmd as xs
+except ImportError:
+    xs = None
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -790,7 +795,6 @@ class LTXVideoPipeline(DiffusionPipeline):
         text_encoder_max_tokens: int = 256,
         stochastic_sampling: bool = False,
         media_items: Optional[torch.Tensor] = None,
-        tone_map_compression_ratio: float = 0.0,
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         """
@@ -872,8 +876,6 @@ class LTXVideoPipeline(DiffusionPipeline):
                 If set to `True`, the sampling is stochastic. If set to `False`, the sampling is deterministic.
             media_items ('torch.Tensor', *optional*):
                 The input media item used for image-to-image / video-to-video.
-            tone_map_compression_ratio: compression ratio for tone mapping, defaults to 0.0.
-                If set to 0.0, no tone mapping is applied. If set to 1.0 - full compression is applied.
         Examples:
 
         Returns:
@@ -976,6 +978,10 @@
                 guidance_scale[guidance_mapping[i]] for i in range(len(timesteps))
             ]
 
+        # For simplicity, we are using a constant num_conds for all timesteps, so we need to zero
+        # out cases where the guidance scale should not be applied.
+        guidance_scale = [x if x > 1.0 else 0.0 for x in guidance_scale]
+
         if not isinstance(stg_scale, List):
             stg_scale = [stg_scale] * len(timesteps)
         else:
@@ -988,6 +994,16 @@
                 rescaling_scale[guidance_mapping[i]] for i in range(len(timesteps))
             ]
 
+        do_classifier_free_guidance = any(x > 1.0 for x in guidance_scale)
+        do_spatio_temporal_guidance = any(x > 0.0 for x in stg_scale)
+        do_rescaling = any(x != 1.0 for x in rescaling_scale)
+
+        num_conds = 1
+        if do_classifier_free_guidance:
+            num_conds += 1
+        if do_spatio_temporal_guidance:
+            num_conds += 1
+
         # Normalize skip_block_list to always be None or a list of lists matching timesteps
         if skip_block_list is not None:
             # Convert single list to list of lists if needed
@@ -999,6 +1015,17 @@
                 new_skip_block_list.append(skip_block_list[guidance_mapping[i]])
             skip_block_list = new_skip_block_list
 
+        # Prepare skip layer masks
+        skip_layer_masks: Optional[List[torch.Tensor]] = None
+        if do_spatio_temporal_guidance:
+            if skip_block_list is not None:
+                skip_layer_masks = [
+                    self.transformer.create_skip_layer_mask(
+                        batch_size, num_conds, num_conds - 1, skip_blocks
+                    )
+                    for skip_blocks in skip_block_list
+                ]
+
         if enhance_prompt:
             self.prompt_enhancer_image_caption_model = (
                 self.prompt_enhancer_image_caption_model.to(self._execution_device)
@@ -1028,7 +1055,7 @@
             negative_prompt_attention_mask,
         ) = self.encode_prompt(
             prompt,
-            True,
+            do_classifier_free_guidance,
             negative_prompt=negative_prompt,
             num_images_per_prompt=num_images_per_prompt,
             device=device,
@@ -1046,28 +1073,23 @@
 
         prompt_embeds_batch = prompt_embeds
         prompt_attention_mask_batch = prompt_attention_mask
-        negative_prompt_embeds = (
-            torch.zeros_like(prompt_embeds)
-            if negative_prompt_embeds is None
-            else negative_prompt_embeds
-        )
-        negative_prompt_attention_mask = (
-            torch.zeros_like(prompt_attention_mask)
-            if negative_prompt_attention_mask is None
-            else negative_prompt_attention_mask
-        )
+        if do_classifier_free_guidance:
+            prompt_embeds_batch = torch.cat(
+                [negative_prompt_embeds, prompt_embeds], dim=0
+            )
+            prompt_attention_mask_batch = torch.cat(
+                [negative_prompt_attention_mask, prompt_attention_mask], dim=0
+            )
+        if do_spatio_temporal_guidance:
+            prompt_embeds_batch = torch.cat([prompt_embeds_batch, prompt_embeds], dim=0)
+            prompt_attention_mask_batch = torch.cat(
+                [
+                    prompt_attention_mask_batch,
+                    prompt_attention_mask,
+                ],
+                dim=0,
+            )
 
-        prompt_embeds_batch = torch.cat(
-            [negative_prompt_embeds, prompt_embeds, prompt_embeds], dim=0
-        )
-        prompt_attention_mask_batch = torch.cat(
-            [
-                negative_prompt_attention_mask,
-                prompt_attention_mask,
-                prompt_attention_mask,
-            ],
-            dim=0,
-        )
         # 4. Prepare the initial latents using the provided media and conditioning items
 
         # Prepare the initial latents tensor, shape = (b, c, f, h, w)
@@ -1076,7 +1098,7 @@
             media_items=media_items,
             timestep=timesteps[0],
             latent_shape=latent_shape,
-            dtype=prompt_embeds.dtype,
+            dtype=prompt_embeds_batch.dtype,
             device=device,
             generator=generator,
             vae_per_channel_normalize=vae_per_channel_normalize,
@@ -1096,6 +1118,14 @@
         )
         init_latents = latents.clone()  # Used for image_cond_noise_update
 
+        pixel_coords = torch.cat([pixel_coords] * num_conds)
+        orig_conditioning_mask = conditioning_mask
+        if conditioning_mask is not None and is_video:
+            assert num_images_per_prompt == 1
+            conditioning_mask = torch.cat([conditioning_mask] * num_conds)
+        fractional_coords = pixel_coords.to(torch.float32)
+        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
@@ -1104,50 +1134,8 @@
             len(timesteps) - num_inference_steps * self.scheduler.order, 0
         )
 
-        orig_conditioning_mask = conditioning_mask
-
-        # Befor compiling this code please be aware:
-        # This code might generate different input shapes if some timesteps have no STG or CFG.
-        # This means that the codes might need to be compiled mutliple times.
-        # To avoid that, use the same STG and CFG values for all timesteps.
-
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
-                do_classifier_free_guidance = guidance_scale[i] > 1.0
-                do_spatio_temporal_guidance = stg_scale[i] > 0
-                do_rescaling = rescaling_scale[i] != 1.0
-
-                num_conds = 1
-                if do_classifier_free_guidance:
-                    num_conds += 1
-                if do_spatio_temporal_guidance:
-                    num_conds += 1
-
-                if do_classifier_free_guidance and do_spatio_temporal_guidance:
-                    indices = slice(batch_size * 0, batch_size * 3)
-                elif do_classifier_free_guidance:
-                    indices = slice(batch_size * 0, batch_size * 2)
-                elif do_spatio_temporal_guidance:
-                    indices = slice(batch_size * 1, batch_size * 3)
-                else:
-                    indices = slice(batch_size * 1, batch_size * 2)
-
-                # Prepare skip layer masks
-                skip_layer_mask: Optional[torch.Tensor] = None
-                if do_spatio_temporal_guidance:
-                    if skip_block_list is not None:
-                        skip_layer_mask = self.transformer.create_skip_layer_mask(
-                            batch_size, num_conds, num_conds - 1, skip_block_list[i]
-                        )
-
-                batch_pixel_coords = torch.cat([pixel_coords] * num_conds)
-                conditioning_mask = orig_conditioning_mask
-                if conditioning_mask is not None and is_video:
-                    assert num_images_per_prompt == 1
-                    conditioning_mask = torch.cat([conditioning_mask] * num_conds)
-                fractional_coords = batch_pixel_coords.to(torch.float32)
-                fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
-
                 if conditioning_mask is not None and image_cond_noise_scale > 0.0:
                     latents = self.add_noise_to_image_conditioning_latents(
                         t,
@@ -1206,12 +1194,16 @@
                 noise_pred = self.transformer(
                     latent_model_input.to(self.transformer.dtype),
                     indices_grid=fractional_coords,
-                    encoder_hidden_states=prompt_embeds_batch[indices].to(
+                    encoder_hidden_states=prompt_embeds_batch.to(
                         self.transformer.dtype
                     ),
-                    encoder_attention_mask=prompt_attention_mask_batch[indices],
+                    encoder_attention_mask=prompt_attention_mask_batch,
                     timestep=current_timestep,
-                    skip_layer_mask=skip_layer_mask,
+                    skip_layer_mask=(
+                        skip_layer_masks[i]
+                        if skip_layer_masks is not None
+                        else None
+                    ),
                     skip_layer_strategy=skip_layer_strategy,
                     return_dict=False,
                 )[0]
@@ -1323,7 +1315,6 @@
                 )
             else:
                 decode_timestep = None
-            latents = self.tone_map_latents(latents, tone_map_compression_ratio)
             image = vae_decode(
                 latents,
                 self.vae,
@@ -1745,47 +1736,6 @@
         num_frames = (num_frames - 1) // scale_factor * scale_factor + 1
         return num_frames
 
-    @staticmethod
-    def tone_map_latents(
-        latents: torch.Tensor,
-        compression: float,
-    ) -> torch.Tensor:
-        """
-        Applies a non-linear tone-mapping function to latent values to reduce their dynamic range
-        in a perceptually smooth way using a sigmoid-based compression.
-
-        This is useful for regularizing high-variance latents or for conditioning outputs
-        during generation, especially when controlling dynamic behavior with a `compression` factor.
-
-        Parameters:
-        ----------
-        latents : torch.Tensor
-            Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
-        compression : float
-            Compression strength in the range [0, 1].
-            - 0.0: No tone-mapping (identity transform)
-            - 1.0: Full compression effect
-
-        Returns:
-        -------
-        torch.Tensor
-            The tone-mapped latent tensor of the same shape as input.
-        """
-        if not (0 <= compression <= 1):
-            raise ValueError("Compression must be in the range [0, 1]")
-
-        # Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
-        scale_factor = compression * 0.75
-        abs_latents = torch.abs(latents)
-
-        # Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0
-        # When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect
-        sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
-        scales = 1.0 - 0.8 * scale_factor * sigmoid_term
-
-        filtered = latents * scales
-        return filtered
-
 
 def adain_filter_latent(
     latents: torch.Tensor, reference_latents: torch.Tensor, factor=1.0
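The new code batches the prompt as `[negative, positive]` for CFG and appends one more positive copy when STG is active, so a single transformer call yields `num_conds` predictions per sample. A schematic, standalone sketch of how such a batch is typically split and recombined follows; the exact combination used later in the pipeline is not shown in this hunk, so the formula below is illustrative only:

```python
# Schematic sketch only: stand-in tensors illustrate the [negative, positive,
# positive-for-STG] batching introduced above and a standard CFG + STG
# recombination. Shapes and scales are arbitrary.
import torch

batch_size, num_conds = 1, 3          # CFG and STG both active
guidance_scale, stg_scale = 3.0, 1.0

# One batched forward pass returns num_conds predictions per sample.
noise_pred = torch.randn(batch_size * num_conds, 128, 32)
pred_neg, pred_pos, pred_perturbed = noise_pred.chunk(num_conds, dim=0)

combined = pred_neg + guidance_scale * (pred_pos - pred_neg)      # classifier-free guidance
combined = combined + stg_scale * (pred_pos - pred_perturbed)     # spatio-temporal guidance
print(combined.shape)  # torch.Size([1, 128, 32])
```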
ltx_video/schedulers/rf.py CHANGED
@@ -314,7 +314,7 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
         """
         Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
         process from the learned model outputs (most often the predicted noise).
-        z_{t_1} = z_t - Delta_t * v
+        z_{t_1} = z_t - \Delta_t * v
         The method finds the next timestep that is lower than the input timestep(s) and denoises the latents
         to that level. The input timestep(s) are not required to be one of the predefined timesteps.
 
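The corrected docstring line is the plain Euler update of rectified flow: the latent moves from `z_t` to the next, lower timestep by subtracting the predicted velocity scaled by the timestep gap. A tiny numeric illustration with made-up values:

```python
# Numeric illustration of z_{t-1} = z_t - Δt · v; all values are made up.
import torch

z_t = torch.tensor([0.5, -0.2, 1.0])   # current latent
v = torch.tensor([1.0, 0.5, -1.0])     # predicted velocity at timestep t
t, t_prev = 0.8, 0.6                   # normalized timesteps, t_prev < t

delta_t = t - t_prev
z_prev = z_t - delta_t * v
print(z_prev)  # approximately tensor([0.3000, -0.3000, 1.2000])
```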