Spaces: ltx (Running on Zero)
#14 by dangthr - opened
- README.md +1 -1
- app.py +11 -46
- configs/ltxv-13b-0.9.8-dev-fp8.yaml +0 -34
- configs/ltxv-13b-0.9.8-dev.yaml +0 -34
- configs/ltxv-13b-0.9.8-distilled-fp8.yaml +0 -29
- configs/ltxv-13b-0.9.8-distilled.yaml +0 -29
- configs/ltxv-2b-0.9.8-distilled-fp8.yaml +0 -28
- configs/ltxv-2b-0.9.8-distilled.yaml +0 -28
- ltx_video/models/autoencoders/causal_video_autoencoder.py +7 -2
- ltx_video/models/transformers/attention.py +1 -0
- ltx_video/models/transformers/transformer3d.py +1 -1
- ltx_video/pipelines/pipeline_ltx_video.py +63 -113
- ltx_video/schedulers/rf.py +1 -1
README.md CHANGED
@@ -7,7 +7,7 @@ sdk: gradio
 sdk_version: 5.29.1
 app_file: app.py
 pinned: false
-short_description: ultra-fast video model, LTX 0.9.
+short_description: ultra-fast video model, LTX 0.9.7 13B distilled
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -24,7 +24,7 @@ from inference import (
 from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline, LTXVideoPipeline
 from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 
-config_file_path = "configs/ltxv-13b-0.9.
+config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
 
@@ -141,47 +141,12 @@ def get_duration(prompt, negative_prompt, input_image_filepath, input_video_file
     return 60
 
 @spaces.GPU(duration=get_duration)
-def generate(prompt, negative_prompt, input_image_filepath
-             height_ui
-             duration_ui
-             ui_frames_to_use
-             seed_ui
+def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
+             height_ui, width_ui, mode,
+             duration_ui,
+             ui_frames_to_use,
+             seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
              progress=gr.Progress(track_tqdm=True)):
-    """
-    Generate high-quality videos using LTX Video model with support for text-to-video, image-to-video, and video-to-video modes.
-
-    Args:
-        prompt (str): Text description of the desired video content. Required for all modes.
-        negative_prompt (str): Text describing what to avoid in the generated video. Optional, can be empty string.
-        input_image_filepath (str or None): Path to input image file. Required for image-to-video mode, None for other modes.
-        input_video_filepath (str or None): Path to input video file. Required for video-to-video mode, None for other modes.
-        height_ui (int): Height of the output video in pixels, must be divisible by 32. Default: 512.
-        width_ui (int): Width of the output video in pixels, must be divisible by 32. Default: 704.
-        mode (str): Generation mode. Required. One of "text-to-video", "image-to-video", or "video-to-video". Default: "text-to-video".
-        duration_ui (float): Duration of the output video in seconds. Range: 0.3 to 8.5. Default: 2.0.
-        ui_frames_to_use (int): Number of frames to use from input video. Only used in video-to-video mode. Must be N*8+1. Default: 9.
-        seed_ui (int): Random seed for reproducible generation. Range: 0 to 2^32-1. Default: 42.
-        randomize_seed (bool): Whether to use a random seed instead of seed_ui. Default: True.
-        ui_guidance_scale (float): CFG scale controlling prompt influence. Range: 1.0 to 10.0. Higher values = stronger prompt influence. Default: 3.0.
-        improve_texture_flag (bool): Whether to use multi-scale generation for better texture quality. Slower but higher quality. Default: True.
-        progress (gr.Progress): Progress tracker for the generation process. Optional, used for UI updates.
-
-    Returns:
-        tuple: A tuple containing (output_video_path, used_seed) where output_video_path is the path to the generated video file and used_seed is the actual seed used for generation.
-    """
-
-    # Validate mode-specific required parameters
-    if mode == "image-to-video":
-        if not input_image_filepath:
-            raise gr.Error("input_image_filepath is required for image-to-video mode")
-    elif mode == "video-to-video":
-        if not input_video_filepath:
-            raise gr.Error("input_video_filepath is required for video-to-video mode")
-    elif mode == "text-to-video":
-        # No additional file inputs required for text-to-video
-        pass
-    else:
-        raise gr.Error(f"Invalid mode: {mode}. Must be one of: text-to-video, image-to-video, video-to-video")
 
     if randomize_seed:
         seed_ui = random.randint(0, 2**32 - 1)
@@ -374,9 +339,9 @@ css="""
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# LTX Video 0.9.
-    gr.Markdown("Fast high quality video generation
-
+    gr.Markdown("# LTX Video 0.9.7 Distilled")
+    gr.Markdown("Fast high quality video generation. [Model](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.7-distilled.safetensors) [GitHub](https://github.com/Lightricks/LTX-Video) [Diffusers](#)")
+
     with gr.Row():
         with gr.Column():
             with gr.Tab("image-to-video") as image_tab:
@@ -404,7 +369,7 @@ with gr.Blocks(css=css) as demo:
                     step=0.1,
                    info=f"Target video duration (0.3s to 8.5s)"
                 )
-                improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True,
+                improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True, info="Uses a two-pass generation for better quality, but is slower. Recommended for final output.")
 
         with gr.Column():
             output_video = gr.Video(label="Generated Video", interactive=False)
@@ -416,7 +381,7 @@ with gr.Blocks(css=css) as demo:
             with gr.Row():
                 seed_input = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1)
                 randomize_seed_input = gr.Checkbox(label="Randomize Seed", value=True)
-            with gr.Row(
+            with gr.Row():
                 guidance_scale_input = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0, value=PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0), step=0.1, info="Controls how much the prompt influences the output. Higher values = stronger influence.")
             with gr.Row():
                 height_input = gr.Slider(label="Height", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE, info="Must be divisible by 32.")
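For context, a minimal standalone sketch of how the lines in this diff fit together: the Space loads the pipeline YAML at startup and the "Guidance Scale (CFG)" slider default is read from its first_pass section, falling back to 1.0 when the key is absent (as distilled configs typically fix or omit it). This is illustrative only, not the full app.py.

import yaml

# Path restored by this PR (see the first hunk above).
config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
with open(config_file_path, "r") as file:
    PIPELINE_CONFIG_YAML = yaml.safe_load(file)

# Same expression the slider uses in the diff above.
default_cfg = PIPELINE_CONFIG_YAML.get("first_pass", {}).get("guidance_scale", 1.0)
print(f"CFG slider default: {default_cfg}")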
configs/ltxv-13b-0.9.8-dev-fp8.yaml DELETED
@@ -1,34 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-dev-fp8.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
-  stg_scale: [0, 0, 4, 4, 4, 2, 1]
-  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
-  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
-  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
-  num_inference_steps: 30
-  skip_final_inference_steps: 3
-  cfg_star_rescale: true
-
-second_pass:
-  guidance_scale: [1]
-  stg_scale: [1]
-  rescaling_scale: [1]
-  guidance_timesteps: [1.0]
-  skip_block_list: [27]
-  num_inference_steps: 30
-  skip_initial_inference_steps: 17
-  cfg_star_rescale: true
configs/ltxv-13b-0.9.8-dev.yaml DELETED
@@ -1,34 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-dev.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "bfloat16"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  guidance_scale: [1, 1, 6, 8, 6, 1, 1]
-  stg_scale: [0, 0, 4, 4, 4, 2, 1]
-  rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
-  guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
-  skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
-  num_inference_steps: 30
-  skip_final_inference_steps: 3
-  cfg_star_rescale: true
-
-second_pass:
-  guidance_scale: [1]
-  stg_scale: [1]
-  rescaling_scale: [1]
-  guidance_timesteps: [1.0]
-  skip_block_list: [27]
-  num_inference_steps: 30
-  skip_initial_inference_steps: 17
-  cfg_star_rescale: true
configs/ltxv-13b-0.9.8-distilled-fp8.yaml DELETED
@@ -1,29 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-distilled-fp8.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-  tone_map_compression_ratio: 0.6
configs/ltxv-13b-0.9.8-distilled.yaml DELETED
@@ -1,29 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-13b-0.9.8-distilled.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "bfloat16"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-  tone_map_compression_ratio: 0.6
configs/ltxv-2b-0.9.8-distilled-fp8.yaml DELETED
@@ -1,28 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-2b-0.9.8-distilled-fp8.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
configs/ltxv-2b-0.9.8-distilled.yaml DELETED
@@ -1,28 +0,0 @@
-pipeline_type: multi-scale
-checkpoint_path: "ltxv-2b-0.9.8-distilled.safetensors"
-downscale_factor: 0.6666666
-spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors"
-stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
-decode_timestep: 0.05
-decode_noise_scale: 0.025
-text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
-precision: "bfloat16"
-sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
-prompt_enhancement_words_threshold: 120
-prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
-prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
-stochastic_sampling: false
-
-first_pass:
-  timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
-
-second_pass:
-  timesteps: [0.9094, 0.7250, 0.4219]
-  guidance_scale: 1
-  stg_scale: 0
-  rescaling_scale: 1
-  skip_block_list: [42]
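For reference, a minimal sketch of how a multi-scale config like the deleted files above is typically consumed: shared keys at the top level, plus per-pass overrides under first_pass and second_pass. The helper name and return structure here are illustrative assumptions, not the repository's inference code.

import yaml

def load_multiscale_config(path):
    # Hypothetical helper for illustration only; the real loading code lives in
    # app.py / inference.py and may differ.
    with open(path, "r") as f:
        cfg = yaml.safe_load(f)
    shared = {k: v for k, v in cfg.items() if k not in ("first_pass", "second_pass")}
    # Per-pass overrides: the dev configs above carry guidance_scale/stg_scale schedules,
    # while the distilled configs carry fixed timesteps lists.
    return shared, cfg.get("first_pass", {}), cfg.get("second_pass", {})

shared, first_pass, second_pass = load_multiscale_config("configs/ltxv-13b-0.9.7-distilled.yaml")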
ltx_video/models/autoencoders/causal_video_autoencoder.py CHANGED
@@ -235,7 +235,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
                 "compress_time",
                 "compress_all",
                 "compress_all_res",
-                "
+                "compress_space_res",
             ]
         ]
     )
@@ -608,7 +608,7 @@ class Decoder(nn.Module):
             block_params = block_params if isinstance(block_params, dict) else {}
             if block_name == "res_x_y":
                 output_channel = output_channel * block_params.get("multiplier", 2)
-            if block_name
+            if block_name == "compress_all":
                 output_channel = output_channel * block_params.get("multiplier", 1)
 
         self.conv_in = make_conv_nd(
@@ -1303,15 +1303,20 @@ def create_video_autoencoder_demo_config(
     encoder_blocks = [
         ("res_x", {"num_layers": 2}),
         ("compress_space_res", {"multiplier": 2}),
+        ("res_x", {"num_layers": 2}),
         ("compress_time_res", {"multiplier": 2}),
+        ("res_x", {"num_layers": 1}),
         ("compress_all_res", {"multiplier": 2}),
+        ("res_x", {"num_layers": 1}),
         ("compress_all_res", {"multiplier": 2}),
         ("res_x", {"num_layers": 1}),
     ]
     decoder_blocks = [
         ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
+        ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
+        ("res_x", {"num_layers": 2, "inject_noise": False}),
         ("compress_all", {"residual": True, "multiplier": 2}),
         ("res_x", {"num_layers": 2, "inject_noise": False}),
     ]
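As a side note on the `compress_all` line restored above, here is a small self-contained sketch of the channel bookkeeping it performs while walking the decoder block definitions. The block list mirrors the demo config in this diff; the base width of 128 is an assumption for illustration, not a value from the repository.

# Standalone sketch, not the repository's Decoder code.
decoder_blocks = [
    ("res_x", {"num_layers": 2, "inject_noise": False}),
    ("compress_all", {"residual": True, "multiplier": 2}),
    ("compress_all", {"residual": True, "multiplier": 2}),
]

output_channel = 128  # assumed base width
for block_name, block_params in decoder_blocks:
    block_params = block_params if isinstance(block_params, dict) else {}
    if block_name == "res_x_y":
        output_channel = output_channel * block_params.get("multiplier", 2)
    if block_name == "compress_all":
        # The condition restored by this diff: compress_all blocks scale the channel count.
        output_channel = output_channel * block_params.get("multiplier", 1)

print(output_channel)  # 512: two compress_all blocks with multiplier 2 each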
ltx_video/models/transformers/attention.py CHANGED
@@ -205,6 +205,7 @@ class BasicTransformerBlock(nn.Module):
         timestep: Optional[torch.LongTensor] = None,
         cross_attention_kwargs: Dict[str, Any] = None,
         class_labels: Optional[torch.LongTensor] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
         skip_layer_mask: Optional[torch.Tensor] = None,
         skip_layer_strategy: Optional[SkipLayerStrategy] = None,
     ) -> torch.FloatTensor:
ltx_video/models/transformers/transformer3d.py CHANGED
@@ -268,7 +268,7 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             for key, value in state_dict.items()
             if key.startswith("model.diffusion_model.")
         }
-        super().load_state_dict(state_dict,
+        super().load_state_dict(state_dict, **kwargs)
 
     @classmethod
     def from_pretrained(
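The restored `**kwargs` forwarding matters because callers can pass torch's load_state_dict options (such as strict=False) through the wrapper. A minimal illustration with a toy module, not the Transformer3DModel itself; the class and prefix handling here are stand-ins for the pattern only.

import torch.nn as nn

class PrefixStrippingLinear(nn.Linear):
    # Toy stand-in: rewrite keys, then forward remaining options (e.g. strict=False) to the parent.
    def load_state_dict(self, state_dict, **kwargs):
        state_dict = {
            k[len("model.diffusion_model."):] if k.startswith("model.diffusion_model.") else k: v
            for k, v in state_dict.items()
        }
        return super().load_state_dict(state_dict, **kwargs)

m = PrefixStrippingLinear(4, 4)
# strict=False tolerates the missing "bias" entry; without **kwargs it could not be passed through.
m.load_state_dict({"model.diffusion_model.weight": m.weight.detach().clone()}, strict=False)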
ltx_video/pipelines/pipeline_ltx_video.py CHANGED
@@ -45,6 +45,11 @@ from ltx_video.models.autoencoders.vae_encode import (
 )
 
 
+try:
+    import torch_xla.distributed.spmd as xs
+except ImportError:
+    xs = None
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -790,7 +795,6 @@ class LTXVideoPipeline(DiffusionPipeline):
         text_encoder_max_tokens: int = 256,
         stochastic_sampling: bool = False,
         media_items: Optional[torch.Tensor] = None,
-        tone_map_compression_ratio: float = 0.0,
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         """
@@ -872,8 +876,6 @@ class LTXVideoPipeline(DiffusionPipeline):
                 If set to `True`, the sampling is stochastic. If set to `False`, the sampling is deterministic.
             media_items ('torch.Tensor', *optional*):
                 The input media item used for image-to-image / video-to-video.
-            tone_map_compression_ratio: compression ratio for tone mapping, defaults to 0.0.
-                If set to 0.0, no tone mapping is applied. If set to 1.0 - full compression is applied.
         Examples:
 
         Returns:
@@ -976,6 +978,10 @@ class LTXVideoPipeline(DiffusionPipeline):
                 guidance_scale[guidance_mapping[i]] for i in range(len(timesteps))
             ]
 
+        # For simplicity, we are using a constant num_conds for all timesteps, so we need to zero
+        # out cases where the guidance scale should not be applied.
+        guidance_scale = [x if x > 1.0 else 0.0 for x in guidance_scale]
+
         if not isinstance(stg_scale, List):
             stg_scale = [stg_scale] * len(timesteps)
         else:
@@ -988,6 +994,16 @@ class LTXVideoPipeline(DiffusionPipeline):
                 rescaling_scale[guidance_mapping[i]] for i in range(len(timesteps))
             ]
 
+        do_classifier_free_guidance = any(x > 1.0 for x in guidance_scale)
+        do_spatio_temporal_guidance = any(x > 0.0 for x in stg_scale)
+        do_rescaling = any(x != 1.0 for x in rescaling_scale)
+
+        num_conds = 1
+        if do_classifier_free_guidance:
+            num_conds += 1
+        if do_spatio_temporal_guidance:
+            num_conds += 1
+
         # Normalize skip_block_list to always be None or a list of lists matching timesteps
         if skip_block_list is not None:
             # Convert single list to list of lists if needed
@@ -999,6 +1015,17 @@ class LTXVideoPipeline(DiffusionPipeline):
                 new_skip_block_list.append(skip_block_list[guidance_mapping[i]])
             skip_block_list = new_skip_block_list
 
+        # Prepare skip layer masks
+        skip_layer_masks: Optional[List[torch.Tensor]] = None
+        if do_spatio_temporal_guidance:
+            if skip_block_list is not None:
+                skip_layer_masks = [
+                    self.transformer.create_skip_layer_mask(
+                        batch_size, num_conds, num_conds - 1, skip_blocks
+                    )
+                    for skip_blocks in skip_block_list
+                ]
+
         if enhance_prompt:
             self.prompt_enhancer_image_caption_model = (
                 self.prompt_enhancer_image_caption_model.to(self._execution_device)
@@ -1028,7 +1055,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             negative_prompt_attention_mask,
         ) = self.encode_prompt(
             prompt,
-
+            do_classifier_free_guidance,
             negative_prompt=negative_prompt,
             num_images_per_prompt=num_images_per_prompt,
             device=device,
@@ -1046,28 +1073,23 @@ class LTXVideoPipeline(DiffusionPipeline):
 
         prompt_embeds_batch = prompt_embeds
         prompt_attention_mask_batch = prompt_attention_mask
-
-        torch.
-
-
-
-
-
-
-
-
+        if do_classifier_free_guidance:
+            prompt_embeds_batch = torch.cat(
+                [negative_prompt_embeds, prompt_embeds], dim=0
+            )
+            prompt_attention_mask_batch = torch.cat(
+                [negative_prompt_attention_mask, prompt_attention_mask], dim=0
+            )
+        if do_spatio_temporal_guidance:
+            prompt_embeds_batch = torch.cat([prompt_embeds_batch, prompt_embeds], dim=0)
+            prompt_attention_mask_batch = torch.cat(
+                [
+                    prompt_attention_mask_batch,
+                    prompt_attention_mask,
+                ],
+                dim=0,
+            )
 
-            prompt_embeds_batch = torch.cat(
-                [negative_prompt_embeds, prompt_embeds, prompt_embeds], dim=0
-            )
-            prompt_attention_mask_batch = torch.cat(
-                [
-                    negative_prompt_attention_mask,
-                    prompt_attention_mask,
-                    prompt_attention_mask,
-                ],
-                dim=0,
-            )
         # 4. Prepare the initial latents using the provided media and conditioning items
 
         # Prepare the initial latents tensor, shape = (b, c, f, h, w)
@@ -1076,7 +1098,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             media_items=media_items,
             timestep=timesteps[0],
             latent_shape=latent_shape,
-            dtype=
+            dtype=prompt_embeds_batch.dtype,
             device=device,
             generator=generator,
             vae_per_channel_normalize=vae_per_channel_normalize,
@@ -1096,6 +1118,14 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
         init_latents = latents.clone()  # Used for image_cond_noise_update
 
+        pixel_coords = torch.cat([pixel_coords] * num_conds)
+        orig_conditioning_mask = conditioning_mask
+        if conditioning_mask is not None and is_video:
+            assert num_images_per_prompt == 1
+            conditioning_mask = torch.cat([conditioning_mask] * num_conds)
+        fractional_coords = pixel_coords.to(torch.float32)
+        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
@@ -1104,50 +1134,8 @@ class LTXVideoPipeline(DiffusionPipeline):
             len(timesteps) - num_inference_steps * self.scheduler.order, 0
         )
 
-        orig_conditioning_mask = conditioning_mask
-
-        # Befor compiling this code please be aware:
-        # This code might generate different input shapes if some timesteps have no STG or CFG.
-        # This means that the codes might need to be compiled mutliple times.
-        # To avoid that, use the same STG and CFG values for all timesteps.
-
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
-                do_classifier_free_guidance = guidance_scale[i] > 1.0
-                do_spatio_temporal_guidance = stg_scale[i] > 0
-                do_rescaling = rescaling_scale[i] != 1.0
-
-                num_conds = 1
-                if do_classifier_free_guidance:
-                    num_conds += 1
-                if do_spatio_temporal_guidance:
-                    num_conds += 1
-
-                if do_classifier_free_guidance and do_spatio_temporal_guidance:
-                    indices = slice(batch_size * 0, batch_size * 3)
-                elif do_classifier_free_guidance:
-                    indices = slice(batch_size * 0, batch_size * 2)
-                elif do_spatio_temporal_guidance:
-                    indices = slice(batch_size * 1, batch_size * 3)
-                else:
-                    indices = slice(batch_size * 1, batch_size * 2)
-
-                # Prepare skip layer masks
-                skip_layer_mask: Optional[torch.Tensor] = None
-                if do_spatio_temporal_guidance:
-                    if skip_block_list is not None:
-                        skip_layer_mask = self.transformer.create_skip_layer_mask(
-                            batch_size, num_conds, num_conds - 1, skip_block_list[i]
-                        )
-
-                batch_pixel_coords = torch.cat([pixel_coords] * num_conds)
-                conditioning_mask = orig_conditioning_mask
-                if conditioning_mask is not None and is_video:
-                    assert num_images_per_prompt == 1
-                    conditioning_mask = torch.cat([conditioning_mask] * num_conds)
-                fractional_coords = batch_pixel_coords.to(torch.float32)
-                fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
-
                 if conditioning_mask is not None and image_cond_noise_scale > 0.0:
                     latents = self.add_noise_to_image_conditioning_latents(
                         t,
@@ -1206,12 +1194,16 @@ class LTXVideoPipeline(DiffusionPipeline):
                 noise_pred = self.transformer(
                     latent_model_input.to(self.transformer.dtype),
                     indices_grid=fractional_coords,
-                    encoder_hidden_states=prompt_embeds_batch
+                    encoder_hidden_states=prompt_embeds_batch.to(
                         self.transformer.dtype
                     ),
-                    encoder_attention_mask=prompt_attention_mask_batch
+                    encoder_attention_mask=prompt_attention_mask_batch,
                     timestep=current_timestep,
-                    skip_layer_mask=
+                    skip_layer_mask=(
+                        skip_layer_masks[i]
+                        if skip_layer_masks is not None
+                        else None
+                    ),
                     skip_layer_strategy=skip_layer_strategy,
                     return_dict=False,
                 )[0]
@@ -1323,7 +1315,6 @@ class LTXVideoPipeline(DiffusionPipeline):
             )
         else:
             decode_timestep = None
-        latents = self.tone_map_latents(latents, tone_map_compression_ratio)
         image = vae_decode(
             latents,
             self.vae,
@@ -1745,47 +1736,6 @@ class LTXVideoPipeline(DiffusionPipeline):
         num_frames = (num_frames - 1) // scale_factor * scale_factor + 1
         return num_frames
 
-    @staticmethod
-    def tone_map_latents(
-        latents: torch.Tensor,
-        compression: float,
-    ) -> torch.Tensor:
-        """
-        Applies a non-linear tone-mapping function to latent values to reduce their dynamic range
-        in a perceptually smooth way using a sigmoid-based compression.
-
-        This is useful for regularizing high-variance latents or for conditioning outputs
-        during generation, especially when controlling dynamic behavior with a `compression` factor.
-
-        Parameters:
-        ----------
-        latents : torch.Tensor
-            Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
-        compression : float
-            Compression strength in the range [0, 1].
-            - 0.0: No tone-mapping (identity transform)
-            - 1.0: Full compression effect
-
-        Returns:
-        -------
-        torch.Tensor
-            The tone-mapped latent tensor of the same shape as input.
-        """
-        if not (0 <= compression <= 1):
-            raise ValueError("Compression must be in the range [0, 1]")
-
-        # Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
-        scale_factor = compression * 0.75
-        abs_latents = torch.abs(latents)
-
-        # Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0
-        # When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect
-        sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
-        scales = 1.0 - 0.8 * scale_factor * sigmoid_term
-
-        filtered = latents * scales
-        return filtered
-
 
 def adain_filter_latent(
     latents: torch.Tensor, reference_latents: torch.Tensor, factor=1.0
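To make the batching change above easier to follow, here is a standalone sketch of how the conditioning batch grows with the active guidance branches: the negative prompt first for CFG, then the positive prompt, then the positive prompt again for the STG branch. Shapes are illustrative only, not the pipeline's real dimensions.

import torch

batch_size = 1
prompt_embeds = torch.randn(batch_size, 128, 4096)           # illustrative shape
negative_prompt_embeds = torch.randn(batch_size, 128, 4096)  # illustrative shape

do_classifier_free_guidance = True   # any(x > 1.0 for x in guidance_scale)
do_spatio_temporal_guidance = True   # any(x > 0.0 for x in stg_scale)

num_conds = 1
if do_classifier_free_guidance:
    num_conds += 1
if do_spatio_temporal_guidance:
    num_conds += 1

prompt_embeds_batch = prompt_embeds
if do_classifier_free_guidance:
    prompt_embeds_batch = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
if do_spatio_temporal_guidance:
    prompt_embeds_batch = torch.cat([prompt_embeds_batch, prompt_embeds], dim=0)

# One denoiser call now evaluates all branches at once: num_conds * batch_size rows,
# in the order [negative, positive, positive-for-STG].
assert prompt_embeds_batch.shape[0] == num_conds * batch_size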
ltx_video/schedulers/rf.py CHANGED
@@ -314,7 +314,7 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
         """
         Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
         process from the learned model outputs (most often the predicted noise).
-        z_{t_1} = z_t - Delta_t * v
+        z_{t_1} = z_t - \Delta_t * v
         The method finds the next timestep that is lower than the input timestep(s) and denoises the latents
         to that level. The input timestep(s) are not required to be one of the predefined timesteps.
 
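For completeness, the corrected docstring line describes a rectified-flow Euler step. A tiny illustrative sketch follows; the shape is made up and the two timesteps are borrowed from the distilled schedule shown earlier in this diff.

import torch

z_t = torch.randn(1, 128, 8, 16, 16)  # current latents (illustrative shape)
v = torch.randn_like(z_t)             # model-predicted velocity at timestep t
t, t_prev = 0.9094, 0.7250            # adjacent timesteps from a distilled schedule
delta_t = t - t_prev

# z_{t_1} = z_t - Delta_t * v, as in the corrected docstring
z_prev = z_t - delta_t * v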