import random

import gradio as gr
import numpy as np
import spaces
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_video

pipe = LTXConditionPipeline.from_pretrained(
    "linoyts/LTX-Video-0.9.7-distilled-diffusers", torch_dtype=torch.bfloat16
)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers",
    vae=pipe.vae,
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 2048


def round_to_nearest_resolution_acceptable_by_vae(height, width):
    # Height and width must be multiples of the VAE's spatial compression ratio.
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width


def change_mode_to_text():
    return gr.update(value="text-to-video")


def change_mode_to_image():
    return gr.update(value="image-to-video")


def change_mode_to_video():
    return gr.update(value="video-to-video")


@spaces.GPU
def generate(
    prompt,
    negative_prompt,
    image,
    video,
    mode,
    steps,
    num_frames,
    frames_to_use,
    seed,
    randomize_seed,
    improve_texture=False,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # Part 1. Generate the video at a smaller resolution.
    # Text-only conditioning is also supported without passing `conditions`.
    expected_height, expected_width = 768, 1152  # TODO: make configurable
    downscale_factor = 2 / 3
    downscaled_height = int(expected_height * downscale_factor)
    downscaled_width = int(expected_width * downscale_factor)
    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(
        downscaled_height, downscaled_width
    )

    if mode == "video-to-video" and video is not None:
        video = load_video(video)[:frames_to_use]
        condition = True
    elif mode == "image-to-video" and image is not None:
        video = [image]
        condition = True
    else:
        condition = False

    if condition:
        condition1 = LTXVideoCondition(video=video, frame_index=0)
    else:
        condition1 = None

    latents = pipe(
        conditions=condition1,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=steps,
        decode_timestep=0.05,
        decode_noise_scale=0.025,
        guidance_scale=1.0,
        generator=torch.Generator(device="cuda").manual_seed(seed),
        output_type="latent",
    ).frames

    # Part 2. Upscale the generated video using the latent upsampler.
    # The available latent upsampler upscales height/width by 2x.
    if improve_texture:
        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
        upscaled_latents = pipe_upsample(
            latents=latents,
            output_type="latent",
        ).frames

        # Part 3. Denoise the upscaled video with a few steps to improve texture
        # (optional, but recommended).
        video = pipe(
            conditions=condition1,
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=upscaled_width,
            height=upscaled_height,
            num_frames=num_frames,
            guidance_scale=1.0,
            denoise_strength=0.6,  # re-noise the upscaled latents and run only part of the schedule
            num_inference_steps=3,
            latents=upscaled_latents,
            decode_timestep=0.05,
            image_cond_noise_scale=0.025,
            generator=torch.Generator().manual_seed(seed),
            output_type="pil",
        ).frames[0]
    else:
        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
        # Skip the extra denoising pass and decode the upscaled latents directly.
        video = pipe_upsample(
            latents=latents,
        ).frames[0]

    # Part 4. Downscale the video to the expected resolution.
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    export_to_video(video, "output.mp4", fps=24)
    return "output.mp4"
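# A minimal, hypothetical sketch of calling `generate` directly (outside the Gradio UI),
# e.g. as a local smoke test. It is not part of the app flow and is left commented out so
# it cannot interfere with the Space's startup; the prompt text is illustrative, the other
# values mirror the UI defaults, and a CUDA GPU is assumed.
#
# if __name__ == "__main__":
#     out_path = generate(
#         prompt="a calico cat walking through tall grass at golden hour",
#         negative_prompt="worst quality, inconsistent motion, blurry, jittery, distorted",
#         image=None,
#         video=None,
#         mode="text-to-video",
#         steps=8,
#         num_frames=96,
#         frames_to_use=1,
#         seed=0,
#         randomize_seed=False,
#         improve_texture=True,
#     )
#     print(out_path)  # -> "output.mp4"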
css = """
#col-container {
    margin: 0 auto;
    max-width: 900px;
}
"""

# Forces the dark theme via a URL parameter (currently not passed to gr.Blocks).
js_func = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

with gr.Blocks(css=css, theme=gr.themes.Ocean()) as demo:
    gr.Markdown("# LTX Video 0.9.7 Distilled")
    mode = gr.State(value="text-to-video")

    with gr.Row():
        with gr.Column():
            with gr.Group():
                with gr.Tab("text-to-video") as text_tab:
                    image = gr.Image(label="", visible=False)
                with gr.Tab("image-to-video") as image_tab:
                    image = gr.Image(label="")
                with gr.Tab("video-to-video") as video_tab:
                    video = gr.Video(label="")
                    frames_to_use = gr.Number(
                        label="num frames to use",
                        info="first # of frames to use from the input video",
                        value=1,
                    )
                prompt = gr.Textbox(label="prompt")
                improve_texture = gr.Checkbox(
                    label="improve texture", value=False, info="note: it slows generation"
                )
            run_button = gr.Button()
        with gr.Column():
            output = gr.Video(interactive=False)

    with gr.Accordion("Advanced settings", open=False):
        negative_prompt = gr.Textbox(
            label="negative prompt",
            value="worst quality, inconsistent motion, blurry, jittery, distorted",
            visible=False,
        )
        with gr.Row():
            seed = gr.Number(label="seed", value=0, precision=0)
            randomize_seed = gr.Checkbox(label="randomize seed")
        with gr.Row():
            steps = gr.Slider(label="Steps", minimum=1, maximum=30, value=8, step=1)
            num_frames = gr.Slider(label="# frames", minimum=1, maximum=161, value=96, step=1)
        with gr.Row():
            # Not yet wired into `generate` (see the TODO on expected_height/expected_width).
            height = gr.Slider(label="height", maximum=MAX_IMAGE_SIZE, value=512, step=1)
            width = gr.Slider(label="width", maximum=MAX_IMAGE_SIZE, value=704, step=1)

    text_tab.select(fn=change_mode_to_text, inputs=[], outputs=[mode])
    image_tab.select(fn=change_mode_to_image, inputs=[], outputs=[mode])
    video_tab.select(fn=change_mode_to_video, inputs=[], outputs=[mode])

    run_button.click(
        fn=generate,
        inputs=[
            prompt,
            negative_prompt,
            image,
            video,
            mode,
            steps,
            num_frames,
            frames_to_use,
            seed,
            randomize_seed,
            improve_texture,
        ],
        outputs=[output],
    )

demo.launch()