Spaces: Running on Zero
File size: 4,277 Bytes
import gradio as gr
import torch
import spaces
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video
from PIL import Image, ImageOps
from gtts import gTTS
from pydub import AudioSegment
import whisper
import ffmpeg
import requests
from io import BytesIO
import os
import gc
# Load LTX models
ltx_model_id = "Lightricks/LTX-Video-0.9.7-distilled"
upscaler_model_id = "Lightricks/ltxv-spatial-upscaler-0.9.7"
pipe = LTXConditionPipeline.from_pretrained(ltx_model_id, torch_dtype=torch.float16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    upscaler_model_id, vae=pipe.vae, torch_dtype=torch.float16
)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()
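# Optional: if GPU memory is tight, diffusers pipelines also support CPU offload
# (e.g. pipe.enable_model_cpu_offload()) as an alternative to keeping both pipelines
# resident on "cuda"; a suggestion only, not part of this app's flow.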
def prepare_image_condition(image, size=(512, 512), background=(0, 0, 0)):
    # Fit the image inside `size` without cropping, then letterbox it onto a solid canvas
    image = ImageOps.contain(image, size)
    canvas = Image.new("RGB", size, background)
    offset = ((size[0] - image.width) // 2, (size[1] - image.height) // 2)
    canvas.paste(image, offset)
    return canvas
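# Example of the letterboxing above: a 1024x768 input becomes 512x384 via ImageOps.contain,
# then sits at offset (0, 64), centered on a 512x512 black canvas.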
def round_to_nearest_resolution(height, width, ratio):
    # Round both dimensions down to the nearest multiple of the VAE compression ratio
    return height - (height % ratio), width - (width % ratio)
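# Example: with a VAE spatial compression ratio of 32 (the value reported for the LTX VAE),
# int(512 * 2 / 3) = 341 rounds down to 320.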
@spaces.GPU(duration=180)
def generate_video(prompt, image_url):
    generator = torch.Generator("cuda").manual_seed(42)

    # Aspect-ratio preserving image prep
    image = None
    if image_url:
        raw_image = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")
        image = prepare_image_condition(raw_image)
    # Dimensions
    base_width, base_height = 512, 512
    downscale = 2 / 3

    # Round the downscaled size so it stays divisible by the VAE spatial compression ratio
    h_d, w_d = round_to_nearest_resolution(
        int(base_height * downscale),
        int(base_width * downscale),
        ratio=pipe.vae_spatial_compression_ratio
    )

    # The latent upsampler doubles the spatial size, so the decode pass must use
    # 2x the rounded downscaled dimensions to match the upscaled latents
    w_up, h_up = w_d * 2, h_d * 2
    # Step 1: Generate latents
    latents = pipe(
        prompt=prompt,
        image=image,
        width=w_d,
        height=h_d,
        num_frames=60,
        num_inference_steps=7,
        output_type="latent",
        guidance_scale=1.0,
        decode_timestep=0.05,
        decode_noise_scale=0.025,
        generator=generator
    ).frames
    torch.cuda.empty_cache()
    gc.collect()

    # Step 2: Upscale
    upscaled = pipe_upsample(latents=latents, output_type="latent").frames
    torch.cuda.empty_cache()
    gc.collect()
    # Step 3: Decode to frames (dimensions must match the upscaled latents)
    frames = pipe(
        prompt=prompt,
        image=image,
        latents=upscaled,
        width=w_up,
        height=h_up,
        num_frames=60,
        num_inference_steps=10,
        output_type="pil",
        guidance_scale=1.0,
        decode_timestep=0.05,
        decode_noise_scale=0.025,
        image_cond_noise_scale=0.025,
        denoise_strength=0.3,
        generator=generator
    ).frames[0]
    # Step 4: Export video
    video_path = "output.mp4"
    export_to_video(frames, video_path, fps=24)

    # Step 5: TTS
    tts = gTTS(text=prompt, lang='en')
    tts.save("voice.mp3")
    AudioSegment.from_mp3("voice.mp3").export("voice.wav", format="wav")
    # Step 6: Subtitles (CPU) - Whisper returns segments, not ready-made SRT text,
    # so build the SRT file from the segment timestamps
    model = whisper.load_model("base", device="cpu")
    result = model.transcribe("voice.wav", task="transcribe", language="en")
    def fmt_ts(t):
        m, s = divmod(t, 60)
        return f"{int(m // 60):02}:{int(m % 60):02}:{int(s):02},{int((s % 1) * 1000):03}"
    with open("subtitles.srt", "w", encoding="utf-8") as f:
        for i, seg in enumerate(result["segments"], start=1):
            f.write(f"{i}\n{fmt_ts(seg['start'])} --> {fmt_ts(seg['end'])}\n{seg['text'].strip()}\n\n")
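    # Each cue written above uses the standard SRT layout, e.g. (illustrative values only):
    #   1
    #   00:00:00,000 --> 00:00:02,400
    #   A cat walks across a sunny rooftop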
    # Step 7: Merge video + audio and burn in subtitles
    # (re-encode; stream copy cannot apply a subtitles filter)
    final_output = "final_with_audio.mp4"
    video_in = ffmpeg.input(video_path)
    audio_in = ffmpeg.input("voice.mp3")
    ffmpeg.output(
        video_in.video,
        audio_in.audio,
        final_output,
        vf="subtitles=subtitles.srt",
        vcodec="libx264",
        acodec="aac",
        shortest=None,
        loglevel="error"
    ).run(overwrite_output=True)
    return final_output
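# Roughly the CLI equivalent of the merge step above (assuming the same filenames):
#   ffmpeg -i output.mp4 -i voice.mp3 -vf subtitles=subtitles.srt -c:v libx264 -c:a aac -shortest final_with_audio.mp4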
# Gradio UI
demo = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Describe your scene..."),
        gr.Textbox(label="Optional Image URL (e.g. Pexels)", placeholder="https://...")
    ],
    outputs=gr.Video(label="Generated Video"),
    title="🎬 LTX AI Video Generator",
    description="AI-powered video with voiceover and subtitles. Runs on the ZeroGPU (PyTorch) runtime."
)

demo.launch()
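# Dependencies implied by the imports above (a rough requirements.txt sketch; pin versions as needed):
#   gradio, spaces, torch, diffusers, Pillow, gTTS, pydub, openai-whisper, ffmpeg-python, requests
# pydub and ffmpeg-python also expect the ffmpeg binary on the system
# (on Spaces, typically added via packages.txt).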