import gradio as gr
import torch
import spaces
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video
from PIL import Image, ImageOps
from gtts import gTTS
from pydub import AudioSegment
import whisper
import ffmpeg
import requests
from io import BytesIO
import os
import gc
# Load LTX models
ltx_model_id = "Lightricks/LTX-Video-0.9.7-distilled"
upscaler_model_id = "Lightricks/ltxv-spatial-upscaler-0.9.7"
pipe = LTXConditionPipeline.from_pretrained(ltx_model_id, torch_dtype=torch.float16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
upscaler_model_id, vae=pipe.vae, torch_dtype=torch.float16
)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()
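# Tiled VAE decoding processes the video in spatial tiles, which lowers peak VRAM usage during decode.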
def prepare_image_condition(image, size=(480, 480), background=(0, 0, 0)):
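    """Letterbox `image` onto a fixed-size canvas: scale to fit, then center it on a solid background."""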
    image = ImageOps.contain(image, size)
    canvas = Image.new("RGB", size, background)
    offset = ((size[0] - image.width) // 2, (size[1] - image.height) // 2)
    canvas.paste(image, offset)
    return canvas
@spaces.GPU(duration=180)
def generate_video(prompt, image_url):
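    """Generate a short clip from `prompt` (optionally conditioned on an image URL), add a gTTS
    voiceover, and burn in Whisper-generated subtitles. Returns the path to the final MP4."""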
    generator = torch.Generator("cuda").manual_seed(42)

    # Load & prepare image
    image = None
    if image_url:
        raw_image = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")
        image = prepare_image_condition(raw_image)

    # Set target resolutions
    base_width, base_height = 480, 480  # final size (must be divisible by 16)
    down_width, down_height = 320, 320  # for latent generation (must also be divisible by 16)
    # Step 1: Generate latents at lower resolution
    latents = pipe(
        prompt=prompt,
        image=image,
        width=down_width,
        height=down_height,
        num_frames=60,
        num_inference_steps=7,
        output_type="latent",
        guidance_scale=1.0,
        decode_timestep=0.05,
        decode_noise_scale=0.025,
        generator=generator,
    ).frames
    torch.cuda.empty_cache()
    gc.collect()
    # Step 2: Upscale latents
    upscaled_latents = pipe_upsample(latents=latents, output_type="latent").frames
    torch.cuda.empty_cache()
    gc.collect()
    # Step 3: Decode upscaled latents to frames
    # Use the VAE decoder directly instead of the full pipeline
    frames = pipe.vae.decode(upscaled_latents).sample
    frames = (frames / 2 + 0.5).clamp(0, 1)  # Normalize to [0, 1]
    frames = (frames * 255).to(torch.uint8)  # Convert to uint8

    # Convert tensor to PIL Images
    pil_frames = []
    for i in range(frames.shape[2]):  # num_frames dimension
        frame = frames[0, :, i, :, :].permute(1, 2, 0).cpu().numpy()
        pil_frames.append(Image.fromarray(frame))
    torch.cuda.empty_cache()
    gc.collect()
    # Step 4: Export video
    video_path = "output.mp4"
    export_to_video(pil_frames, video_path, fps=24)

    # Step 5: TTS
    tts = gTTS(text=prompt, lang='en')
    tts.save("voice.mp3")
    AudioSegment.from_mp3("voice.mp3").export("voice.wav", format="wav")
    # Step 6: Subtitles
    model = whisper.load_model("base", device="cpu")
    result = model.transcribe("voice.wav", task="transcribe", language="en")

    # Generate SRT subtitles manually since result["srt"] might not be available
    srt_content = ""
    for i, segment in enumerate(result["segments"]):
        start_time = format_time(segment["start"])
        end_time = format_time(segment["end"])
        text = segment["text"].strip()
        srt_content += f"{i + 1}\n{start_time} --> {end_time}\n{text}\n\n"

    with open("subtitles.srt", "w", encoding="utf-8") as f:
        f.write(srt_content)
    # Step 7: Merge video + audio + subtitles
    final_output = "final_with_audio.mp4"
    try:
        # Burn the subtitles into the video stream
        (
            ffmpeg
            .input(video_path)
            .output(
                final_output,
                vf="subtitles=subtitles.srt",
                **{"c:v": "libx264", "c:a": "aac"},
                loglevel="error"
            )
            .run(overwrite_output=True)
        )
        # Add the voiceover audio track (open each input separately and mux them in one output)
        video_in = ffmpeg.input(final_output)
        audio_in = ffmpeg.input("voice.wav")
        (
            ffmpeg
            .output(
                video_in,
                audio_in,
                "final_complete.mp4",
                **{"c:v": "copy", "c:a": "aac"},
                shortest=None,
                loglevel="error"
            )
            .run(overwrite_output=True)
        )
        return "final_complete.mp4"
    except Exception as e:
        print(f"FFmpeg error: {e}")
        # Fallback: return video without audio/subtitles
        return video_path
def format_time(seconds):
"""Convert seconds to SRT time format"""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
millisecs = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"
# Gradio UI
demo = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Describe your scene..."),
        gr.Textbox(label="Optional Image URL (e.g. Pexels)", placeholder="https://...")
    ],
    outputs=gr.Video(label="Generated Video"),
    title="🎬 LTX AI Video Generator",
    description="AI-powered video with voiceover and subtitles. Now outputs at 480x480 resolution."
)
demo.launch()