import gradio as gr
import torch
import spaces
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video
from PIL import Image, ImageOps
from gtts import gTTS
from pydub import AudioSegment
try:
    import whisper
except ImportError:
    whisper = None
import ffmpeg
import requests
from io import BytesIO
import os
import gc

# Load LTX models
ltx_model_id = "Lightricks/LTX-Video-0.9.7-distilled"
upscaler_model_id = "Lightricks/ltxv-spatial-upscaler-0.9.7"
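# Note (assumption): the distilled 0.9.7 checkpoint is tuned for few inference
# steps and low guidance, which is why the calls below use ~10 steps and a
# guidance_scale of only 1.5.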

pipe = LTXConditionPipeline.from_pretrained(ltx_model_id, torch_dtype=torch.float16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    upscaler_model_id, vae=pipe.vae, torch_dtype=torch.float16
)

pipe.to("cuda")
pipe_upsample.to("cuda")
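# VAE tiling decodes the video in tiles, trading a little speed for a much
# lower peak-VRAM footprint during the decode step.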
pipe.vae.enable_tiling()

def prepare_image_condition(image, size=(512, 512), background=(0, 0, 0)):
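    """Letterbox `image` onto a fixed-size canvas without cropping.

    ImageOps.contain scales the image to fit within `size` while keeping its
    aspect ratio; the result is then centered on a solid `background` canvas,
    e.g. a 1280x720 input becomes 512x288 centered on a black 512x512 square.
    """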
    image = ImageOps.contain(image, size)
    canvas = Image.new("RGB", size, background)
    offset = ((size[0] - image.width) // 2, (size[1] - image.height) // 2)
    canvas.paste(image, offset)
    return canvas

@spaces.GPU(duration=180)
def generate_video(prompt, image_url):
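    """Generate a short clip from `prompt` (optionally image-conditioned via
    `image_url`), add a gTTS voiceover plus subtitles, and return the path to
    the muxed MP4 (or to a fallback file if a merge step fails)."""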
    generator = torch.Generator("cuda").manual_seed(42)
    
    # Load & prepare image
    image = None
    if image_url:
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        raw_image = Image.open(BytesIO(response.content)).convert("RGB")
        image = prepare_image_condition(raw_image)
    
    # Target resolutions: the LTX VAE downsamples 32x spatially, so both
    # dimensions must be multiples of 32. For a latent shape of
    # (1, 128, 8, 16, 16), the decoded size is 16 * 32 = 512 per side.
    base_width, base_height = 512, 512  # final size after latent upscaling
    down_width, down_height = 256, 256  # initial generation at half resolution
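    # Sanity check (sketch): fail fast if a resolution cannot map to an
    # integer latent size, rather than spending GPU time first.
    for dim in (base_width, base_height, down_width, down_height):
        assert dim % 32 == 0, f"dimension {dim} is not a multiple of 32"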
    
    # Step 1: Generate latents at lower resolution with improved quality settings
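    # Note (assumption): the LTX VAE also compresses time 8x, so num_frames of
    # the form 8 * k + 1 (e.g. 57 or 121) maps exactly onto latent frames;
    # other values appear to be floored to the nearest valid count internally.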
    latents = pipe(
        prompt=prompt,
        image=image,
        width=down_width,
        height=down_height,
        num_frames=60,
        num_inference_steps=10,  # Increased from 7 for better quality
        output_type="latent",
        guidance_scale=1.5,  # Slightly increased for better prompt adherence
        decode_timestep=0.08,  # Optimized value
        decode_noise_scale=0.05,  # Reduced noise
        generator=generator
    ).frames
    
    torch.cuda.empty_cache()
    gc.collect()
    
    # Step 2: Upscale latents
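    # The spatial upsampler doubles latent resolution (256 -> 512 here)
    # without a decode/re-encode round trip.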
    upscaled_latents = pipe_upsample(latents=latents, output_type="latent").frames
    
    torch.cuda.empty_cache()
    gc.collect()
    
    # Step 3: Decode upscaled latents to frames with improved settings
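    # Passing latents with denoise_strength=0.25 runs a light img2img-style
    # pass over the upscaled latents to restore detail before decoding to
    # PIL frames.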
    frames = pipe(
        prompt=prompt,  # Use original prompt for consistency
        latents=upscaled_latents,
        width=base_width,
        height=base_height,
        num_frames=60,
        num_inference_steps=12,  # Increased for better decoding quality
        output_type="pil",
        guidance_scale=1.5,  # Consistent with generation
        decode_timestep=0.08,  # Optimized
        decode_noise_scale=0.05,  # Reduced noise
        image_cond_noise_scale=0.02,  # Reduced for cleaner output
        denoise_strength=0.25,  # Balanced denoising
        generator=generator
    ).frames[0]
    
    torch.cuda.empty_cache()
    gc.collect()
    
    # Step 4: Export video
    video_path = "output.mp4"
    export_to_video(frames, video_path, fps=24)
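    # 60 frames at 24 fps is roughly 2.5 seconds of footage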
    
    # Step 5: TTS
    tts = gTTS(text=prompt, lang='en')
    tts.save("voice.mp3")
    AudioSegment.from_mp3("voice.mp3").export("voice.wav", format="wav")
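    # pydub shells out to the ffmpeg binary for MP3 decoding, so ffmpeg must
    # be installed on the host; Whisper consumes the WAV copy below.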
    
    # Step 6: Subtitles
    srt_content = None
    if whisper is not None:
        try:
            model = whisper.load_model("base", device="cpu")
            result = model.transcribe("voice.wav", task="transcribe", language="en")

            # Build the SRT manually; Whisper's result dict exposes timed
            # segments but no ready-made "srt" field
            srt_content = ""
            for i, segment in enumerate(result["segments"]):
                start_time = format_time(segment["start"])
                end_time = format_time(segment["end"])
                text = segment["text"].strip()
                srt_content += f"{i + 1}\n{start_time} --> {end_time}\n{text}\n\n"
        except Exception as e:
            print(f"Whisper transcription failed: {e}")
    else:
        print("Whisper not available, using prompt as subtitle")

    if not srt_content:
        # Fallback: a single subtitle at the start of the clip, using the prompt
        srt_content = f"1\n00:00:00,000 --> 00:00:05,000\n{prompt}\n\n"

    with open("subtitles.srt", "w", encoding="utf-8") as f:
        f.write(srt_content)
    
    # Step 7: Merge video + audio + subtitles with ffmpeg
    final_output = "final_with_audio.mp4"
    try:
        # First, burn the subtitles into the video
        video_with_subs = "video_with_subs.mp4"
        (
            ffmpeg
            .input(video_path)
            .filter('subtitles', 'subtitles.srt')
            .output(video_with_subs, vcodec='libx264', loglevel='error')
            .overwrite_output()
            .run()
        )

        # Then mux in the audio track. Video and audio must come from
        # separate ffmpeg.input() calls; streams have no .input() method.
        video_stream = ffmpeg.input(video_with_subs)
        audio_stream = ffmpeg.input('voice.wav')
        (
            ffmpeg
            .output(
                video_stream,
                audio_stream,
                final_output,
                vcodec='copy',
                acodec='aac',
                shortest=None,  # None makes ffmpeg-python emit the bare -shortest flag
                loglevel='error'
            )
            .overwrite_output()
            .run()
        )

        return final_output

    except Exception as e:
        print(f"FFmpeg error: {e}")
        # Fallback: skip subtitles and just combine video with audio
        try:
            video_stream = ffmpeg.input(video_path)
            audio_stream = ffmpeg.input('voice.wav')
            (
                ffmpeg
                .output(
                    video_stream,
                    audio_stream,
                    final_output,
                    vcodec='libx264',
                    acodec='aac',
                    shortest=None,
                    loglevel='error'
                )
                .overwrite_output()
                .run()
            )
            return final_output
        except Exception as e2:
            print(f"FFmpeg fallback error: {e2}")
            # Final fallback: return the silent video without subtitles
            return video_path

def format_time(seconds):
    """Convert seconds to SRT time format"""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millisecs = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"

# Gradio UI
demo = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Describe your scene..."),
        gr.Textbox(label="Optional Image URL (e.g. Pexels)", placeholder="https://...")
    ],
    outputs=gr.Video(label="Generated Video"),
    title="🎬 LTX AI Video Generator",
    description="AI-powered video with voiceover and subtitles. Generates at 256x256, then latent-upscales to 512x512."
)

demo.launch()