import gradio as gr
import torch
import spaces
from diffusers import LTXConditionPipeline
from diffusers.utils import export_to_video
from gtts import gTTS
from pydub import AudioSegment
import whisper
import ffmpeg

# Load the distilled LTX-Video pipeline once at startup and keep it on the GPU.
pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.7-distilled",
    torch_dtype=torch.float16,
)
pipe.to("cuda")


def _srt_timestamp(seconds: float) -> str:
    """Format a time in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    ms = int(round(seconds * 1000))
    hours, ms = divmod(ms, 3_600_000)
    minutes, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"


@spaces.GPU(duration=120)
def generate_video(prompt):
    generator = torch.Generator("cuda").manual_seed(42)

    # Generate the video frames in a single call. The original two-pass version
    # fed output_type="latent" results back in as `latents`, which just re-runs
    # denoising on already-denoised latents; one direct pass is equivalent here.
    frames = pipe(
        prompt=prompt,
        width=512,
        height=512,
        num_frames=24,
        output_type="pil",
        generator=generator,
        num_inference_steps=7,
    ).frames[0]

    # Save the frames as a silent video.
    video_path = "output.mp4"
    export_to_video(frames, video_path, fps=12)

    # Narrate the prompt with gTTS; pydub converts the MP3 to WAV for Whisper.
    gTTS(text=prompt, lang="en").save("voice.mp3")
    AudioSegment.from_mp3("voice.mp3").export("voice.wav", format="wav")

    # Transcribe the narration and write an SRT file. Whisper's transcribe()
    # returns "text" and "segments" (there is no "srt" key), so the SRT is
    # built from the timestamped segments.
    model = whisper.load_model("base")
    result = model.transcribe("voice.wav", language="en")
    with open("subtitles.srt", "w") as f:
        for i, seg in enumerate(result["segments"], start=1):
            f.write(
                f"{i}\n"
                f"{_srt_timestamp(seg['start'])} --> {_srt_timestamp(seg['end'])}\n"
                f"{seg['text'].strip()}\n\n"
            )

    # Burn the subtitles into the video and mux in the narration. The audio
    # must be a second ffmpeg input (it cannot be passed as an output kwarg),
    # and the subtitles filter forces a video re-encode, so "-c copy" would
    # conflict with it; libx264/aac are used instead.
    video_in = ffmpeg.input(video_path)
    audio_in = ffmpeg.input("voice.mp3")
    (
        ffmpeg
        .output(
            video_in.video,
            audio_in.audio,
            "final.mp4",
            vf="subtitles=subtitles.srt",
            vcodec="libx264",
            acodec="aac",
            shortest=None,  # stop at the end of the shorter stream
        )
        .global_args("-loglevel", "error")
        .overwrite_output()
        .run()
    )
    return "final.mp4"


# Gradio UI: prompt in, finished narrated and subtitled video out.
demo = gr.Interface(fn=generate_video, inputs="text", outputs=gr.Video())
demo.launch()
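
# --- Dependency note (a sketch inferred from the imports above, not pinned
# versions) ---
# This app assumes the following packages are installed: gradio, spaces, torch,
# diffusers, gtts, pydub, openai-whisper, and ffmpeg-python. pydub, whisper,
# and ffmpeg-python additionally require the ffmpeg binary on PATH at runtime.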