preSalesAIAutomation committed on
Commit e082df4 · verified · 1 Parent(s): 786c685

Update app.py

Files changed (1): app.py (+93 -28)
app.py CHANGED
@@ -1,63 +1,118 @@
 import gradio as gr
 import torch
 import spaces
-from diffusers import LTXConditionPipeline
+from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
 from diffusers.utils import export_to_video
+from PIL import Image, ImageOps
 from gtts import gTTS
 from pydub import AudioSegment
 import whisper
 import ffmpeg
+import requests
+from io import BytesIO
 import os
+import gc
 
-# Load pipeline
-pipe = LTXConditionPipeline.from_pretrained(
-    "Lightricks/LTX-Video-0.9.7-distilled", torch_dtype=torch.float16
+# Load LTX models
+ltx_model_id = "Lightricks/LTX-Video-0.9.7-distilled"
+upscaler_model_id = "Lightricks/ltxv-spatial-upscaler-0.9.7"
+
+pipe = LTXConditionPipeline.from_pretrained(ltx_model_id, torch_dtype=torch.float16)
+pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
+    upscaler_model_id, vae=pipe.vae, torch_dtype=torch.float16
 )
+
 pipe.to("cuda")
+pipe_upsample.to("cuda")
+pipe.vae.enable_tiling()
+
+def prepare_image_condition(image, size=(512, 512), background=(0, 0, 0)):
+    image = ImageOps.contain(image, size)
+    canvas = Image.new("RGB", size, background)
+    offset = ((size[0] - image.width) // 2, (size[1] - image.height) // 2)
+    canvas.paste(image, offset)
+    return canvas
 
-@spaces.GPU(duration=120)
-def generate_video(prompt):
+def round_to_nearest_resolution(height, width, ratio):
+    return height - (height % ratio), width - (width % ratio)
+
+@spaces.GPU(duration=180)
+def generate_video(prompt, image_url):
     generator = torch.Generator("cuda").manual_seed(42)
 
-    # Generate latent video
+    # Aspect-ratio preserving image prep
+    image = None
+    if image_url:
+        raw_image = Image.open(BytesIO(requests.get(image_url).content)).convert("RGB")
+        image = prepare_image_condition(raw_image)
+
+    # Dimensions
+    base_width, base_height = 512, 512
+    downscale = 2 / 3
+    w_d, h_d = round_to_nearest_resolution(int(base_width * downscale), int(base_height * downscale), pipe.vae_spatial_compression_ratio)
+
+    # Step 1: Generate latents
     latents = pipe(
         prompt=prompt,
-        width=512,
-        height=512,
-        num_frames=24,
+        image=image,
+        width=w_d,
+        height=h_d,
+        num_frames=60,
+        num_inference_steps=7,
         output_type="latent",
-        generator=generator,
-        num_inference_steps=7
+        guidance_scale=1.0,
+        decode_timestep=0.05,
+        decode_noise_scale=0.025,
+        low_vram=True,
+        generator=generator
     ).frames
 
-    # Decode frames
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    # Step 2: Upscale
+    upscaled = pipe_upsample(latents=latents, output_type="latent").frames
+
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    # Step 3: Decode to frames
     frames = pipe(
         prompt=prompt,
-        latents=latents,
-        num_frames=24,
+        image=image,
+        latents=upscaled,
+        width=base_width,
+        height=base_height,
+        num_frames=60,
+        num_inference_steps=10,
         output_type="pil",
-        generator=generator,
-        num_inference_steps=7
+        guidance_scale=1.0,
+        decode_timestep=0.05,
+        decode_noise_scale=0.025,
+        image_cond_noise_scale=0.025,
+        denoise_strength=0.3,
+        generator=generator
     ).frames[0]
 
-    # Save as video
+    # Step 4: Export video
     video_path = "output.mp4"
-    export_to_video(frames, video_path, fps=12)
+    export_to_video(frames, video_path, fps=24)
 
-    # TTS
+    # Step 5: TTS
     tts = gTTS(text=prompt, lang='en')
     tts.save("voice.mp3")
     AudioSegment.from_mp3("voice.mp3").export("voice.wav", format="wav")
 
-    # Subtitles
-    model = whisper.load_model("base")
-    result = model.transcribe("voice.wav", language="en")
-    with open("subtitles.srt", "w") as f:
+    # Step 6: Subtitles (CPU)
+    model = whisper.load_model("base", device="cpu")
+    result = model.transcribe("voice.wav", task="transcribe", language="en")
+    with open("subtitles.srt", "w", encoding="utf-8") as f:
         f.write(result["srt"])
 
-    # Merge audio + subtitles into video
+    # Step 7: Merge video + audio + subtitles
+    final_output = "final_with_audio.mp4"
     ffmpeg.input(video_path).output(
-        "final.mp4",
+        final_output,
         vf="subtitles=subtitles.srt",
         i="voice.mp3",
         c="copy",
@@ -65,8 +120,18 @@ def generate_video(prompt):
         loglevel="error"
     ).run()
 
-    return "final.mp4"
+    return final_output
 
 # Gradio UI
-demo = gr.Interface(fn=generate_video, inputs="text", outputs=gr.Video())
+demo = gr.Interface(
+    fn=generate_video,
+    inputs=[
+        gr.Textbox(label="Prompt", placeholder="Describe your scene..."),
+        gr.Textbox(label="Optional Image URL (e.g. Pexels)", placeholder="https://...")
+    ],
+    outputs=gr.Video(label="Generated Video"),
+    title="🎬 LTX AI Video Generator",
+    description="AI-powered video with voiceover and subtitles. Supports ZeroGPU (PyTorch) runtime."
+)
+
 demo.launch()
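
A quick check on the new dimension math: with the default 512 x 512 base and the 2/3 downscale, the first pass runs at 320 x 320, assuming LTX-Video's usual VAE spatial compression ratio of 32 (the value exposed as pipe.vae_spatial_compression_ratio):

# int(512 * 2 / 3) = 341; rounding down to a multiple of 32: 341 - (341 % 32) = 320
w_d, h_d = round_to_nearest_resolution(341, 341, 32)  # -> (320, 320)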
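
One caveat in the subtitle step: stock openai-whisper's transcribe() returns a dict with "text" and "segments" keys, not a prebuilt "srt" string, so result["srt"] is likely to raise a KeyError unless a fork adds it. A minimal sketch that builds the SRT from segments instead (to_srt_timestamp and segments_to_srt are illustrative helpers, not part of this commit):

import whisper

def to_srt_timestamp(seconds: float) -> str:
    # SRT timestamps use the HH:MM:SS,mmm format.
    total_ms = int(seconds * 1000)
    h, rem = divmod(total_ms, 3_600_000)
    m, rem = divmod(rem, 60_000)
    s, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def segments_to_srt(segments) -> str:
    # Each whisper segment dict carries "start", "end", and "text" fields.
    blocks = []
    for i, seg in enumerate(segments, start=1):
        blocks.append(
            f"{i}\n"
            f"{to_srt_timestamp(seg['start'])} --> {to_srt_timestamp(seg['end'])}\n"
            f"{seg['text'].strip()}\n"
        )
    return "\n".join(blocks)

model = whisper.load_model("base", device="cpu")
result = model.transcribe("voice.wav", language="en")
with open("subtitles.srt", "w", encoding="utf-8") as f:
    f.write(segments_to_srt(result["segments"]))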
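
The merge step also looks fragile as committed: i="voice.mp3" and c="copy" become output options, but -i is an input flag, and stream-copying the video conflicts with the subtitles filter, which forces a re-encode. A sketch of the mux with ffmpeg-python using two explicit inputs (the codec choices here are assumptions, not from the commit):

import ffmpeg

video_in = ffmpeg.input("output.mp4")
audio_in = ffmpeg.input("voice.mp3")

(
    ffmpeg
    .output(
        video_in.video,
        audio_in.audio,
        "final_with_audio.mp4",
        vf="subtitles=subtitles.srt",  # burning subtitles requires re-encoding the video
        vcodec="libx264",
        acodec="aac",
        shortest=None,  # emitted as the bare -shortest flag: stop at the shorter stream
        loglevel="error",
    )
    .run(overwrite_output=True)
)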