import spaces
from datetime import datetime
import gc
import gradio as gr
import numpy as np
import random
import os
from diffusers import AutoencoderKLLTXVideo, LTXPipeline, LTXVideoTransformer3DModel
from diffusers.utils import export_to_video
from transformers import T5EncoderModel, T5Tokenizer
import torch
from utils import install_packages  # local helper module bundled with the Space
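# Inference-only global settings: TF32 matmuls speed up Ampere+ GPUs at a
# negligible precision cost, the TorchScript JIT is disabled, and autograd is
# turned off globally since this app never trains.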
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.jit._state.disable()
torch.set_grad_enabled(False)
gc.collect()
torch.cuda.empty_cache()
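# Model sources: the diffusers-format repo provides the T5 text encoder and
# tokenizer, while the transformer and VAE come from Lightricks' single-file
# checkpoint; everything is loaded in bfloat16 to halve memory use.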
ckpt_path = "a-r-r-o-w/LTX-Video-0.9.1-diffusers"
single_file_url = "https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors"
transformer = LTXVideoTransformer3DModel.from_single_file(
single_file_url, torch_dtype=torch.bfloat16
)
vae = AutoencoderKLLTXVideo.from_single_file(
    single_file_url, torch_dtype=torch.bfloat16
)
vae.eval()
vae = vae.to("cuda")
text_encoder = T5EncoderModel.from_pretrained(
ckpt_path,
subfolder="text_encoder",
torch_dtype=torch.bfloat16
)
text_encoder.eval()
text_encoder = text_encoder.to("cuda")
tokenizer = T5Tokenizer.from_pretrained(
ckpt_path,
subfolder="tokenizer"
)
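# Build the pipeline around the components loaded above; the text encoder and
# tokenizer are not part of the single-file checkpoint, so they must be
# supplied explicitly.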
pipeline = LTXPipeline.from_single_file(
single_file_url,
transformer=transformer,
text_encoder=text_encoder,
tokenizer=tokenizer,
vae=vae,
torch_dtype=torch.bfloat16
)
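# Tiled and sliced VAE decoding trades a little speed for much lower peak VRAM
# when decoding the video latents; CPU offload is left disabled because the
# whole pipeline is kept on the GPU.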
# pipeline.enable_model_cpu_offload()
pipeline.vae.enable_tiling()
pipeline.vae.enable_slicing()
pipeline = pipeline.to("cuda")
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1280
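# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to the process only for the
# duration of each call to the decorated function.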
@spaces.GPU()
def infer(
prompt,
negative_prompt,
seed,
randomize_seed,
width=704,
height=448,
num_frames=129,
fps=24,
num_inference_steps=30,
progress=gr.Progress(track_tqdm=True),
):
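    # Draw a fresh seed when requested, then seed a CUDA generator so that a
    # given seed reproduces the same video.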
if randomize_seed:
seed = random.randint(0, MAX_SEED)
generator = torch.Generator(device='cuda').manual_seed(seed)
    with torch.autocast("cuda", dtype=torch.bfloat16), torch.inference_mode():
video = pipeline(
prompt=prompt,
negative_prompt=negative_prompt,
width=width,
height=height,
num_frames=num_frames,
# guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
# decode_timestep=decode_timestep,
# decode_noise_scale=decode_noise_scale,
generator=generator,
# max_sequence_length=512,
).frames[0]
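    # Write the frames to ./output/<timestamp>.mp4, then release memory before
    # the ZeroGPU allocation is returned.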
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"output_{timestamp}.mp4"
os.makedirs("output", exist_ok=True)
output_path = f"./output/{filename}"
export_to_video(video, output_path, fps=fps)
    gc.collect()
torch.cuda.empty_cache()
return output_path
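# Single-column Gradio UI: prompt boxes and a run button up top, the result
# video below, and advanced controls tucked into an accordion.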
css = """
#col-container {
margin: 0 auto;
max-width: 640px;
}
"""
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
        gr.Markdown("# LTX-Video Text-to-Video")
with gr.Row():
prompt = gr.Textbox(
label="Prompt",
lines=3,
value=str("A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"),
)
negative_prompt = gr.Textbox(
label="Negative prompt",
lines=3,
value=str("worst quality, blurry, distorted"),
)
with gr.Row():
run_button = gr.Button("Run", scale=0, variant="huggingface")
with gr.Row():
result = gr.Video(label="Result", show_label=False)
with gr.Accordion("Advanced Settings", open=False):
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=0,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
with gr.Row():
width = gr.Slider(
label="Width",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
                    value=704,  # LTX-Video expects width divisible by 32
)
height = gr.Slider(
label="Height",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
                    value=448,  # LTX-Video expects height divisible by 32
)
with gr.Row():
num_frames = gr.Slider(
label="Number of frames",
minimum=1,
maximum=257,
step=32,
                    value=129,  # LTX-Video expects num_frames of the form 8k + 1
)
fps = gr.Slider(
label="Number of frames per second",
minimum=1,
maximum=30,
step=1,
                    value=24,  # LTX-Video is trained to generate 24 fps video
)
with gr.Row():
num_inference_steps = gr.Slider(
label="Number of inference steps",
minimum=1,
maximum=50,
step=1,
                    value=30,  # more steps improve quality at the cost of runtime
)
gr.on(
triggers=[run_button.click, prompt.submit],
fn=infer,
inputs=[
prompt,
negative_prompt,
seed,
randomize_seed,
width,
height,
num_frames,
fps,
num_inference_steps,
],
outputs=[result],
)
if __name__ == "__main__":
install_packages()
demo.launch() |