# app.py: Fast Wan 2.2 TI2V 5B demo (Hugging Face Space, running on ZeroGPU)
import os

# ZeroGPU environment setup: install a PyTorch nightly build (CUDA 12.6) plus the
# spaces SDK, then reload the NVIDIA UVM kernel module so CUDA initializes cleanly.
os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
os.system('sudo modprobe -r nvidia_uvm && sudo modprobe nvidia_uvm')
import spaces
import torch
from diffusers import AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video
#from diffusers.hooks import apply_first_block_cache, FirstBlockCacheConfig
import gradio as gr
import tempfile
from huggingface_hub import hf_hub_download
import numpy as np
from PIL import Image
import random
MODEL_ID = "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers"

# Keep the VAE in float32 for decoding stability; the transformers run in bfloat16.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)

# Initialize both pipelines (text-to-video and image-to-video), sharing the same VAE.
text_to_video_pipe = WanPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
image_to_video_pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)

for pipe in [text_to_video_pipe, image_to_video_pipe]:
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
    pipe.to("cuda")
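# Note: the two from_pretrained calls above each load their own copy of the 5B
# transformer (only the VAE instance is shared), so the transformer weights are
# held in memory twice.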
## LoRA experiments (kept for reference)
# vae = AutoencoderKLWan.from_pretrained("Kijai/WanVideo_comfy", filename="Wan2_2_VAE_bf16.safetensors", torch_dtype=torch.bfloat16)
# apply_first_block_cache(pipe.transformer, FirstBlockCacheConfig(threshold=0.2))
# LORA_REPO_ID = "JERRYNPC/WAN2.2-LORA-NSFW"
# LORA_FILENAME = "jerry_HIGH-nsfw-V10E800.safetensors"
# causvid_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
# pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
# pipe.set_adapters(["causvid_lora"], adapter_weights=[0.95])
# pipe.fuse_lora()
# LORA_REPO_ID = "AlekseyCalvin/HSToric_Color_Wan2.2_5B_LoRA_BySilverAgePoets"
# LORA_FILENAME = "HSToric_color_Wan22_5b_LoRA.safetensors"

# Active LoRA: load and fuse the style adapter into both pipelines so text- and
# image-to-video generations use it.
LORA_REPO_ID = "AlekseyCalvin/HSToric_Color_Wan2.2_5B_LoRA_BySilverAgePoets"
LORA_FILENAME = "HSTcolor_Wan5b_LoRA_Rank64_PowerEMAsigmaRel020.safetensors"
lora_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
for pipe in [text_to_video_pipe, image_to_video_pipe]:
    pipe.load_lora_weights(lora_path, adapter_name="wan_lora")
    pipe.set_adapters(["wan_lora"], adapter_weights=[1.0])
    pipe.fuse_lora()
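# To swap in a different adapter later, a minimal sketch (unfuse before reloading;
# `other_lora_path` is a hypothetical placeholder):
# for pipe in [text_to_video_pipe, image_to_video_pipe]:
#     pipe.unfuse_lora()
#     pipe.unload_lora_weights()
#     pipe.load_lora_weights(other_lora_path, adapter_name="other_lora")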
# Constants
MOD_VALUE = 32                      # output height/width must be multiples of 32
DEFAULT_H_SLIDER_VALUE = 832
DEFAULT_W_SLIDER_VALUE = 832
NEW_FORMULA_MAX_AREA = 1024 * 1024  # pixel budget when deriving dims from an upload
SLIDER_MIN_H, SLIDER_MAX_H = 256, 1024
SLIDER_MIN_W, SLIDER_MAX_W = 256, 1024
MAX_SEED = np.iinfo(np.int32).max
FIXED_FPS = 16
MIN_FRAMES_MODEL = 17   # ~1.1 s at 16 fps
MAX_FRAMES_MODEL = 193  # ~12.1 s at 16 fps
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
default_negative_prompt = "dull, overexposed, flashing, stuttering, static, blurred, vapid, banal, overall gray, worst quality, low quality, JPEG compression residue, incomplete, extra, error, missing, vanishing, lapse, broken, wrong, deformed, disfigured, misshapen, fused fingers, still, messy, watermark"
def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area, min_slider_h, max_slider_h, min_slider_w, max_slider_w, default_h, default_w):
    orig_w, orig_h = pil_image.size
    if orig_w <= 0 or orig_h <= 0:
        return default_h, default_w
    aspect_ratio = orig_h / orig_w
    # Scale to the target pixel budget while preserving aspect ratio, then snap
    # both sides down to multiples of mod_val and clip to the slider range.
    calc_h = round(np.sqrt(calculation_max_area * aspect_ratio))
    calc_w = round(np.sqrt(calculation_max_area / aspect_ratio))
    calc_h = max(mod_val, (calc_h // mod_val) * mod_val)
    calc_w = max(mod_val, (calc_w // mod_val) * mod_val)
    new_h = int(np.clip(calc_h, min_slider_h, (max_slider_h // mod_val) * mod_val))
    new_w = int(np.clip(calc_w, min_slider_w, (max_slider_w // mod_val) * mod_val))
    return new_h, new_w
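# Worked example: a 1920x1080 upload (aspect ratio 0.5625) against the 1024*1024 budget
# gives calc_h = round(sqrt(1048576 * 0.5625)) = 768 and calc_w = round(sqrt(1048576 / 0.5625)) = 1365;
# snapping to multiples of 32 yields (768, 1344), and clipping to the slider range returns (768, 1024).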
def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
    if uploaded_pil_image is None:
        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
    try:
        new_h, new_w = _calculate_new_dimensions_wan(
            uploaded_pil_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
            SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
            DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
        )
        return gr.update(value=new_h), gr.update(value=new_w)
    except Exception:
        gr.Warning("Error attempting to calculate new dimensions")
        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
def get_duration(input_image, prompt, height, width,
                 negative_prompt, duration_seconds,
                 guidance_scale, steps,
                 seed, randomize_seed,
                 progress):
    # Estimate the ZeroGPU allocation (in seconds) from the two main cost drivers:
    # inference steps and clip length.
    if steps > 5 and duration_seconds > 5:
        return 60
    elif steps > 5 or duration_seconds > 5:
        return 50
    else:
        return 40
@spaces.GPU(duration=get_duration)
def generate_video(input_image, prompt, height, width, negative_prompt=default_negative_prompt, duration_seconds=2, guidance_scale=0, steps=4, seed=44, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
    # Snap dimensions to the model's stride and clamp the frame count to the supported range.
    target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
    target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    # Route to image-to-video when an input image is provided, otherwise text-to-video.
    if input_image is not None:
        resized_image = input_image.resize((target_w, target_h))
        with torch.inference_mode():
            output_frames_list = image_to_video_pipe(
                image=resized_image, prompt=prompt, negative_prompt=negative_prompt,
                height=target_h, width=target_w, num_frames=num_frames,
                guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
                generator=torch.Generator(device="cuda").manual_seed(current_seed)
            ).frames[0]
    else:
        with torch.inference_mode():
            output_frames_list = text_to_video_pipe(
                prompt=prompt, negative_prompt=negative_prompt,
                height=target_h, width=target_w, num_frames=num_frames,
                guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
                generator=torch.Generator(device="cuda").manual_seed(current_seed)
            ).frames[0]
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
    return video_path, current_seed
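# Programmatic use, a minimal sketch that bypasses the UI (argument values are examples):
#   video_path, used_seed = generate_video(None, "a red fox running through snow",
#                                          height=832, width=832, duration_seconds=2)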
with gr.Blocks() as demo:
    gr.Markdown("# Fast Wan 2.2 TI2V 5B Demo")
    gr.Markdown("""This demo uses [FastWan2.2-TI2V-5B](https://huggingface.co/FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers), which is fine-tuned with sparse distillation, allowing Wan to generate high-quality videos in 3-5 steps.""")
    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(type="pil", label="Input Image (optional, auto-resized to target H/W)")
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
            duration_seconds_input = gr.Slider(minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1), maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1), step=0.1, value=2, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
            with gr.Accordion("Advanced Settings", open=True):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                with gr.Row():
                    height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                    width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
                steps_slider = gr.Slider(minimum=1, maximum=8, step=1, value=4, label="Inference Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=5.0, step=0.01, value=0.0, label="Guidance Scale")
            generate_button = gr.Button("Generate Video", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)

    # Recompute the H/W sliders whenever an image is uploaded or cleared.
    input_image_component.upload(
        fn=handle_image_upload_for_dims_wan,
        inputs=[input_image_component, height_input, width_input],
        outputs=[height_input, width_input]
    )
    input_image_component.clear(
        fn=handle_image_upload_for_dims_wan,
        inputs=[input_image_component, height_input, width_input],
        outputs=[height_input, width_input]
    )

    ui_inputs = [
        input_image_component, prompt_input, height_input, width_input,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox
    ]
    generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])

    gr.Examples(
        examples=[
            [None, "A person eating spaghetti", 1024, 720],
            ["cat.png", "The cat removes the glasses from its eyes.", 1088, 800],
            [None, "a penguin playfully dancing in the snow, Antarctica", 1024, 720],
            ["peng.png", "a penguin running towards camera joyfully, Antarctica", 896, 512],
        ],
        inputs=[input_image_component, prompt_input, height_input, width_input],
        outputs=[video_output, seed_input],
        fn=generate_video,
        cache_examples="lazy"
    )
if __name__ == "__main__":
    demo.queue().launch()
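# Local sketch (assumption; this file targets a ZeroGPU Space): with a CUDA GPU and the
# dependencies installed, `python app.py` should launch the demo; the spaces SDK documents
# the @spaces.GPU decorator as a no-op outside ZeroGPU hardware.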