"""Gradio demo that renders a single image with the Wan 2.1 text-to-video pipeline.

The pipeline is asked for exactly one frame (``num_frames=1``), and that frame
is converted to a PIL image and shown in the UI.
"""

import torch
from diffusers import UniPCMultistepScheduler
from diffusers import WanPipeline, AutoencoderKLWan  # Use Wan-specific VAE
from diffusers.models import UNetSpatioTemporalConditionModel
from transformers import T5EncoderModel, T5Tokenizer
from PIL import Image
import numpy as np
import gradio as gr
import spaces

model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"

# The VAE is loaded in float32 while the rest of the pipeline uses bfloat16.
vae = AutoencoderKLWan.from_pretrained(
    model_id, subfolder="vae", torch_dtype=torch.float32
)
pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)

flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
pipe.scheduler = UniPCMultistepScheduler.from_config(
    pipe.scheduler.config, flow_shift=flow_shift
)


@spaces.GPU()
def generate(
    prompt,
    negative_prompt,
    width=1280,
    height=720,
    num_inference_steps=50,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate one image for *prompt* and return it as a ``PIL.Image``.

    Args:
        prompt: Text describing the desired image.
        negative_prompt: Text describing what to avoid.
        width: Output width in pixels.
        height: Output height in pixels.
        num_inference_steps: Number of denoising steps.
        progress: Gradio progress tracker; ``track_tqdm=True`` mirrors the
            pipeline's tqdm bar in the UI.

    Returns:
        The first frame of the first generated video as an 8-bit PIL image.
    """
    pipe.to("cuda")

    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        # UI/API callers may deliver numeric values as floats; the pipeline
        # and scheduler expect integers here.
        height=int(height),
        width=int(width),
        num_frames=1,
        num_inference_steps=int(num_inference_steps),
        guidance_scale=5.0,
    )

    # frames[0] is the first (only) video, frames[0][0] its first frame.
    frame = np.asarray(output.frames[0][0])
    # Clip and round before the cast: a bare astype() truncates, which biases
    # every pixel downward and wraps around on any out-of-range value.
    pixels = (np.clip(frame, 0.0, 1.0) * 255).round().astype(np.uint8)
    return Image.fromarray(pixels)


iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Input prompt"),
    ],
    additional_inputs=[
        gr.Textbox(label="Negative prompt", value=""),
        # step=16: WanPipeline rejects heights/widths that are not multiples
        # of 16, so the sliders must not offer such values.
        gr.Slider(label="Width", minimum=480, maximum=1280, step=16, value=1024),
        gr.Slider(label="Height", minimum=480, maximum=1280, step=16, value=1024),
        gr.Slider(minimum=1, maximum=40, step=1, label="Inference Steps", value=20),
    ],
    outputs=gr.Image(label="output"),
)

iface.launch()