Spaces:

ovi054
/

Wan2.1-Image

Running on Zero

File size: 1,692 Bytes

812e69e
 
 
 
 
 
 
 
 
ab613df
69a9a62
ab613df
8ea00e2
812e69e
 
 
 
d60c82f
812e69e
 
cc9b68f
d60c82f
812e69e
 
da1c584
cc9b68f
 
812e69e
da1c584
812e69e
 
 
 
 
 
 
 
 
 
 
da1c584
8ea00e2
cc9b68f
 
da1c584
78004f2
812e69e

import torch
from diffusers import UniPCMultistepScheduler
from diffusers import WanPipeline, AutoencoderKLWan  # Use Wan-specific VAE
from diffusers.models import UNetSpatioTemporalConditionModel
from transformers import T5EncoderModel, T5Tokenizer

from PIL import Image
import numpy as np

import gradio as gr
import spaces

# Hub repository that every component below is loaded from.
model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"

# The Wan-specific VAE is loaded separately in float32, while the rest of
# the pipeline is loaded in bfloat16.
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=torch.float32,
)
pipe = WanPipeline.from_pretrained(
    model_id,
    vae=vae,
    torch_dtype=torch.bfloat16,
)

# Scheduler shift: 5.0 for 720P, 3.0 for 480P.
flow_shift = 5.0

# Replace the pipeline's default scheduler with UniPC, reusing the existing
# scheduler config and overriding only the flow shift.
pipe.scheduler = UniPCMultistepScheduler.from_config(
    pipe.scheduler.config,
    flow_shift=flow_shift,
)


@spaces.GPU()
def generate(prompt, negative_prompt, width=1280, height=720, num_inference_steps=50, progress=gr.Progress(track_tqdm=True)):
    """Generate a single image from a text prompt with the Wan pipeline.

    The text-to-video pipeline is run with ``num_frames=1`` and the only
    frame of the first result is returned as a still image.

    Args:
        prompt: Text describing the desired image.
        negative_prompt: Text describing what the image should avoid.
        width: Requested output width in pixels. Rounded down to a multiple
            of 16 (minimum 16) before being passed to the pipeline.
        height: Requested output height in pixels. Rounded down to a
            multiple of 16 (minimum 16) before being passed to the pipeline.
        num_inference_steps: Number of denoising steps.
        progress: Gradio progress tracker; unused directly, it is declared
            so Gradio can report tqdm progress in the UI.

    Returns:
        PIL.Image.Image: The generated image.
    """
    # UI sliders / API callers may deliver floats, and the Wan pipeline
    # rejects dimensions that are not divisible by 16, so normalise here
    # instead of failing deep inside the pipeline call.
    multiple = 16
    width = max(multiple, int(width) // multiple * multiple)
    height = max(multiple, int(height) // multiple * multiple)
    num_inference_steps = int(num_inference_steps)

    pipe.to("cuda")
    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=height,
        width=width,
        num_frames=1,
        num_inference_steps=num_inference_steps,
        guidance_scale=5.0,
    )

    # First batch entry, first (only) frame.
    # Assumes the pipeline's default numpy output: float values in [0, 1].
    frame = np.asarray(output.frames[0][0])

    # Clip and round before casting: a bare astype(uint8) truncates
    # (darkening by up to one level) and wraps on out-of-range values.
    pixels = np.clip(frame * 255, 0, 255).round().astype(np.uint8)
    return Image.fromarray(pixels)

# Gradio UI for `generate`. The components in `inputs` followed by those in
# `additional_inputs` are listed in the same order as generate's parameters:
# prompt, negative_prompt, width, height, num_inference_steps.
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Input prompt"),
    ],
    additional_inputs=[
        gr.Textbox(label="Negative prompt", value=""),
        # The Wan pipeline only accepts dimensions divisible by 16, so the
        # sliders step by 16 (480, 720 and 1280 are all multiples of 16).
        gr.Slider(label="Width", minimum=480, maximum=1280, step=16, value=1280),
        gr.Slider(label="Height", minimum=480, maximum=1280, step=16, value=720),
        gr.Slider(minimum=20, maximum=100, step=1, label="Inference Steps", value=50),
    ],
    outputs=gr.Image(label="output"),
)

iface.launch()