import gradio as gr
import spaces
import torch
import os
import shutil
from glob import glob
from typing import Optional

from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video
from PIL import Image

import uuid
import random
from huggingface_hub import hf_hub_download

# OPTIONAL: Clear caches at startup to free space
hf_cache = os.path.expanduser("~/.cache/huggingface")
torch_cache = os.path.expanduser("~/.cache/torch")
if os.path.exists(hf_cache):
    shutil.rmtree(hf_cache)
if os.path.exists(torch_cache):
    shutil.rmtree(torch_cache)
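
# Note: the model below is loaded after this cleanup, so wiping the Hugging Face
# cache trades disk headroom for a full re-download of the weights on every cold start.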

# Configure ZeroGPU to use memory instead of disk
from spaces.zero.config import Config
Config.zerogpu_offload_dir = None  # Disable disk offloading to prevent disk space issues

# Load the pipeline, authenticating with a token if one is set in the environment
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
    token=os.getenv("HUGGINGFACE_TOKEN"),  # `token` replaces the deprecated `use_auth_token`
)
pipe.to("cuda")
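
# Hedged alternative for tighter GPUs (not enabled here): instead of the
# unconditional pipe.to("cuda") above, diffusers documents memory savers for
# this pipeline that trade speed for lower VRAM, e.g.:
#
#     pipe.enable_model_cpu_offload()      # move submodules to the GPU only when needed
#     pipe.unet.enable_forward_chunking()  # chunk the UNet feed-forward to lower peaks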

max_64_bit_int = 2**63 - 1

def clean_outputs(output_folder: str, keep: int = 1):
    """
    Remove old video files to prevent using all disk space.
    Keeps the most recent <keep> files.
    """
    files = sorted(glob(os.path.join(output_folder, "*.mp4")), key=os.path.getmtime)
    if len(files) > keep:
        for old_file in files[:-keep]:
            os.remove(old_file)

@spaces.GPU(duration=250)
def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",  # unused below; kept for signature compatibility
    cond_aug: float = 0.02,  # unused below; noise_aug_strength is hardcoded to 0.1
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats the most VRAM; reduce if necessary.
    device: str = "cuda",  # unused below; the pipeline is already on CUDA
    output_folder: str = "outputs",
    progress=gr.Progress(track_tqdm=True)
):
    if image.mode == "RGBA":
        image = image.convert("RGB")
        
    if randomize_seed:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)
    
    os.makedirs(output_folder, exist_ok=True)
    # Name files with a UUID rather than a running count: clean_outputs() deletes
    # old files, so a count-based name could collide with (and overwrite) a kept video.
    video_path = os.path.join(output_folder, f"{uuid.uuid4()}.mp4")

    # Reduce num_frames from 25 to 10 to consume less space
    frames = pipe(
        image, 
        decode_chunk_size=decoding_t, 
        generator=generator, 
        motion_bucket_id=motion_bucket_id, 
        noise_aug_strength=0.1, 
        num_frames=10  # reduced from 25
    ).frames[0]
    
    # fps_id only sets the exported file's frame rate; it is not passed to the
    # pipeline, so the generated motion itself does not change with it.
    export_to_video(frames, video_path, fps=fps_id)

    # Clean up old videos to prevent filling disk
    clean_outputs(output_folder, keep=2)
    
    return video_path, seed

def resize_image(image, output_size=(1024, 576)):
    # Calculate aspect ratios
    target_aspect = output_size[0] / output_size[1]
    image_aspect = image.width / image.height

    # Wider than the target aspect: match the height, then center-crop the width
    if image_aspect > target_aspect:
        new_height = output_size[1]
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - output_size[0]) // 2
        top = 0
        right = left + output_size[0]
        bottom = output_size[1]
    # Taller (or equal): match the width, then center-crop the height
    else:
        new_width = output_size[0]
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = 0
        top = (new_height - output_size[1]) // 2
        right = output_size[0]
        bottom = top + output_size[1]

    cropped_image = resized_image.crop((left, top, right, bottom))
    return cropped_image
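
# Illustrative check of resize_image (the 512x512 input is a made-up example):
# a square image has aspect 1.0 < 16/9, so it is scaled up to 1024x1024 and
# center-cropped vertically to the 1024x576 target.
#
#     _img = Image.new("RGB", (512, 512))
#     assert resize_image(_img).size == (1024, 576)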

with gr.Blocks() as demo:
    gr.Markdown('''# Community demo for Stable Video Diffusion - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [Stability's UI waitlist](https://stability.ai/contact))
#### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): generate a `~2s` video from a single image (`10 frames` at `6 fps`). This demo uses [🧨 diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/svd) to keep VRAM usage low.
    ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()
    with gr.Accordion("Advanced options", open=False):
        seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion the video contains; higher values mean more motion", value=127, minimum=1, maximum=255)
        fps_id = gr.Slider(label="Frames per second", info="The length of your video in seconds will be num_frames/fps", value=6, minimum=5, maximum=30)

    # Resize on upload
    image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)

    # Generate with sample() function
    generate_btn.click(
        fn=sample,
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
        outputs=[video, seed],
        api_name="video"
    )

if __name__ == "__main__":
    demo.launch(show_api=False)
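
# Hedged usage sketch: generate_btn.click registers api_name="video", so the
# endpoint should be callable remotely with gradio_client once the app is
# running (show_api=False is expected to hide only the API docs page). The
# Space id below is a placeholder, and older gradio_client versions accept a
# plain filepath instead of handle_file(...):
#
#     from gradio_client import Client, handle_file
#     client = Client("your-username/your-space")  # placeholder Space id
#     video_path, used_seed = client.predict(
#         handle_file("input.jpg"), 42, True, 127, 6, api_name="/video"
#     )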