import gradio as gr
import torch
import os
from glob import glob
from typing import Optional
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video
from PIL import Image
import random
from moviepy import VideoFileClip, concatenate_videoclips  # MoviePy >= 2.0; on 1.x import from moviepy.editor

# Load the Stable Video Diffusion Pipeline
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", 
    torch_dtype=torch.float16, 
    variant="fp16"
)
pipe.to("cuda")
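
# Optional memory saver (an assumption, not part of the original script): on GPUs
# with limited VRAM, diffusers can offload idle sub-models to the CPU on demand.
# Use it *instead of* pipe.to("cuda"), trading some speed for memory:
# pipe.enable_model_cpu_offload()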

# Maximum seed value (largest signed 64-bit integer)
max_64_bit_int = 2**63 - 1

# Resize and crop image to desired resolution
def resize_image(image, output_size=(1024, 576)):
    target_aspect = output_size[0] / output_size[1]
    image_aspect = image.width / image.height

    if image_aspect > target_aspect:
        new_height = output_size[1]
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - output_size[0]) // 2
        right = (new_width + output_size[0]) // 2
        top, bottom = 0, output_size[1]
    else:
        new_width = output_size[0]
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left, right = 0, output_size[0]
        top = (new_height - output_size[1]) // 2
        bottom = (new_height + output_size[1]) // 2

    return resized_image.crop((left, top, right, bottom))
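
# Worked example: a 4000x3000 upload has aspect ratio 1.33, below the 16:9
# target (1024/576 ~= 1.78), so it is resized to 1024x768 and center-cropped
# vertically (top=96, bottom=672) down to 1024x576.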

# Combine multiple video snippets into a single video
def combine_videos(video_paths, output_path="outputs/final_long_video.mp4"):
    os.makedirs("outputs", exist_ok=True)
    clips = [VideoFileClip(vp) for vp in video_paths]
    final_clip = concatenate_videoclips(clips, method="compose")
    final_clip.write_videofile(output_path, codec="libx264", fps=clips[0].fps, audio=False)
    # Release the file handles MoviePy keeps open on each clip
    final_clip.close()
    for clip in clips:
        clip.close()
    return output_path
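
# Note: write_videofile shells out to ffmpeg; recent MoviePy releases pull in a
# binary via the imageio-ffmpeg dependency, so no system-wide install is needed.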

# Generate a video snippet from an input image
def generate_snippet(
    init_image: Image.Image, seed: int, motion_bucket_id: int, fps_id: int, decoding_t: int, output_folder: str
):
    generator = torch.manual_seed(seed)
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    result = pipe(
        init_image,
        decode_chunk_size=decoding_t,       # frames decoded per VAE pass; lower values use less VRAM
        generator=generator,
        motion_bucket_id=motion_bucket_id,  # higher values produce more motion
        noise_aug_strength=0.1,             # noise added to the conditioning image
        num_frames=25                       # SVD-XT is trained to generate 25 frames
    )
    frames = result.frames[0]
    export_to_video(frames, video_path, fps=fps_id)

    return frames[-1], video_path

# Generate a long video composed of 5 short snippets
def sample_long(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    decoding_t: int = 3,
    output_folder: str = "outputs"
):
    if image.mode == "RGBA":
        image = image.convert("RGB")
    # Conform the input to the 1024x576 resolution SVD-XT expects
    image = resize_image(image, output_size=(1024, 576))
    if randomize_seed:
        seed = random.randint(0, max_64_bit_int)

    snippet_paths = []
    current_image = image
    # Chain five snippets: the last frame of each clip seeds the next generation
    for _ in range(5):
        current_image, snippet_path = generate_snippet(
            init_image=current_image,
            seed=seed,
            motion_bucket_id=motion_bucket_id,
            fps_id=fps_id,
            decoding_t=decoding_t,
            output_folder=output_folder
        )
        snippet_paths.append(snippet_path)

    return combine_videos(snippet_paths), seed
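
# With the defaults above the result is 5 snippets x 25 frames = 125 frames,
# which at fps_id = 6 comes to roughly 20.8 seconds of footage.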

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("### Stable Video Diffusion - Generate a Long Video")

    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload an image", type="pil")
            generate_btn = gr.Button("Generate Long Video")
        video_output = gr.Video()

    with gr.Accordion("Advanced Options", open=False):
        seed = gr.Slider(0, max_64_bit_int, value=42, step=1, label="Seed")
        randomize_seed = gr.Checkbox(value=True, label="Randomize Seed")
        motion_bucket_id = gr.Slider(1, 255, value=127, step=1, label="Motion Bucket ID")
        fps_id = gr.Slider(5, 30, value=6, step=1, label="Frames Per Second")

    generate_btn.click(
        sample_long,
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
        outputs=[video_output, seed]
    )

if __name__ == "__main__":
    demo.launch(share=True)
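
# A minimal sketch of driving the generator headlessly, without the Gradio UI.
# "input.jpg" is a hypothetical file name used purely for illustration:
#
#   from PIL import Image
#   frame = Image.open("input.jpg")
#   video_path, used_seed = sample_long(frame, randomize_seed=True)
#   print(f"Wrote {video_path} (seed={used_seed})")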