import os

# Clone the repository and install dependencies before importing diffsynth,
# so the locally copied package is importable on a fresh environment
os.system("git clone https://github.com/modelscope/DiffSynth-Studio.git")
os.system("cp -r DiffSynth-Studio/diffsynth ./")
os.system("pip install -r DiffSynth-Studio/requirements.txt")

import torch
import random
import time
import numpy as np
from PIL import Image
import gradio as gr
from diffsynth import save_video, ModelManager, SVDVideoPipeline
import spaces

# Function to initialize the model pipeline
def get_i2v_pipeline():
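    # Load the base stable-video-diffusion-img2vid-xt weights together with the
    # ExVideo-SVD-128f-v1 extension, preferring HuggingFace as the download source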
    model_manager = ModelManager(
        torch_dtype=torch.float16,
        device="cuda",
        model_id_list=["stable-video-diffusion-img2vid-xt", "ExVideo-SVD-128f-v1"],
        downloading_priority=["HuggingFace"]
    )
    pipe = SVDVideoPipeline.from_model_manager(model_manager)
    return pipe

# Function to sample and generate video
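# spaces.GPU requests a ZeroGPU device for each call, with an allotted runtime of 280 seconds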
@spaces.GPU(duration=280)
def sample(image, seed, randomize_seed, motion_bucket_id, num_inference_steps):
    if randomize_seed:
        seed = random.randint(0, 10**8)
    torch.manual_seed(seed)
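    # Generate 128 frames at 512x512, 30 fps; motion_bucket_id controls how much
    # motion is synthesized and num_inference_steps trades quality for speed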
    video = pipe(
        input_image=image.resize((512, 512)),
        num_frames=128, fps=30, height=512, width=512,
        motion_bucket_id=motion_bucket_id,
        num_inference_steps=num_inference_steps,
        min_cfg_scale=2, max_cfg_scale=2, contrast_enhance_scale=1.2
    )
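    # Write the frames to a uniquely named MP4 (nanosecond timestamp) under videos/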
    file_path = f"videos/{time.time_ns()}.mp4"
    os.makedirs("videos", exist_ok=True)
    save_video(video, file_path, fps=30, quality=7)
    return file_path, seed

# Function to center-crop the image to the target aspect ratio and resize it to 512x512
def crop_and_resize(image):
    height = 512
    width = 512
    image = np.array(image)
    image_height, image_width, _ = image.shape
    if image_height / image_width < height / width:
        # Image is too wide: crop the width around the horizontal center
        cropped_width = int(image_height / height * width)
        left = (image_width - cropped_width) // 2
        image = image[:, left: left + cropped_width]
    else:
        # Image is too tall: crop the height around the vertical center
        cropped_height = int(image_width / width * height)
        top = (image_height - cropped_height) // 2
        image = image[top: top + cropped_height, :]
    image = Image.fromarray(image).convert("RGB").resize((width, height))
    return image

# Initialize the model pipeline
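# (built once at startup so that every request reuses the already-loaded weights)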
pipe = get_i2v_pipeline()

# Function to process examples
def process_examples(image):
    file_path, seed = sample(image, seed=0, randomize_seed=True, motion_bucket_id=100, num_inference_steps=25)
    return file_path, seed

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown('''
    # ExVideo

ExVideo is a post-tuning technique for enhancing the capability of video generation models. We have extended Stable Video Diffusion to generate long videos of up to 128 frames.

This is the first model we have released publicly. Due to limited computational resources, it was trained on about 40,000 videos using 8x A100 GPUs for approximately one week, so it may occasionally generate content that does not conform to real-world principles. Please stay tuned for our subsequent models.

To use this model in your own code, please refer to [DiffSynth](https://github.com/modelscope/DiffSynth-Studio).

* [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
* [Source Code](https://github.com/modelscope/DiffSynth-Studio)
* [Technical Report](https://arxiv.org/abs/2406.14130)
    ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()
    with gr.Accordion("Advanced options", open=False):
        seed = gr.Slider(label="Seed", value=0, randomize=True, minimum=0, maximum=10**8, step=1)
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        motion_bucket_id = gr.Slider(label="Motion bucket id", info="Controls how much motion to synthesize", value=100, minimum=0, maximum=127)
        num_inference_steps = gr.Slider(label="Inference steps", value=25, minimum=1, maximum=50)
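        # When "Randomize seed" is checked, sample() draws a fresh seed and returns it,
        # and the click handler below writes it back into this slider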

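    # Wire the UI: uploads are center-cropped in place; the Generate button runs
    # sample() and loads the resulting MP4 into the video player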
    image.upload(fn=crop_and_resize, inputs=image, outputs=image, queue=False)
    generate_btn.click(fn=sample, inputs=[image, seed, randomize_seed, motion_bucket_id, num_inference_steps], outputs=[video, seed], api_name="video")
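    # Example images are cached lazily: each example's video is generated the first
    # time a visitor selects it and then reused afterwards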
    gr.Examples(
        examples=[
            "images/0.png",
            "images/1.png",
            "images/2.png",
            "images/3.png",
            "images/4.png"
        ],
        inputs=image,
        outputs=[video, seed],
        fn=process_examples,
        cache_examples="lazy",
    )

if __name__ == "__main__":
    demo.launch()