import gradio as gr
import spaces
#import gradio.helpers
import torch
import os
from glob import glob
from pathlib import Path
from typing import Optional
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image
import uuid
import random
from huggingface_hub import hf_hub_download
# moviepy >= 2.0 exposes these at the top level (1.x used moviepy.editor)
from moviepy import VideoFileClip, concatenate_videoclips

#gradio.helpers.CACHED_FOLDER = '/data/cache'

# Load the SVD-XT image-to-video pipeline in half precision
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16"
)
pipe.to("cuda")
#pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
#pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

max_64_bit_int = 2**63 - 1


def resize_image(image, output_size=(1024, 576)):
    """
    Resize and center-crop the image to the target resolution without
    distorting the aspect ratio.
    """
    target_aspect = output_size[0] / output_size[1]
    image_aspect = image.width / image.height

    if image_aspect > target_aspect:
        # Image is wider than the target: match height, crop the sides
        new_height = output_size[1]
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - output_size[0]) / 2
        top = 0
        right = (new_width + output_size[0]) / 2
        bottom = output_size[1]
    else:
        # Image is taller than the target: match width, crop top and bottom
        new_width = output_size[0]
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = 0
        top = (new_height - output_size[1]) / 2
        right = output_size[0]
        bottom = (new_height + output_size[1]) / 2

    cropped_image = resized_image.crop((int(left), int(top), int(right), int(bottom)))
    return cropped_image


# NEW CODE HERE:
def combine_videos(video_paths, output_path="outputs/final_long_video.mp4"):
    """
    Concatenate a list of MP4 videos into one MP4.
    """
    clips = [VideoFileClip(vp) for vp in video_paths]
    final_clip = concatenate_videoclips(clips, method="compose")
    final_clip.write_videofile(output_path, codec="libx264", fps=clips[0].fps, audio=False)
    # Release the file handles held by the source clips
    for clip in clips:
        clip.close()
    return output_path


# NEW CODE HERE:
# Helper that returns both the generated frames and the snippet path
def generate_snippet(
    init_image: Image.Image,
    seed: int,
    motion_bucket_id: int,
    fps_id: int,
    decoding_t: int = 3,
    device: str = "cuda",
    output_folder: str = "outputs"
):
    """
    Generate a short snippet from `init_image` using the pipeline.
    Returns: (frames, video_path)
    """
    generator = torch.manual_seed(seed)
    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    # Generate frames
    result = pipe(
        init_image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=25
    )
    frames = result.frames[0]  # a list of PIL images

    # Save snippet
    export_to_video(frames, video_path, fps=fps_id)
    return frames, video_path


@spaces.GPU(duration=120)
def sample_long(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time; lower this if VRAM is an issue.
    device: str = "cuda",
    output_folder: str = "outputs",
    progress=gr.Progress(track_tqdm=True)
):
    """
    Generate 5 snippets in a row. Each new snippet starts from the last frame
    of the previous snippet. Return the path to the final, concatenated MP4.
""" if image.mode == "RGBA": image = image.convert("RGB") if randomize_seed: seed = random.randint(0, max_64_bit_int) torch.manual_seed(seed) snippet_paths = [] current_image = image for i in range(5): frames, snippet_path = generate_snippet( init_image=current_image, seed=seed, motion_bucket_id=motion_bucket_id, fps_id=fps_id, decoding_t=decoding_t, device=device, output_folder=output_folder ) snippet_paths.append(snippet_path) # Get the last frame for the next snippet last_frame = frames[-1] # PIL image current_image = last_frame # Optional: re-seed each time if you like randomness in every snippet # Otherwise, keep the same seed for a more cohesive “style” # If you want random seeds each snippet, uncomment: # seed = random.randint(0, max_64_bit_int) # Concatenate all snippets final_video_path = os.path.join(output_folder, "final_long_video.mp4") final_video_path = combine_videos(snippet_paths, output_path=final_video_path) return final_video_path, seed with gr.Blocks() as demo: gr.Markdown('''# Community demo for Stable Video Diffusion - Img2Vid - XT ([model](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt), [paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets), [stability's ui waitlist](https://stability.ai/contact)) #### Research release ([_non-commercial_](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE)): Generate a longer video by chaining together multiple short snippets. ''') with gr.Row(): with gr.Column(): image = gr.Image(label="Upload your image", type="pil") generate_btn = gr.Button("Generate Long Video (5 snippets)") video = gr.Video() with gr.Accordion("Advanced options", open=False): seed = gr.Slider( label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1 ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) motion_bucket_id = gr.Slider( label="Motion bucket id", info="Controls how much motion to add/remove from the image", value=127, minimum=1, maximum=255 ) fps_id = gr.Slider( label="Frames per second", info="The length of your video in seconds will be 25/fps", value=6, minimum=5, maximum=30 ) # Automatically resize on image upload image.upload(fn=resize_image, inputs=image, outputs=image, queue=False) # NEW: Generate a *long* video composed of 5 short snippets generate_btn.click( fn=sample_long, inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id], outputs=[video, seed], api_name="video" ) if __name__ == "__main__": demo.launch(share=True, show_api=False)