XiangpengYang committed · Commit 836b387 · 1 Parent(s): 3a89a07

huggingface app
.gitignore CHANGED
@@ -3,4 +3,5 @@ annotator/annotator_ckpts.tar.gz
  result/**
  trash/**
  data/**
- videograin_data.tar.gz
+ videograin_data.tar.gz
+ off_app.sh
__pycache__/example.cpython-310.pyc ADDED
Binary file (534 Bytes).

__pycache__/merge_config_gradio.cpython-310.pyc ADDED
Binary file (2.93 kB).

__pycache__/test.cpython-310.pyc ADDED
Binary file (10 kB).

app.py ADDED
@@ -0,0 +1,302 @@
+ #!/usr/bin/env python
+
+ from __future__ import annotations
+
+ import os
+ import shutil
+
+ import gradio as gr
+ import huggingface_hub
+
+ from webui.merge_config_gradio import merge_config_then_run
+
+
+ HF_TOKEN = os.getenv('HF_TOKEN')
+ pipe = merge_config_then_run()
+
+
+ ARTICLE = r"""
+ If VideoGrain is helpful, please help to ⭐ the <a href='https://github.com/knightyxp/VideoGrain' target='_blank'>Github Repo</a>. Thanks!
+ [![GitHub Stars](https://img.shields.io/github/stars/knightyxp/VideoGrain?style=social)](https://github.com/knightyxp/VideoGrain)
+ ---
+ 📝 **Citation**
+ If our work is useful for your research, please consider citing:
+ ```bibtex
+ @article{yang2025videograin,
+   title={VideoGrain: Modulating Space-Time Attention for Multi-grained Video Editing},
+   author={Yang, Xiangpeng and Zhu, Linchao and Fan, Hehe and Yang, Yi},
+   journal={ICLR},
+   year={2025}
+ }
+ ```
+ 📋 **License**
+ This project is licensed under <a rel="license" href="https://github.com/knightyxp/VideoGrain?tab=License-1-ov-file#readme">ReLER-Lab License 1.0</a>.
+ Redistribution and use for non-commercial purposes should follow this license.
+ 📧 **Contact**
+ If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
+ """
+
+
+ def update_layout_visibility(num):
+     """
+     Given the user's selection (string) in ["2", "3", "4", "5"],
+     return visibility updates for each of the 5 layout video inputs.
+     """
+     n = int(num)
+     # Show layout_file1 if n >= 1, layout_file2 if n >= 2, etc.
+     return [
+         gr.update(visible=(n >= 1)),
+         gr.update(visible=(n >= 2)),
+         gr.update(visible=(n >= 3)),
+         gr.update(visible=(n >= 4)),
+         gr.update(visible=(n >= 5))
+     ]
+
+
+ with gr.Blocks(css='style.css') as demo:
+     # gr.Markdown(TITLE)
+
+     gr.HTML(
+         """
+         <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+             <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
+                 VideoGrain: Modulating Space-Time Attention for Multi-Grained Video Editing
+             </h1>
+             <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                 <a href="https://github.com/knightyxp">Xiangpeng Yang</a>
+             </h2>
+             <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                 <!-- arXiv link -->
+                 <span class="link-block">
+                     [<a href="https://arxiv.org/abs/2502.17258" target="_blank" class="external-link">
+                         <span class="icon"><i class="ai ai-arxiv"></i></span>
+                         <span>arXiv</span>
+                     </a>]
+                 </span>
+                 <!-- GitHub link -->
+                 <span class="link-block">
+                     [<a href="https://github.com/knightyxp/VideoGrain" target="_blank" class="external-link">
+                         <span class="icon"><i class="fab fa-github"></i></span>
+                         <span>Code</span>
+                     </a>]
+                 </span>
+                 <!-- Homepage link -->
+                 <span class="link-block">
+                     [<a href="https://knightyxp.github.io/VideoGrain_project_page" target="_blank" class="external-link">
+                         <span class="icon"><i class="fab fa-github"></i></span>
+                         <span>Homepage</span>
+                     </a>]
+                 </span>
+                 <!-- YouTube link -->
+                 <span class="link-block">
+                     [<a href="https://www.youtube.com/watch?v=XEM4Pex7F9E" target="_blank" class="external-link">
+                         <span class="icon"><i class="fab fa-youtube"></i></span>
+                         <span>YouTube Video</span>
+                     </a>]
+                 </span>
+             </h2>
+             <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
+                 📕 TL;DR: VideoGrain is a zero-shot method for class-level, instance-level, and part-level video editing.
+             </h2>
+             <h2 style="font-weight: 450; font-size: 1rem;">
+                 Note that this page is a limited demo of VideoGrain. To run with more configurations, please check out our <a href="https://github.com/knightyxp/VideoGrain">GitHub page</a>.
+             </h2>
+         </div>
+         """)
+
+     gr.HTML("""
+         <p>We provide an <a href="https://github.com/knightyxp/VideoGrain?tab=readme-ov-file#editing-guidance-for-your-video">Editing Guidance</a> to help users choose hyperparameters when editing in-the-wild videos.</p>
+         <p>To remove the limitations or avoid the queue on your own hardware, you may <a href="https://huggingface.co/spaces/XiangpengYang/VideoGrain?duplicate=true" style="display: inline-block; vertical-align: middle;"><img style="margin-top: 0em; margin-bottom: 0em; display: inline-block;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>
+         """)
+
+     with gr.Row():
+         with gr.Column():
+             with gr.Accordion('Input Video', open=True):
+                 # user_input_video = gr.File(label='Input Source Video')
+                 user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
+
+             # Radio to choose how many layout videos to show
+             num_layouts = gr.Radio(
+                 choices=["2", "3", "4", "5"],
+                 label="Select Number of Editing Areas",
+                 value="2",  # default
+                 info="Please select the number of editing areas"
+             )
+
+             # Put all layout-video components in one Row to display them horizontally.
+             with gr.Row():
+                 layout_file1 = gr.Video(
+                     label="Layout Video 1",
+                     type="numpy",
+                     format="mp4",
+                     visible=True
+                 )
+                 layout_file2 = gr.Video(
+                     label="Layout Video 2",
+                     type="numpy",
+                     format="mp4",
+                     visible=True
+                 )
+                 layout_file3 = gr.Video(
+                     label="Layout Video 3",
+                     type="numpy",
+                     format="mp4",
+                     visible=False
+                 )
+                 layout_file4 = gr.Video(
+                     label="Layout Video 4",
+                     type="numpy",
+                     format="mp4",
+                     visible=False
+                 )
+                 layout_file5 = gr.Video(
+                     label="Layout Video 5",
+                     type="numpy",
+                     format="mp4",
+                     visible=False
+                 )
+
+             # Toggle visibility of the layout videos based on user selection
+             num_layouts.change(
+                 fn=update_layout_visibility,
+                 inputs=num_layouts,
+                 outputs=[
+                     layout_file1,
+                     layout_file2,
+                     layout_file3,
+                     layout_file4,
+                     layout_file5
+                 ]
+             )
+
+             prompt = gr.Textbox(
+                 label='Prompt',
+                 info='Edit the prompt and list each local prompt in the editing prompt; '
+                      'the local prompt order should match the layout mask order.'
+             )
+
+             model_id = gr.Dropdown(
+                 label='Model ID',
+                 choices=[
+                     'stable-diffusion-v1-5/stable-diffusion-v1-5',
+                     # add shape editing ckpt here
+                 ],
+                 value='stable-diffusion-v1-5/stable-diffusion-v1-5')
+
+             run_button = gr.Button('Generate')
+
+         with gr.Column():
+             result = gr.Video(label='Result')
+             # result.style(height=512, width=512)
+             with gr.Row():
+                 control_list = ['dwpose', 'depth_zoe', 'depth_midas']
+                 control_type = gr.Dropdown(
+                     choices=control_list,
+                     label='Control type',
+                     value='dwpose'
+                 )
+
+                 # Checkbox group for "dwpose" options; default: hand selected, face not selected.
+                 dwpose_options = gr.CheckboxGroup(
+                     choices=["hand", "face"],
+                     label="DW Pose Options",
+                     value=["hand"],
+                     visible=True  # Initially visible since default control_type is "dwpose"
+                 )
+
+                 # Update the visibility of dwpose_options based on the selected control type
+                 control_type.change(
+                     fn=lambda x: gr.update(visible=(x == "dwpose")),
+                     inputs=control_type,
+                     outputs=dwpose_options
+                 )
+
+             controlnet_conditioning_scale = gr.Slider(
+                 label='ControlNet conditioning scale',
+                 minimum=0.0,
+                 maximum=1.0,
+                 value=1.0,
+                 step=0.1)
+
+             with gr.Accordion('Editing config for VideoGrain', open=True):
+                 use_pnp = gr.Checkbox(
+                     label="Use PnP",
+                     value=False,
+                     info="Check to enable PnP functionality."
+                 )
+
+                 pnp_inject_steps = gr.Slider(
+                     label='pnp inject steps',
+                     info='PnP inject steps for temporal consistency',
+                     minimum=0,
+                     maximum=10,
+                     step=1,
+                     value=0)
+
+                 flatten_res = gr.CheckboxGroup(
+                     choices=["1", "2", "4", "8"],
+                     label="Flatten Resolution",
+                     value=["1"],
+                     info="Select one or more flatten resolution factors. Mapping: 1 -> 64, 2 -> 32 (64/2), 4 -> 16 (64/4), 8 -> 8 (64/8)."
+                 )
+
+     with gr.Row():
+         from example import style_example
+         examples = style_example
+
+         # gr.Examples(examples=examples,
+         #             inputs=[
+         #                 model_id,
+         #                 user_input_video,
+         #                 layout_files,
+         #                 prompt,
+         #                 model_id,
+         #                 control_type,
+         #                 dwpose_options,
+         #                 controlnet_conditioning_scale,
+         #                 use_pnp,
+         #                 pnp_inject_steps,
+         #                 flatten_res,
+         #             ],
+         #             outputs=result,
+         #             fn=pipe.run,
+         #             cache_examples=True,
+         #             # cache_examples=os.getenv('SYSTEM') == 'spaces'
+         #             )
+
+     gr.Markdown(ARTICLE)
+     inputs = [
+         model_id,
+         user_input_video,
+         num_layouts,
+         layout_file1,
+         layout_file2,
+         layout_file3,
+         layout_file4,
+         layout_file5,
+         prompt,
+         model_id,
+         control_type,
+         dwpose_options,
+         controlnet_conditioning_scale,
+         use_pnp,
+         pnp_inject_steps,
+         flatten_res,
+     ]
+     prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
+     run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
+
+ demo.queue().launch()
assets/teaser/run_two_man.mp4 ADDED
Binary file (149 kB).
 
config/demo_config.yaml CHANGED
@@ -1,4 +1,4 @@
- pretrained_model_path: "./ckpt/stable-diffusion-v1-5"
+ pretrained_model_path: "/home/xianyang/Data/code/FateZero/ckpt/stable-diffusion-v1-5"
  logdir: ./result/run_two_man/instance_level/3cls_spider_polar_vis_cross_attn
 
  dataset_config:
@@ -13,7 +13,7 @@ dataset_config:
 
  control_config:
    control_type: "dwpose"
-   pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose"
+   pretrained_controlnet_path: "/home/xianyang/Data/code/FateZero/ckpt/control_v11p_sd15_openpose"
    controlnet_conditioning_scale: 1.0
    hand: True
    face: False
example.py ADDED
@@ -0,0 +1,19 @@
+ num_steps = 15
+ style_example = [
+     [
+         'CompVis/stable-diffusion-v1-5',
+         'data/run_two_man/run_two_man.mp4',
+         'Man in red hoodie and man in gray shirt are jogging in forest',
+         'left man → Spiderman, right man → Polar Bear + Sunglasses, ground → grassy meadow, trees → cherry blossoms',
+         0.8,
+         0.8,
+         "instance+part",
+         10,
+         num_steps,
+         7.5,
+         # input video argument, then frame sampling and crop arguments
+         None, 0, 8, 1, 0, 0, 0, 0
+     ],
+ ]
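For reference, a minimal sketch of how this example row could be consumed: its 18 positional entries line up with the parameters of `merge_config_then_run.run` in `webui/merge_config_gradio.py` (added later in this commit). The unpacking below is an assumption for illustration, not code from the commit, and it requires the repository, the checkpoints at the configured paths, and `config/demo_config.yaml` to be in place.

```python
# Sketch (not part of the commit): run the bundled example through the pipeline,
# assuming style_example rows are meant to be unpacked positionally into
# merge_config_then_run.run() as defined in webui/merge_config_gradio.py.
from example import style_example
from webui.merge_config_gradio import merge_config_then_run

pipe = merge_config_then_run()  # loads tokenizer, text encoder, VAE, and UNet once

# Order assumed from run(): model_id, data_path, source_prompt, target_prompt,
# cross_replace_steps, self_replace_steps, enhance_words, enhance_words_value,
# num_steps, guidance_scale, then user_input_video and the sampling/crop arguments.
row = style_example[0]
mp4_path = pipe.run(*row)
print(mp4_path)
```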
video_diffusion/data/__pycache__/dataset.cpython-310.pyc CHANGED
Binary files a/video_diffusion/data/__pycache__/dataset.cpython-310.pyc and b/video_diffusion/data/__pycache__/dataset.cpython-310.pyc differ
 
video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc CHANGED
Binary files a/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc and b/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc differ
 
webui/__pycache__/merge_config_gradio.cpython-310.pyc ADDED
Binary file (2.68 kB).
 
webui/merge_config_gradio.py ADDED
@@ -0,0 +1,112 @@
+ from test import *
+
+ import copy
+ import gradio as gr
+
+
+ class merge_config_then_run():
+     def __init__(self) -> None:
+         # Load the tokenizer
+         pretrained_model_path = '/home/xianyang/Data/code/FateZero/ckpt/stable-diffusion-v1-5'
+         self.tokenizer = None
+         self.text_encoder = None
+         self.vae = None
+         self.unet = None
+
+         cache_ckpt = True
+         if cache_ckpt:
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 pretrained_model_path,
+                 # 'FateZero/ckpt/stable-diffusion-v1-4',
+                 subfolder="tokenizer",
+                 use_fast=False,
+             )
+
+             # Load models and create wrapper for stable diffusion
+             self.text_encoder = CLIPTextModel.from_pretrained(
+                 pretrained_model_path,
+                 subfolder="text_encoder",
+             )
+
+             self.vae = AutoencoderKL.from_pretrained(
+                 pretrained_model_path,
+                 subfolder="vae",
+             )
+
+             self.unet = UNetPseudo3DConditionModel.from_2d_model(
+                 os.path.join(pretrained_model_path, "unet"), model_config=model_config
+             )
+
+     def run(
+         self,
+         # def merge_config_then_run(
+         model_id,
+         data_path,
+         source_prompt,
+         target_prompt,
+         cross_replace_steps,
+         self_replace_steps,
+         enhance_words,
+         enhance_words_value,
+         num_steps,
+         guidance_scale,
+         user_input_video=None,
+
+         # Temporal and spatial crop of the video
+         start_sample_frame=0,
+         n_sample_frame=8,
+         stride=1,
+         left_crop=0,
+         right_crop=0,
+         top_crop=0,
+         bottom_crop=0,
+     ):
+         # , ] = inputs
+         default_edit_config = 'config/demo_config.yaml'
+         Omegadict_default_edit_config = OmegaConf.load(default_edit_config)
+
+         dataset_time_string = get_time_string()
+         config_now = copy.deepcopy(Omegadict_default_edit_config)
+         print(f"config_now['pretrained_model_path'] = model_id {model_id}")
+         # config_now['pretrained_model_path'] = model_id
+         config_now['dataset_config']['prompt'] = source_prompt
+         config_now['dataset_config']['path'] = data_path
+         # ImageSequenceDataset_dict = { }
+         offset_dict = {
+             "left": left_crop,
+             "right": right_crop,
+             "top": top_crop,
+             "bottom": bottom_crop,
+         }
+         ImageSequenceDataset_dict = {
+             "start_sample_frame": start_sample_frame,
+             "n_sample_frame": n_sample_frame,
+             "sampling_rate": stride,
+             "offset": offset_dict,
+         }
+         config_now['dataset_config'].update(ImageSequenceDataset_dict)
+         # Raise an error only when neither a provided data path nor an uploaded video is available.
+         if user_input_video is None and data_path is None:
+             raise gr.Error('You need to upload a video or choose a provided video')
+         if user_input_video is not None:
+             if isinstance(user_input_video, str):
+                 config_now['dataset_config']['path'] = user_input_video
+             elif hasattr(user_input_video, 'name') and user_input_video.name is not None:
+                 config_now['dataset_config']['path'] = user_input_video.name
+
+         # editing config
+         config_now['editing_config']['prompts'] = [target_prompt]
+         config_now['editing_config']['guidance_scale'] = guidance_scale
+         config_now['editing_config']['num_inference_steps'] = num_steps
+
+         logdir = default_edit_config.replace('config', 'result').replace('.yml', '').replace('.yaml', '') + f'_{dataset_time_string}'
+         config_now['logdir'] = logdir
+         print(f'Saving at {logdir}')
+         save_path = test(tokenizer=self.tokenizer,
+                          text_encoder=self.text_encoder,
+                          vae=self.vae,
+                          unet=self.unet,
+                          config=default_edit_config, **config_now)
+         mp4_path = save_path.replace('_0.gif', '_0_0_0.mp4')
+         return mp4_path