XiangpengYang committed · Commit 836b387 · 1 Parent(s): 3a89a07

huggingface app
.gitignore CHANGED
@@ -3,4 +3,5 @@ annotator/annotator_ckpts.tar.gz
  result/**
  trash/**
  data/**
- videograin_data.tar.gz
+ videograin_data.tar.gz
+ off_app.sh
__pycache__/example.cpython-310.pyc ADDED
Binary file (534 Bytes).

__pycache__/merge_config_gradio.cpython-310.pyc ADDED
Binary file (2.93 kB).

__pycache__/test.cpython-310.pyc ADDED
Binary file (10 kB).

app.py ADDED
@@ -0,0 +1,302 @@
+ #!/usr/bin/env python
+
+ from __future__ import annotations
+
+ import os
+ import shutil
+
+ import gradio as gr
+ import huggingface_hub
+
+ from webui.merge_config_gradio import merge_config_then_run
+
+
+ HF_TOKEN = os.getenv('HF_TOKEN')
+ pipe = merge_config_then_run()
+
+
+ ARTICLE = r"""
+ If VideoGrain is helpful, please help to ⭐ the <a href='https://github.com/knightyxp/VideoGrain' target='_blank'>Github Repo</a>. Thanks!
+ [![GitHub Stars](https://img.shields.io/github/stars/knightyxp/VideoGrain?style=social)](https://github.com/knightyxp/VideoGrain)
+ ---
+ 📝 **Citation**
+ If our work is useful for your research, please consider citing:
+ ```bibtex
+ @article{yang2025videograin,
+   title={VideoGrain: Modulating Space-Time Attention for Multi-grained Video Editing},
+   author={Yang, Xiangpeng and Zhu, Linchao and Fan, Hehe and Yang, Yi},
+   journal={ICLR},
+   year={2025}
+ }
+ ```
+ 📋 **License**
+ This project is licensed under <a rel="license" href="https://github.com/knightyxp/VideoGrain?tab=License-1-ov-file#readme">ReLER-Lab License 1.0</a>.
+ Redistribution and use for non-commercial purposes should follow this license.
+ 📧 **Contact**
+ If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
+ """
+
+
+ def update_layout_visibility(num):
+     """
+     Given the user's selection (string) in ["2", "3", "4", "5"],
+     return visibility updates for each of the 5 layout video inputs.
+     """
+     n = int(num)
+     # Show layout_file1 if n >= 1, layout_file2 if n >= 2, etc.
+     return [
+         gr.update(visible=(n >= 1)),
+         gr.update(visible=(n >= 2)),
+         gr.update(visible=(n >= 3)),
+         gr.update(visible=(n >= 4)),
+         gr.update(visible=(n >= 5))
+     ]
+
+
+ with gr.Blocks(css='style.css') as demo:
+     # gr.Markdown(TITLE)
+
+     gr.HTML(
+         """
+         <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
+             <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
+                 VideoGrain: Modulating Space-Time Attention for Multi-Grained Video Editing
+             </h1>
+             <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                 <a href="https://github.com/knightyxp">Xiangpeng Yang</a>
+             </h2>
+             <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
+                 <!-- arXiv link -->
+                 <span class="link-block">
+                     [<a href="https://arxiv.org/abs/2502.17258" target="_blank" class="external-link">
+                         <span class="icon"><i class="ai ai-arxiv"></i></span>
+                         <span>arXiv</span>
+                     </a>]
+                 </span>
+                 <!-- GitHub link -->
+                 <span class="link-block">
+                     [<a href="https://github.com/knightyxp/VideoGrain" target="_blank" class="external-link">
+                         <span class="icon"><i class="fab fa-github"></i></span>
+                         <span>Code</span>
+                     </a>]
+                 </span>
+                 <!-- Homepage link -->
+                 <span class="link-block">
+                     [<a href="https://knightyxp.github.io/VideoGrain_project_page" target="_blank" class="external-link">
+                         <span class="icon"><i class="fab fa-github"></i></span>
+                         <span>Homepage</span>
+                     </a>]
+                 </span>
+                 <!-- YouTube link -->
+                 <span class="link-block">
+                     [<a href="https://www.youtube.com/watch?v=XEM4Pex7F9E" target="_blank" class="external-link">
+                         <span class="icon"><i class="fab fa-youtube"></i></span>
+                         <span>YouTube Video</span>
+                     </a>]
+                 </span>
+             </h2>
+             <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
+                 📕 TL;DR: VideoGrain is a zero-shot method for class-level, instance-level, and part-level video editing.
+             </h2>
+             <h2 style="font-weight: 450; font-size: 1rem;">
+                 Note that this page is a limited demo of VideoGrain. To run with more configurations, please check out our <a href="https://github.com/knightyxp/VideoGrain">GitHub page</a>.
+             </h2>
+         </div>
+         """)
+
+     gr.HTML("""
+         <p>We provide an <a href="https://github.com/knightyxp/VideoGrain?tab=readme-ov-file#editing-guidance-for-your-video">Editing Guidance</a> to help users choose hyperparameters when editing in-the-wild videos.</p>
+         <p>To remove the limitations or avoid the queue on your own hardware, you may <a href="https://huggingface.co/spaces/XiangpengYang/VideoGrain?duplicate=true" style="display: inline-block; vertical-align: middle;"><img style="margin-top: 0em; margin-bottom: 0em; display: inline-block;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>
+         """)
+
+     with gr.Row():
+         with gr.Column():
+             with gr.Accordion('Input Video', open=True):
+                 # user_input_video = gr.File(label='Input Source Video')
+                 user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
+
+             # Radio to choose how many layout videos to show
+             num_layouts = gr.Radio(
+                 choices=["2", "3", "4", "5"],
+                 label="Select Number of Editing Areas",
+                 value="2",  # default
+                 info="Please select the number of editing areas"
+             )
+
+             # Put all layout-video components in one Row to display them horizontally.
+             with gr.Row():
+                 layout_file1 = gr.Video(
+                     label="Layout Video 1",
+                     type="numpy",
+                     format="mp4",
+                     visible=True
+                 )
+                 layout_file2 = gr.Video(
+                     label="Layout Video 2",
+                     type="numpy",
+                     format="mp4",
+                     visible=True
+                 )
+                 layout_file3 = gr.Video(
+                     label="Layout Video 3",
+                     type="numpy",
+                     format="mp4",
+                     visible=False
+                 )
+                 layout_file4 = gr.Video(
+                     label="Layout Video 4",
+                     type="numpy",
+                     format="mp4",
+                     visible=False
+                 )
+                 layout_file5 = gr.Video(
+                     label="Layout Video 5",
+                     type="numpy",
+                     format="mp4",
+                     visible=False
+                 )
+
+             # Toggle visibility of the layout videos based on user selection
+             num_layouts.change(
+                 fn=update_layout_visibility,
+                 inputs=num_layouts,
+                 outputs=[
+                     layout_file1,
+                     layout_file2,
+                     layout_file3,
+                     layout_file4,
+                     layout_file5
+                 ]
+             )
+
+             prompt = gr.Textbox(
+                 label='Prompt',
+                 info='Edit the prompt and list each local prompt in the editing prompt; '
+                      'the local prompt order should match the layout mask order.'
+             )
+
+             model_id = gr.Dropdown(
+                 label='Model ID',
+                 choices=[
+                     'stable-diffusion-v1-5/stable-diffusion-v1-5',
+                     # add shape editing ckpt here
+                 ],
+                 value='stable-diffusion-v1-5/stable-diffusion-v1-5')
+
+             run_button = gr.Button('Generate')
+
+         with gr.Column():
+             result = gr.Video(label='Result')
+             # result.style(height=512, width=512)
+             with gr.Row():
+                 control_list = ['dwpose', 'depth_zoe', 'depth_midas']
+                 control_type = gr.Dropdown(
+                     choices=control_list,
+                     label='Control type',
+                     value='dwpose'
+                 )
+
+                 # Checkbox group for "dwpose" options; default: hand selected, face not selected.
+                 dwpose_options = gr.CheckboxGroup(
+                     choices=["hand", "face"],
+                     label="DW Pose Options",
+                     value=["hand"],
+                     visible=True  # Initially visible since default control_type is "dwpose"
+                 )
+
+                 # Update the visibility of dwpose_options based on the selected control type
+                 control_type.change(
+                     fn=lambda x: gr.update(visible=(x == "dwpose")),
+                     inputs=control_type,
+                     outputs=dwpose_options
+                 )
+
+             controlnet_conditioning_scale = gr.Slider(
+                 label='ControlNet conditioning scale',
+                 minimum=0.0,
+                 maximum=1.0,
+                 value=1.0,
+                 step=0.1)
+
+             with gr.Accordion('Editing config for VideoGrain', open=True):
+                 use_pnp = gr.Checkbox(
+                     label="Use PnP",
+                     value=False,
+                     info="Check to enable PnP functionality."
+                 )
+
+                 pnp_inject_steps = gr.Slider(
+                     label='pnp inject steps',
+                     info='PnP inject steps for temporal consistency',
+                     minimum=0,
+                     maximum=10,
+                     step=1,
+                     value=0)
+
+                 flatten_res = gr.CheckboxGroup(
+                     choices=["1", "2", "4", "8"],
+                     label="Flatten Resolution",
+                     value=["1"],
+                     info="Select one or more flatten resolution factors. Mapping: 1 -> 64, 2 -> 32 (64/2), 4 -> 16 (64/4), 8 -> 8 (64/8)."
+                 )
+
+     with gr.Row():
+         from example import style_example
+         examples = style_example
+
+         # gr.Examples(examples=examples,
+         #             inputs=[
+         #                 model_id,
+         #                 user_input_video,
+         #                 layout_files,
+         #                 prompt,
+         #                 model_id,
+         #                 control_type,
+         #                 dwpose_options,
+         #                 controlnet_conditioning_scale,
+         #                 use_pnp,
+         #                 pnp_inject_steps,
+         #                 flatten_res,
+         #             ],
+         #             outputs=result,
+         #             fn=pipe.run,
+         #             cache_examples=True,
+         #             # cache_examples=os.getenv('SYSTEM') == 'spaces'
+         #             )
+
+     gr.Markdown(ARTICLE)
+     inputs = [
+         model_id,
+         user_input_video,
+         num_layouts,
+         layout_file1,
+         layout_file2,
+         layout_file3,
+         layout_file4,
+         layout_file5,
+         prompt,
+         model_id,
+         control_type,
+         dwpose_options,
+         controlnet_conditioning_scale,
+         use_pnp,
+         pnp_inject_steps,
+         flatten_res,
+     ]
+     prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
+     run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
+
+ demo.queue().launch()
assets/teaser/run_two_man.mp4 ADDED
Binary file (149 kB).
 
config/demo_config.yaml CHANGED
@@ -1,4 +1,4 @@
- pretrained_model_path: "./ckpt/stable-diffusion-v1-5"
+ pretrained_model_path: "/home/xianyang/Data/code/FateZero/ckpt/stable-diffusion-v1-5"
  logdir: ./result/run_two_man/instance_level/3cls_spider_polar_vis_cross_attn
 
  dataset_config:
@@ -13,7 +13,7 @@ dataset_config:
 
  control_config:
    control_type: "dwpose"
-   pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose"
+   pretrained_controlnet_path: "/home/xianyang/Data/code/FateZero/ckpt/control_v11p_sd15_openpose"
    controlnet_conditioning_scale: 1.0
    hand: True
    face: False
example.py ADDED
@@ -0,0 +1,19 @@
+ num_steps = 15
+ style_example = [
+     [
+         'CompVis/stable-diffusion-v1-5',
+         'data/run_two_man/run_two_man.mp4',
+         'Man in red hoodie and man in gray shirt are jogging in forest',
+         'left man → Spiderman, right man → Polar Bear + Sunglasses, ground → grassy meadow, trees → cherry blossoms',
+         0.8,
+         0.8,
+         "instance+part",
+         10,
+         num_steps,
+         7.5,
+         # input video argument, then frame sampling and crop arguments
+         None, 0, 8, 1, 0, 0, 0, 0
+     ],
+ ]
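For reference, a minimal sketch of how this example row could be consumed: its 18 positional entries line up with the parameters of `merge_config_then_run.run` in `webui/merge_config_gradio.py` (added later in this commit). The unpacking below is an assumption for illustration, not code from the commit, and it requires the repository, the checkpoints at the configured paths, and `config/demo_config.yaml` to be in place.

```python
# Sketch (not part of the commit): run the bundled example through the pipeline,
# assuming style_example rows are meant to be unpacked positionally into
# merge_config_then_run.run() as defined in webui/merge_config_gradio.py.
from example import style_example
from webui.merge_config_gradio import merge_config_then_run

pipe = merge_config_then_run()  # loads tokenizer, text encoder, VAE, and UNet once

# Order assumed from run(): model_id, data_path, source_prompt, target_prompt,
# cross_replace_steps, self_replace_steps, enhance_words, enhance_words_value,
# num_steps, guidance_scale, then user_input_video and the sampling/crop arguments.
row = style_example[0]
mp4_path = pipe.run(*row)
print(mp4_path)
```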
video_diffusion/data/__pycache__/dataset.cpython-310.pyc CHANGED
Binary files a/video_diffusion/data/__pycache__/dataset.cpython-310.pyc and b/video_diffusion/data/__pycache__/dataset.cpython-310.pyc differ
 
video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc CHANGED
Binary files a/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc and b/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc differ
 
webui/__pycache__/merge_config_gradio.cpython-310.pyc ADDED
Binary file (2.68 kB).
 
webui/merge_config_gradio.py ADDED
@@ -0,0 +1,112 @@
+ from test import *
+
+ import copy
+ import gradio as gr
+
+
+ class merge_config_then_run():
+     def __init__(self) -> None:
+         # Load the tokenizer
+         pretrained_model_path = '/home/xianyang/Data/code/FateZero/ckpt/stable-diffusion-v1-5'
+         self.tokenizer = None
+         self.text_encoder = None
+         self.vae = None
+         self.unet = None
+
+         cache_ckpt = True
+         if cache_ckpt:
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 pretrained_model_path,
+                 # 'FateZero/ckpt/stable-diffusion-v1-4',
+                 subfolder="tokenizer",
+                 use_fast=False,
+             )
+
+             # Load models and create wrapper for stable diffusion
+             self.text_encoder = CLIPTextModel.from_pretrained(
+                 pretrained_model_path,
+                 subfolder="text_encoder",
+             )
+
+             self.vae = AutoencoderKL.from_pretrained(
+                 pretrained_model_path,
+                 subfolder="vae",
+             )
+
+             self.unet = UNetPseudo3DConditionModel.from_2d_model(
+                 os.path.join(pretrained_model_path, "unet"), model_config=model_config
+             )
+
+     def run(
+         self,
+         # def merge_config_then_run(
+         model_id,
+         data_path,
+         source_prompt,
+         target_prompt,
+         cross_replace_steps,
+         self_replace_steps,
+         enhance_words,
+         enhance_words_value,
+         num_steps,
+         guidance_scale,
+         user_input_video=None,
+
+         # Temporal and spatial crop of the video
+         start_sample_frame=0,
+         n_sample_frame=8,
+         stride=1,
+         left_crop=0,
+         right_crop=0,
+         top_crop=0,
+         bottom_crop=0,
+     ):
+         # , ] = inputs
+         default_edit_config = 'config/demo_config.yaml'
+         Omegadict_default_edit_config = OmegaConf.load(default_edit_config)
+
+         dataset_time_string = get_time_string()
+         config_now = copy.deepcopy(Omegadict_default_edit_config)
+         print(f"config_now['pretrained_model_path'] = model_id {model_id}")
+         # config_now['pretrained_model_path'] = model_id
+         config_now['dataset_config']['prompt'] = source_prompt
+         config_now['dataset_config']['path'] = data_path
+         # ImageSequenceDataset_dict = { }
+         offset_dict = {
+             "left": left_crop,
+             "right": right_crop,
+             "top": top_crop,
+             "bottom": bottom_crop,
+         }
+         ImageSequenceDataset_dict = {
+             "start_sample_frame": start_sample_frame,
+             "n_sample_frame": n_sample_frame,
+             "sampling_rate": stride,
+             "offset": offset_dict,
+         }
+         config_now['dataset_config'].update(ImageSequenceDataset_dict)
+         # Raise an error only when neither a provided data path nor an uploaded video is available.
+         if user_input_video is None and data_path is None:
+             raise gr.Error('You need to upload a video or choose a provided video')
+         if user_input_video is not None:
+             if isinstance(user_input_video, str):
+                 config_now['dataset_config']['path'] = user_input_video
+             elif hasattr(user_input_video, 'name') and user_input_video.name is not None:
+                 config_now['dataset_config']['path'] = user_input_video.name
+
+         # editing config
+         config_now['editing_config']['prompts'] = [target_prompt]
+         config_now['editing_config']['guidance_scale'] = guidance_scale
+         config_now['editing_config']['num_inference_steps'] = num_steps
+
+         logdir = default_edit_config.replace('config', 'result').replace('.yml', '').replace('.yaml', '') + f'_{dataset_time_string}'
+         config_now['logdir'] = logdir
+         print(f'Saving at {logdir}')
+         save_path = test(tokenizer=self.tokenizer,
+                          text_encoder=self.text_encoder,
+                          vae=self.vae,
+                          unet=self.unet,
+                          config=default_edit_config, **config_now)
+         mp4_path = save_path.replace('_0.gif', '_0_0_0.mp4')
+         return mp4_path