Commit · 836b387
Parent(s): 3a89a07
huggingface app
Files changed:
- .gitignore +2 -1
- __pycache__/example.cpython-310.pyc +0 -0
- __pycache__/merge_config_gradio.cpython-310.pyc +0 -0
- __pycache__/test.cpython-310.pyc +0 -0
- app.py +302 -0
- assets/teaser/run_two_man.mp4 +0 -0
- config/demo_config.yaml +2 -2
- example.py +19 -0
- video_diffusion/data/__pycache__/dataset.cpython-310.pyc +0 -0
- video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc +0 -0
- webui/__pycache__/merge_config_gradio.cpython-310.pyc +0 -0
- webui/merge_config_gradio.py +112 -0
.gitignore
CHANGED
@@ -3,4 +3,5 @@ annotator/annotator_ckpts.tar.gz
 result/**
 trash/**
 data/**
-videograin_data.tar.gz
+videograin_data.tar.gz
+off_app.sh
__pycache__/example.cpython-310.pyc
ADDED
Binary file (534 Bytes).

__pycache__/merge_config_gradio.cpython-310.pyc
ADDED
Binary file (2.93 kB).

__pycache__/test.cpython-310.pyc
ADDED
Binary file (10 kB).
app.py
ADDED
@@ -0,0 +1,302 @@
#!/usr/bin/env python

from __future__ import annotations

import os

import gradio as gr

from webui.merge_config_gradio import merge_config_then_run

import huggingface_hub
import shutil


HF_TOKEN = os.getenv('HF_TOKEN')
pipe = merge_config_then_run()


ARTICLE = r"""
If VideoGrain is helpful, please help to ⭐ the <a href='https://github.com/knightyxp/VideoGrain' target='_blank'>Github Repo</a>. Thanks!
[](https://github.com/knightyxp/VideoGrain)
---
📝 **Citation**
If our work is useful for your research, please consider citing:
```bibtex
@article{yang2025videograin,
  title={VideoGrain: Modulating Space-Time Attention for Multi-grained Video Editing},
  author={Yang, Xiangpeng and Zhu, Linchao and Fan, Hehe and Yang, Yi},
  journal={ICLR},
  year={2025}
}
```
📋 **License**
This project is licensed under <a rel="license" href="https://github.com/knightyxp/VideoGrain?tab=License-1-ov-file#readme">ReLER-Lab License 1.0</a>.
Redistribution and use for non-commercial purposes should follow this license.
📧 **Contact**
If you have any questions, please feel free to reach out to me at <b>[email protected]</b>.
"""


def update_layout_visibility(num):
    """
    Given the user's selection (string) in ["2", "3", "4", "5"],
    return visibility updates for each of the 5 layout video inputs.
    """
    n = int(num)
    # Show layout_file1 if n >= 1, layout_file2 if n >= 2, etc.
    return [
        gr.update(visible=(n >= 1)),
        gr.update(visible=(n >= 2)),
        gr.update(visible=(n >= 3)),
        gr.update(visible=(n >= 4)),
        gr.update(visible=(n >= 5))
    ]


with gr.Blocks(css='style.css') as demo:
    # gr.Markdown(TITLE)

    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
            VideoGrain: Modulating Space-Time Attention for Multi-Grained Video Editing
        </h1>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://github.com/knightyxp">Xiangpeng Yang</a>
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <span class="link-block">
                [<a href="https://arxiv.org/abs/2502.17258" target="_blank"
                    class="external-link ">
                    <span class="icon">
                        <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                </a>]
            </span>
            <!-- Github link -->
            <span class="link-block">
                [<a href="https://github.com/knightyxp/VideoGrain" target="_blank"
                    class="external-link ">
                    <span class="icon">
                        <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                </a>]
            </span>
            <!-- Homepage link -->
            <span class="link-block">
                [<a href="https://knightyxp.github.io/VideoGrain_project_page" target="_blank"
                    class="external-link ">
                    <span class="icon">
                        <i class="fab fa-github"></i>
                    </span>
                    <span>Homepage</span>
                </a>]
            </span>
            <!-- YouTube link -->
            <span class="link-block">
                [<a href="https://www.youtube.com/watch?v=XEM4Pex7F9E" target="_blank"
                    class="external-link ">
                    <span class="icon">
                        <i class="fab fa-youtube"></i>
                    </span>
                    <span>Youtube Video</span>
                </a>]
            </span>
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
            📕 TL;DR: VideoGrain is a zero-shot method for class-level, instance-level, and part-level video editing
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem;">
            Note that this page is a limited demo of VideoGrain. To run with more configurations, please check out our <a href="https://github.com/knightyxp/VideoGrain">GitHub page</a>.
        </h2>
        </div>
        """)

    gr.HTML("""
        <p>We provide an <a href="https://github.com/knightyxp/VideoGrain?tab=readme-ov-file#editing-guidance-for-your-video">Editing Guidance</a> to help users choose hyperparameters when editing in-the-wild videos.</p>
        <p>To remove the limitations or avoid the queue on your own hardware, you may <a href="https://huggingface.co/spaces/XiangpengYang/VideoGrain?duplicate=true" style="display: inline-block; vertical-align: middle;"><img style="margin-top: 0em; margin-bottom: 0em; display: inline-block;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>
        """)

    with gr.Row():
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                # user_input_video = gr.File(label='Input Source Video')
                user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")

            # Radio to choose how many layout videos to show
            num_layouts = gr.Radio(
                choices=["2", "3", "4", "5"],
                label="Select Number of Editing Areas",
                value="2",  # default
                info="Please select the number of editing areas"
            )

            # Put all layout-video components in one Row to display them horizontally.
            with gr.Row():
                layout_file1 = gr.Video(
                    label="Layout Video 1",
                    type="numpy",
                    format="mp4",
                    visible=True
                )
                layout_file2 = gr.Video(
                    label="Layout Video 2",
                    type="numpy",
                    format="mp4",
                    visible=True
                )
                layout_file3 = gr.Video(
                    label="Layout Video 3",
                    type="numpy",
                    format="mp4",
                    visible=False
                )
                layout_file4 = gr.Video(
                    label="Layout Video 4",
                    type="numpy",
                    format="mp4",
                    visible=False
                )
                layout_file5 = gr.Video(
                    label="Layout Video 5",
                    type="numpy",
                    format="mp4",
                    visible=False
                )

            # Toggle visibility of the layout videos based on user selection
            num_layouts.change(
                fn=update_layout_visibility,
                inputs=num_layouts,
                outputs=[
                    layout_file1,
                    layout_file2,
                    layout_file3,
                    layout_file4,
                    layout_file5
                ]
            )

            prompt = gr.Textbox(label='Prompt',
                                info='Change the prompt and extract each local prompt in the editing prompt. '
                                     'The local prompt order should be the same as the layout mask order.',
                                )

            model_id = gr.Dropdown(
                label='Model ID',
                choices=[
                    'stable-diffusion-v1-5/stable-diffusion-v1-5',
                    # add shape editing ckpt here
                ],
                value='stable-diffusion-v1-5/stable-diffusion-v1-5')

            run_button = gr.Button('Generate')

        with gr.Column():
            result = gr.Video(label='Result')
            # result.style(height=512, width=512)
            with gr.Row():
                control_list = ['dwpose', 'depth_zoe', 'depth_midas']
                control_type = gr.Dropdown(
                    choices=control_list,
                    label='Control type',
                    value='dwpose'
                )

                # Checkbox group for "dwpose" options; default: hand selected, face not selected.
                dwpose_options = gr.CheckboxGroup(
                    choices=["hand", "face"],
                    label="DW Pose Options",
                    value=["hand"],
                    visible=True  # Initially visible since default control_type is "dwpose"
                )

            # Update the visibility of the dwpose_options based on the selected control type
            control_type.change(
                fn=lambda x: gr.update(visible=(x == "dwpose")),
                inputs=control_type,
                outputs=dwpose_options
            )

            controlnet_conditioning_scale = gr.Slider(label='ControlNet conditioning scale',
                                                      minimum=0.0,
                                                      maximum=1.0,
                                                      value=1.0,
                                                      step=0.1)

            with gr.Accordion('Editing config for VideoGrain', open=True):
                use_pnp = gr.Checkbox(
                    label="Use PnP",
                    value=False,
                    info="Check to enable PnP functionality."
                )

                pnp_inject_steps = gr.Slider(label='pnp inject steps',
                                             info='PnP inject steps for temporal consistency',
                                             minimum=0,
                                             maximum=10,
                                             step=1,
                                             value=0)

                flatten_res = gr.CheckboxGroup(
                    choices=["1", "2", "4", "8"],
                    label="Flatten Resolution",
                    value=["1"],
                    info="Select one or more flatten resolution factors. Mapping: 1 -> 64, 2 -> 32 (64/2), 4 -> 16 (64/4), 8 -> 8 (64/8)."
                )

    with gr.Row():
        from example import style_example
        examples = style_example

        # gr.Examples(examples=examples,
        #             inputs=[
        #                 model_id,
        #                 user_input_video,
        #                 layout_files,
        #                 prompt,
        #                 model_id,
        #                 control_type,
        #                 dwpose_options,
        #                 controlnet_conditioning_scale,
        #                 use_pnp,
        #                 pnp_inject_steps,
        #                 flatten_res,
        #             ],
        #             outputs=result,
        #             fn=pipe.run,
        #             cache_examples=True,
        #             # cache_examples=os.getenv('SYSTEM') == 'spaces'
        #             )

    gr.Markdown(ARTICLE)
    inputs = [
        model_id,
        user_input_video,
        num_layouts,
        layout_file1,
        layout_file2,
        layout_file3,
        layout_file4,
        layout_file5,
        prompt,
        model_id,
        control_type,
        dwpose_options,
        controlnet_conditioning_scale,
        use_pnp,
        pnp_inject_steps,
        flatten_res,
    ]
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch()
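
The `num_layouts.change` and `control_type.change` handlers above use a standard Gradio pattern: the event callback returns one `gr.update(visible=...)` per output component. Below is a minimal, self-contained sketch of that same pattern; the component names and counts here are illustrative, not taken from app.py.

import gradio as gr

def toggle(n: str):
    # Return one update per output component; only the first int(n) boxes stay visible.
    k = int(n)
    return [gr.update(visible=(i < k)) for i in range(3)]

with gr.Blocks() as mini_demo:
    count = gr.Radio(choices=["1", "2", "3"], value="1", label="Visible boxes")
    boxes = [gr.Textbox(label=f"Box {i + 1}", visible=(i == 0)) for i in range(3)]
    count.change(fn=toggle, inputs=count, outputs=boxes)

if __name__ == "__main__":
    mini_demo.launch()
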
assets/teaser/run_two_man.mp4
ADDED
Binary file (149 kB).
config/demo_config.yaml
CHANGED
@@ -1,4 +1,4 @@
-pretrained_model_path: "
+pretrained_model_path: "/home/xianyang/Data/code/FateZero/ckpt/stable-diffusion-v1-5"
 logdir: ./result/run_two_man/instance_level/3cls_spider_polar_vis_cross_attn
 
 dataset_config:
@@ -13,7 +13,7 @@ dataset_config:
 
 control_config:
   control_type: "dwpose"
-  pretrained_controlnet_path: "
+  pretrained_controlnet_path: "/home/xianyang/Data/code/FateZero/ckpt/control_v11p_sd15_openpose"
   controlnet_conditioning_scale: 1.0
   hand: True
   face: False
example.py
ADDED
@@ -0,0 +1,19 @@
num_steps = 15
style_example = [
    [
        'CompVis/stable-diffusion-v1-5',
        'data/run_two_man/run_two_man.mp4',
        'Man in red hoodie and man in gray shirt are jogging in forest',
        'left man → Spiderman, right man → Polar Bear + Sunglasses, ground → grassy meadow, trees → cherry blossoms',
        0.8,
        0.8,
        "instance+part",
        10,
        num_steps,
        7.5,
        # input video arguments
        None, 0, 8, 1, 0, 0, 0, 0
    ],
]
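
`style_example` holds example rows in the shape Gradio's `gr.Examples` expects: one inner list per example, one value per input component, in order; this matches the `gr.Examples` block that is currently commented out in app.py. The sketch below shows the general wiring using only the first three columns; the placeholder components and echo function are assumptions for illustration, not part of the commit.

import gradio as gr
from example import style_example

def echo_row(*values):
    # Placeholder for pipe.run: just report what an example row would pass in.
    return f"received {len(values)} values: {values}"

with gr.Blocks() as examples_demo:
    model_id = gr.Textbox(label="Model ID")
    video_path = gr.Textbox(label="Video path")
    source_prompt = gr.Textbox(label="Source prompt")
    out = gr.Textbox(label="Output")
    gr.Examples(
        examples=[row[:3] for row in style_example],  # first three columns only
        inputs=[model_id, video_path, source_prompt],
        outputs=out,
        fn=echo_row,
        cache_examples=False,
    )

if __name__ == "__main__":
    examples_demo.launch()
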
video_diffusion/data/__pycache__/dataset.cpython-310.pyc
CHANGED
Binary files a/video_diffusion/data/__pycache__/dataset.cpython-310.pyc and b/video_diffusion/data/__pycache__/dataset.cpython-310.pyc differ

video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc
CHANGED
Binary files a/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc and b/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc differ

webui/__pycache__/merge_config_gradio.cpython-310.pyc
ADDED
Binary file (2.68 kB).
webui/merge_config_gradio.py
ADDED
@@ -0,0 +1,112 @@
from test import *

import copy
import gradio as gr


class merge_config_then_run():
    def __init__(self) -> None:
        # Load the tokenizer
        pretrained_model_path = '/home/xianyang/Data/code/FateZero/ckpt/stable-diffusion-v1-5'
        self.tokenizer = None
        self.text_encoder = None
        self.vae = None
        self.unet = None

        cache_ckpt = True
        if cache_ckpt:
            self.tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_path,
                # 'FateZero/ckpt/stable-diffusion-v1-4',
                subfolder="tokenizer",
                use_fast=False,
            )

            # Load models and create wrapper for stable diffusion
            self.text_encoder = CLIPTextModel.from_pretrained(
                pretrained_model_path,
                subfolder="text_encoder",
            )

            self.vae = AutoencoderKL.from_pretrained(
                pretrained_model_path,
                subfolder="vae",
            )

            self.unet = UNetPseudo3DConditionModel.from_2d_model(
                os.path.join(pretrained_model_path, "unet"), model_config=model_config
            )

    def run(
        self,
        # def merge_config_then_run(
        model_id,
        data_path,
        source_prompt,
        target_prompt,
        cross_replace_steps,
        self_replace_steps,
        enhance_words,
        enhance_words_value,
        num_steps,
        guidance_scale,
        user_input_video=None,

        # Temporal and spatial crop of the video
        start_sample_frame=0,
        n_sample_frame=8,
        stride=1,
        left_crop=0,
        right_crop=0,
        top_crop=0,
        bottom_crop=0,
    ):
        # , ] = inputs
        default_edit_config = 'config/demo_config.yaml'
        Omegadict_default_edit_config = OmegaConf.load(default_edit_config)

        dataset_time_string = get_time_string()
        config_now = copy.deepcopy(Omegadict_default_edit_config)
        print(f"config_now['pretrained_model_path'] = model_id {model_id}")
        # config_now['pretrained_model_path'] = model_id
        config_now['dataset_config']['prompt'] = source_prompt
        config_now['dataset_config']['path'] = data_path
        # ImageSequenceDataset_dict = { }
        offset_dict = {
            "left": left_crop,
            "right": right_crop,
            "top": top_crop,
            "bottom": bottom_crop,
        }
        ImageSequenceDataset_dict = {
            "start_sample_frame": start_sample_frame,
            "n_sample_frame": n_sample_frame,
            "sampling_rate": stride,
            "offset": offset_dict,
        }
        config_now['dataset_config'].update(ImageSequenceDataset_dict)
        if user_input_video and data_path is None:
            raise gr.Error('You need to upload a video or choose a provided video')
        if user_input_video is not None:
            if isinstance(user_input_video, str):
                config_now['dataset_config']['path'] = user_input_video
            elif hasattr(user_input_video, 'name') and user_input_video.name is not None:
                config_now['dataset_config']['path'] = user_input_video.name

        # editing config
        config_now['editing_config']['prompts'] = [target_prompt]

        config_now['editing_config']['guidance_scale'] = guidance_scale
        config_now['editing_config']['num_inference_steps'] = num_steps

        logdir = default_edit_config.replace('config', 'result').replace('.yml', '').replace('.yaml', '') + f'_{dataset_time_string}'
        config_now['logdir'] = logdir
        print(f'Saving at {logdir}')
        save_path = test(tokenizer=self.tokenizer,
                         text_encoder=self.text_encoder,
                         vae=self.vae,
                         unet=self.unet,
                         config=default_edit_config, **config_now)
        mp4_path = save_path.replace('_0.gif', '_0_0_0.mp4')
        return mp4_path
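
For reference, `merge_config_then_run.run()` can also be driven without the Gradio UI. The call below is a hypothetical headless invocation: the argument values mirror the example row in example.py, the pairing of row positions with parameter names is an inference from the two signatures rather than something stated in the commit, and it assumes the Stable Diffusion and ControlNet checkpoints referenced in config/demo_config.yaml actually exist on the machine.

# Hypothetical headless use of the wrapper above; all values are placeholders
# taken from example.py and may need to be adapted to your local setup.
from webui.merge_config_gradio import merge_config_then_run

pipe = merge_config_then_run()
mp4_path = pipe.run(
    model_id="CompVis/stable-diffusion-v1-5",
    data_path="data/run_two_man/run_two_man.mp4",
    source_prompt="Man in red hoodie and man in gray shirt are jogging in forest",
    target_prompt="left man → Spiderman, right man → Polar Bear + Sunglasses, "
                  "ground → grassy meadow, trees → cherry blossoms",
    cross_replace_steps=0.8,
    self_replace_steps=0.8,
    enhance_words="instance+part",
    enhance_words_value=10,
    num_steps=15,
    guidance_scale=7.5,
    user_input_video=None,
    start_sample_frame=0,
    n_sample_frame=8,
    stride=1,
)
print("Edited video written to", mp4_path)
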