Muhammad Taqi Raza committed
Commit · ee6a765
1 Parent(s): 7d2ae5b

upscale, refine, upscale_factor

Browse files
- gradio_app.py +15 -8
- inference/cli_demo_camera_i2v_pcd.py +20 -6
- inference/utils.py +1 -1
- inference/v2v_data/demo.py +1 -0
- inference/v2v_data/models/infer.py +5 -1
gradio_app.py CHANGED

@@ -74,7 +74,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
 "--fps", str(fps),
 "--depth_inference_steps", str(depth_inference_steps),
 "--depth_guidance_scale", str(depth_guidance_scale),
-
+"--near_far_estimated", str(near_far_estimated),
 "--sampler_name", sampler_name,
 "--diffusion_guidance_scale", str(diffusion_guidance_scale),
 "--diffusion_inference_steps", str(diffusion_inference_steps),
@@ -105,7 +105,7 @@ def inference(
 fps, num_frames, controlnet_weights, controlnet_guidance_start,
 controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
 seed, height, width, downscale_coef, vae_channels,
-controlnet_input_channels, controlnet_transformer_num_layers
+controlnet_input_channels, controlnet_transformer_num_layers, upscale, upscale_factor, refine
 ):
 MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
 ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
@@ -132,7 +132,10 @@ def inference(
 "--downscale_coef", str(downscale_coef),
 "--vae_channels", str(vae_channels),
 "--controlnet_input_channels", str(controlnet_input_channels),
-"--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers)
+"--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
+"--upscale", str(upscale),
+"--upscale_factor", str(upscale_factor),
+"--refine", str(refine),
 ]
 try:
 result = subprocess.run(command, capture_output=True, text=True, check=True)
@@ -164,7 +167,7 @@ with demo:
 fps_input = gr.Number(value=24, label="FPS")
 num_frames_input = gr.Number(value=49, label="Number of Frames")
 radius_input = gr.Number(value = 1.0, label="Radius Scale")
-mode_input = gr.Dropdown(choices=["gradual"
+mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
 sampler_input = gr.Dropdown(choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], value="DDIM_Origin", label="Sampler")
 diff_guidance_input = gr.Number(value=6.0, label="Diffusion Guidance")
 diff_steps_input = gr.Number(value=50, label="Diffusion Steps")
@@ -175,8 +178,8 @@ with demo:
 maxres_input = gr.Number(value=1920, label="Max Resolution")
 sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
 seed_input = gr.Number(value=43, label="Seed")
-height = gr.Number(value=
-width = gr.Number(value=
+height = gr.Number(value=480, label="Height")
+width = gr.Number(value=720, label="Width")
 prompt_input = gr.Textbox(label="Prompt")
 neg_prompt_input = gr.Textbox(label="Negative Prompt", value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.")
 refine_prompt_input = gr.Textbox(label="Refine Prompt", value=" The video is of high quality, and the view is very clear. ")
@@ -190,6 +193,10 @@ with demo:
 with gr.Row():
 with gr.Column():
 with gr.Row():
+upscale = gr.Checkbox(label="Upscale", value=True)
+upscale_factor = gr.Number(label="Upscale factor", value=4)
+refine = gr.Checkbox(label="refine", value=True)
+
 controlnet_weights_input = gr.Number(value=0.5, label="ControlNet Weights")
 controlnet_guidance_start_input = gr.Number(value=0.0, label="Guidance Start")
 controlnet_guidance_end_input = gr.Number(value=0.5, label="Guidance End")
@@ -200,7 +207,7 @@ with demo:
 height_input = gr.Number(value=480, label="Height")
 width_input = gr.Number(value=720, label="Width")
 num_frames_input2 = gr.Number(value=49, label="Num Frames")
-fps_input2 = gr.Number(value=
+fps_input2 = gr.Number(value=24, label="FPS")
 downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
 vae_channels_input = gr.Number(value=16, label="VAE Channels")
 controlnet_input_channels_input = gr.Number(value=6, label="ControlNet Input Channels")
@@ -232,7 +239,7 @@ with demo:
 inference_steps_input, dtype_input, seed_input2,
 height_input, width_input, downscale_coef_input,
 vae_channels_input, controlnet_input_channels_input,
-controlnet_layers_input
+controlnet_layers_input, upscale, upscale_factor, refine
 ],
 outputs=[step2_video, step2_logs]
 )
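
The Gradio UI above adds upscale, upscale_factor, and refine controls and forwards them to the CLI script as "--upscale", str(upscale)-style arguments. Since the CLI (next file) declares --upscale and --refine as argparse store_true switches, which take no value, a common pattern is to append such flags only when the checkbox is enabled. Below is a minimal sketch of that pattern; build_demo_command is a hypothetical helper, not the repository's command-building code.

# Hedged sketch: turning Gradio checkbox/number values into a CLI command list.
# build_demo_command is a hypothetical helper for illustration only.
def build_demo_command(upscale: bool, upscale_factor: int, refine: bool) -> list:
    command = [
        "python", "inference/cli_demo_camera_i2v_pcd.py",
        "--upscale_factor", str(upscale_factor),  # value-taking option: always pass its value
    ]
    # store_true flags take no value, so append them only when enabled.
    if upscale:
        command.append("--upscale")
    if refine:
        command.append("--refine")
    return command

# Example with values matching the UI defaults above (Upscale=True, factor=4, refine=True).
print(" ".join(build_demo_command(upscale=True, upscale_factor=4, refine=True)))

Printing the command keeps the sketch side-effect free; in the app the assembled list is handed to subprocess.run as shown in the diff.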
inference/cli_demo_camera_i2v_pcd.py CHANGED

@@ -174,6 +174,9 @@ def generate_video(
 pool_style: str = 'avg',
 pipe_cpu_offload: bool = False,
 fps: int = 8,
+upscale: bool = True,
+upscale_factor: int = 4,
+refine: bool = True,
 ):
 """
 Generates a video based on the given prompt and saves it to the specified path.
@@ -399,12 +402,11 @@ def generate_video(
 else:
 print(f" Value: {item}")

-
-
-
-
-
-latents = rife_inference_with_latents(frame_interpolation_model, latents)
+
+if upscale:
+    latents = utils.upscale_batch_and_concatenate(upscale_model, latents, device, upscale_factor=upscale_factor)
+if refine:
+    latents = rife_inference_with_latents(frame_interpolation_model, latents)  # upscale here is assigned 1.


 # Convert latents back to PIL images after processing
@@ -489,6 +491,15 @@ if __name__ == "__main__":
 parser.add_argument("--enable_model_cpu_offload", action="store_true", default=False, help="Enable model CPU offload")
 parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video")

+parser.add_argument("--upscale", action="store_true", default=False, help="Enable upscaling of the output video")
+parser.add_argument("--upscale_factor", type=int, default=4, help="Factor by which to upscale the output video")
+parser.add_argument("--refine", action="store_true", default=False, help="Enable refinement of the output video")
+
+# "--upscale", str(upscale),
+# "--upscale_factor", str(upscale_factor),
+# "--refine", str(refine),
+
+
 args = parser.parse_args()
 dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
 generate_video(
@@ -525,4 +536,7 @@ if __name__ == "__main__":
 pool_style=args.pool_style,
 pipe_cpu_offload=args.enable_model_cpu_offload,
 fps=args.fps,
+upscale=args.upscale,
+upscale_factor=args.upscale_factor,
+refine=args.refine,
 )
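
The three new arguments default to off (store_true with default=False), so the upscale and refine branches only run when the flags are passed explicitly. A small self-contained sketch of how the added argparse options behave; only the three new arguments are reproduced here, the real parser defines many more.

# Minimal sketch of the new CLI options' behaviour (three added arguments only).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--upscale", action="store_true", default=False,
                    help="Enable upscaling of the output video")
parser.add_argument("--upscale_factor", type=int, default=4,
                    help="Factor by which to upscale the output video")
parser.add_argument("--refine", action="store_true", default=False,
                    help="Enable refinement of the output video")

# Flags are boolean switches: present means True, absent means False.
args = parser.parse_args(["--upscale", "--upscale_factor", "4", "--refine"])
assert args.upscale and args.refine and args.upscale_factor == 4

# With no flags, both stay disabled and the generation path is unchanged.
defaults = parser.parse_args([])
assert not defaults.upscale and not defaults.refine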
inference/utils.py CHANGED

@@ -200,7 +200,7 @@ def upscale(upscale_model, tensor: torch.Tensor, inf_device, output_device="cpu"
 return s


-def upscale_batch_and_concatenate(upscale_model, latents, inf_device, output_device="cpu") -> torch.Tensor:
+def upscale_batch_and_concatenate(upscale_model, latents, inf_device, output_device="cpu", upscale_factor = 4) -> torch.Tensor:
 upscaled_latents = []
 for i in range(latents.size(0)):
 latent = latents[i]
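
upscale_batch_and_concatenate now accepts an upscale_factor keyword, which the CLI passes through as upscale_factor=upscale_factor. The body of the function is not shown in this diff, so the sketch below is only an assumption about how such a per-frame loop might thread the factor through; it uses plain bilinear interpolation as a stand-in for the upscale model.

# Hedged sketch: a batch upscaling loop with an upscale_factor parameter.
# NOT the repository's implementation (which wraps an upscale model); the
# interpolation call only illustrates where the factor is applied per frame.
import torch
import torch.nn.functional as F

def upscale_batch_and_concatenate_sketch(latents: torch.Tensor,
                                         inf_device: str = "cpu",
                                         output_device: str = "cpu",
                                         upscale_factor: int = 4) -> torch.Tensor:
    upscaled = []
    for i in range(latents.size(0)):
        frame = latents[i].to(inf_device)                 # (C, H, W)
        big = F.interpolate(frame.unsqueeze(0),           # stand-in for the upscale model
                            scale_factor=upscale_factor,
                            mode="bilinear", align_corners=False)
        upscaled.append(big.squeeze(0).to(output_device))
    return torch.stack(upscaled, dim=0)

# Example: a 49-frame batch of 3x60x90 frames becomes 3x240x360 at factor 4.
frames = torch.rand(49, 3, 60, 90)
out = upscale_batch_and_concatenate_sketch(frames, upscale_factor=4)
assert out.shape == (49, 3, 240, 360)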
inference/v2v_data/demo.py CHANGED

@@ -129,6 +129,7 @@ class GetAnchorVideos:
 opts.depth_guidance_scale,
 window_size=opts.window_size,
 overlap=opts.overlap,
+near_far_estimated = opts.near_far_estimated,
 ).to(opts.device)

 frames = (
inference/v2v_data/models/infer.py CHANGED

@@ -66,6 +66,7 @@ class DepthCrafterDemo:
 overlap: int = 25,
 seed: int = 42,
 track_time: bool = True,
+near_far_estimated: bool = True,
 ):
 set_seed(seed)

@@ -94,7 +95,10 @@ class DepthCrafterDemo:
 depths[depths < 1e-5] = 1e-5
 depths = 10000.0 / depths

-
+if near_far_estimated:
+    print("Estimating near and far values from the depth map...")
+    near, far = self.estimate_near_far(depths)
+
 print(f"Estimated near: {near}, far: {far}")
 depths = depths.clip(near, far)

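
The new near_far_estimated flag gates a call to self.estimate_near_far(depths); that helper is not part of this commit, so the sketch below is only an assumption. A common way to choose near/far clipping planes from a depth map is to take robust percentiles so that a few outlier pixels do not stretch the range.

# Hedged sketch of a percentile-based near/far estimate. estimate_near_far in
# DepthCrafterDemo is not shown in this commit; this illustrates one typical
# approach, not the project's actual logic.
import torch

def estimate_near_far_sketch(depths: torch.Tensor,
                             lo: float = 0.02, hi: float = 0.98):
    flat = depths.flatten().float()
    near = torch.quantile(flat, lo).item()   # ignore the closest ~2% of samples
    far = torch.quantile(flat, hi).item()    # ignore the farthest ~2% of samples
    return near, far

depths = 10000.0 / torch.rand(2, 64, 64).clamp_min(1e-5)  # mimic the inverse-depth scaling above
near, far = estimate_near_far_sketch(depths)
clipped = depths.clip(near, far)             # same clipping step as in the diff
print(f"Estimated near: {near}, far: {far}")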