Muhammad Taqi Raza committed
Commit ee6a765 · 1 Parent(s): 7d2ae5b

upscale, refine, upscale_factor
gradio_app.py CHANGED
@@ -74,7 +74,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         "--fps", str(fps),
         "--depth_inference_steps", str(depth_inference_steps),
         "--depth_guidance_scale", str(depth_guidance_scale),
-        # "--near_far_estimated", str(near_far_estimated),
+        "--near_far_estimated", str(near_far_estimated),
         "--sampler_name", sampler_name,
         "--diffusion_guidance_scale", str(diffusion_guidance_scale),
         "--diffusion_inference_steps", str(diffusion_inference_steps),
@@ -105,7 +105,7 @@ def inference(
     fps, num_frames, controlnet_weights, controlnet_guidance_start,
     controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
     seed, height, width, downscale_coef, vae_channels,
-    controlnet_input_channels, controlnet_transformer_num_layers
+    controlnet_input_channels, controlnet_transformer_num_layers, upscale, upscale_factor, refine
 ):
     MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
     ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
@@ -132,7 +132,10 @@ def inference(
         "--downscale_coef", str(downscale_coef),
         "--vae_channels", str(vae_channels),
         "--controlnet_input_channels", str(controlnet_input_channels),
-        "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers)
+        "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
+        "--upscale", str(upscale),
+        "--upscale_factor", str(upscale_factor),
+        "--refine", str(refine),
     ]
     try:
         result = subprocess.run(command, capture_output=True, text=True, check=True)
@@ -164,7 +167,7 @@ with demo:
            fps_input = gr.Number(value=24, label="FPS")
            num_frames_input = gr.Number(value=49, label="Number of Frames")
            radius_input = gr.Number(value = 1.0, label="Radius Scale")
-           mode_input = gr.Dropdown(choices=["gradual", "direct", "bullet"], value="gradual", label="Camera Mode")
+           mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
            sampler_input = gr.Dropdown(choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], value="DDIM_Origin", label="Sampler")
            diff_guidance_input = gr.Number(value=6.0, label="Diffusion Guidance")
            diff_steps_input = gr.Number(value=50, label="Diffusion Steps")
@@ -175,8 +178,8 @@ with demo:
            maxres_input = gr.Number(value=1920, label="Max Resolution")
            sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
            seed_input = gr.Number(value=43, label="Seed")
-           height = gr.Number(value=576, label="Height")
-           width = gr.Number(value=1024, label="Width")
+           height = gr.Number(value=480, label="Height")
+           width = gr.Number(value=720, label="Width")
            prompt_input = gr.Textbox(label="Prompt")
            neg_prompt_input = gr.Textbox(label="Negative Prompt", value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.")
            refine_prompt_input = gr.Textbox(label="Refine Prompt", value=" The video is of high quality, and the view is very clear. ")
@@ -190,6 +193,10 @@ with demo:
        with gr.Row():
            with gr.Column():
                with gr.Row():
+                   upscale = gr.Checkbox(label="Upscale", value=True)
+                   upscale_factor = gr.Number(label="Upscale factor", value=4)
+                   refine = gr.Checkbox(label="refine", value=True)
+
                    controlnet_weights_input = gr.Number(value=0.5, label="ControlNet Weights")
                    controlnet_guidance_start_input = gr.Number(value=0.0, label="Guidance Start")
                    controlnet_guidance_end_input = gr.Number(value=0.5, label="Guidance End")
@@ -200,7 +207,7 @@ with demo:
                    height_input = gr.Number(value=480, label="Height")
                    width_input = gr.Number(value=720, label="Width")
                    num_frames_input2 = gr.Number(value=49, label="Num Frames")
-                   fps_input2 = gr.Number(value=8, label="FPS")
+                   fps_input2 = gr.Number(value=24, label="FPS")
                    downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
                    vae_channels_input = gr.Number(value=16, label="VAE Channels")
                    controlnet_input_channels_input = gr.Number(value=6, label="ControlNet Input Channels")
@@ -232,7 +239,7 @@ with demo:
            inference_steps_input, dtype_input, seed_input2,
            height_input, width_input, downscale_coef_input,
            vae_channels_input, controlnet_input_channels_input,
-           controlnet_layers_input
+           controlnet_layers_input, upscale, upscale_factor, refine
        ],
        outputs=[step2_video, step2_logs]
    )
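Note on the new upscale_factor control: gr.Number typically hands the callback a Python float, so str(upscale_factor) in the command list above would usually produce "4.0" rather than "4". A minimal sketch (not from the repository) of normalizing the value before it is stringified for an argparse option declared with type=int, or of declaring the component so it already yields an int:

# Sketch only: cast a Gradio Number value before building the CLI argument.
upscale_factor_value = 4.0                    # what gr.Number typically delivers
command_fragment = ["--upscale_factor", str(int(upscale_factor_value))]  # -> ["--upscale_factor", "4"]

# Alternatively, ask the component for an integer directly:
# upscale_factor = gr.Number(label="Upscale factor", value=4, precision=0)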
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -174,6 +174,9 @@ def generate_video(
     pool_style: str = 'avg',
     pipe_cpu_offload: bool = False,
     fps: int = 8,
+    upscale: bool = True,
+    upscale_factor: int = 4,
+    refine: bool = True,
 ):
     """
     Generates a video based on the given prompt and saves it to the specified path.
@@ -399,12 +402,11 @@ def generate_video(
        else:
            print(f" Value: {item}")
 
-    scale_status = True
-    rife_status = True
-    if scale_status:
-        latents = utils.upscale_batch_and_concatenate(upscale_model, latents, device)
-    if rife_status:
-        latents = rife_inference_with_latents(frame_interpolation_model, latents)
+
+    if upscale:
+        latents = utils.upscale_batch_and_concatenate(upscale_model, latents, device, upscale_factor=upscale_factor)
+    if refine:
+        latents = rife_inference_with_latents(frame_interpolation_model, latents)  # upscale here is assigned 1.
 
 
    # Convert latents back to PIL images after processing
@@ -489,6 +491,15 @@ if __name__ == "__main__":
    parser.add_argument("--enable_model_cpu_offload", action="store_true", default=False, help="Enable model CPU offload")
    parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video")
 
+    parser.add_argument("--upscale", action="store_true", default=False, help="Enable upscaling of the output video")
+    parser.add_argument("--upscale_factor", type=int, default=4, help="Factor by which to upscale the output video")
+    parser.add_argument("--refine", action="store_true", default=False, help="Enable refinement of the output video")
+
+    # "--upscale", str(upscale),
+    # "--upscale_factor", str(upscale_factor),
+    # "--refine", str(refine),
+
+
    args = parser.parse_args()
    dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
    generate_video(
@@ -525,4 +536,7 @@ if __name__ == "__main__":
        pool_style=args.pool_style,
        pipe_cpu_offload=args.enable_model_cpu_offload,
        fps=args.fps,
+        upscale=args.upscale,
+        upscale_factor=args.upscale_factor,
+        refine=args.refine,
    )
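Since --upscale and --refine are registered with action="store_true", they are enabled by their presence on the command line alone rather than by a trailing value. A minimal sketch, assuming the caller (such as gradio_app.py) assembles the command as a list for subprocess.run; the helper name extend_with_flags is illustrative, not part of the repository:

# Sketch only: append bare store_true flags when the corresponding checkbox is
# enabled, and keep value-taking options in "--name value" form.
def extend_with_flags(command, upscale=False, refine=False, upscale_factor=4):
    if upscale:
        command.append("--upscale")
    if refine:
        command.append("--refine")
    command.extend(["--upscale_factor", str(upscale_factor)])
    return command

command = extend_with_flags(["python", "inference/cli_demo_camera_i2v_pcd.py"],
                            upscale=True, refine=False)
# -> ["python", "inference/cli_demo_camera_i2v_pcd.py", "--upscale", "--upscale_factor", "4"]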
inference/utils.py CHANGED
@@ -200,7 +200,7 @@ def upscale(upscale_model, tensor: torch.Tensor, inf_device, output_device="cpu"
    return s
 
 
-def upscale_batch_and_concatenate(upscale_model, latents, inf_device, output_device="cpu") -> torch.Tensor:
+def upscale_batch_and_concatenate(upscale_model, latents, inf_device, output_device="cpu", upscale_factor = 4) -> torch.Tensor:
    upscaled_latents = []
    for i in range(latents.size(0)):
        latent = latents[i]
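The hunk shows only the head of upscale_batch_and_concatenate, so how upscale_factor is consumed is not visible here. As a rough, self-contained illustration of the batch-and-concatenate pattern the name suggests, the sketch below uses torch.nn.functional.interpolate as a stand-in for the repository's learned upscale model:

import torch
import torch.nn.functional as F

def upscale_batch_and_concatenate_sketch(latents: torch.Tensor, upscale_factor: int = 4,
                                         output_device: str = "cpu") -> torch.Tensor:
    # Stand-in for the model-based upscale(): spatially enlarge each item in the
    # batch by upscale_factor, then stack the results back into one tensor.
    upscaled = []
    for i in range(latents.size(0)):                 # iterate over the batch dimension
        latent = latents[i].unsqueeze(0)             # (1, C, H, W)
        up = F.interpolate(latent, scale_factor=upscale_factor,
                           mode="bilinear", align_corners=False)
        upscaled.append(up.squeeze(0).to(output_device))
    return torch.stack(upscaled, dim=0)              # (B, C, H*factor, W*factor)

# Example: a batch of 2 single-channel 8x8 latents becomes 2x1x32x32.
print(upscale_batch_and_concatenate_sketch(torch.randn(2, 1, 8, 8)).shape)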
inference/v2v_data/demo.py CHANGED
@@ -129,6 +129,7 @@ class GetAnchorVideos:
            opts.depth_guidance_scale,
            window_size=opts.window_size,
            overlap=opts.overlap,
+            near_far_estimated = opts.near_far_estimated,
        ).to(opts.device)
 
        frames = (
inference/v2v_data/models/infer.py CHANGED
@@ -66,6 +66,7 @@ class DepthCrafterDemo:
        overlap: int = 25,
        seed: int = 42,
        track_time: bool = True,
+        near_far_estimated: bool = True,
    ):
        set_seed(seed)
 
@@ -94,7 +95,10 @@ class DepthCrafterDemo:
        depths[depths < 1e-5] = 1e-5
        depths = 10000.0 / depths
 
-        near, far = self.estimate_near_far(depths)
+        if near_far_estimated:
+            print("Estimating near and far values from the depth map...")
+            near, far = self.estimate_near_far(depths)
+
        print(f"Estimated near: {near}, far: {far}")
        depths = depths.clip(near, far)
 
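In the new branch, near and far are assigned only when near_far_estimated is true, so when the flag is disabled they need to come from somewhere else before the clip. As a self-contained illustration of that branching, here is a small sketch with placeholder defaults; the constants and the clip_depths helper are assumptions for illustration, not values or code from the repository:

import torch

def clip_depths(depths: torch.Tensor, near_far_estimated: bool,
                estimate_near_far=None,
                default_near: float = 0.1, default_far: float = 100.0) -> torch.Tensor:
    # Sketch only: default_near/default_far are placeholders, not repository values.
    if near_far_estimated and estimate_near_far is not None:
        near, far = estimate_near_far(depths)
    else:
        near, far = default_near, default_far
    print(f"Clipping depths to near={near}, far={far}")
    return depths.clip(near, far)

# Example with the estimation path disabled:
print(clip_depths(torch.tensor([0.01, 5.0, 500.0]), near_far_estimated=False))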