Commit 43360f0 (parent 25b750a) by Muhammad Taqi Raza

aspect ratio
gradio_app.py CHANGED

@@ -43,7 +43,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
     prompt, negative_prompt, refine_prompt,
     depth_inference_steps, depth_guidance_scale,
-    window_size, overlap, max_res, sample_size, seed_input, height, width):
+    window_size, overlap, max_res, sample_size, seed_input, height, width, aspect_ratio_inputs):
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -57,8 +57,8 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     except ValueError:
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
-
-    # INTEGRATE HEIGHT AND WIDTH PARAMETERS
+    w, h = aspect_ratio_inputs.strip().split(",")
+
     command = [
         "python", "/app/inference/v2v_data/inference.py",
         "--video_path", temp_input_path,
@@ -87,7 +87,8 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     # "--sample_size", sample_size if sample_size else "384,672",
     "--seed", str(seed_input),
     "--height", str(height),  # Fixed height
-    "--width", str(width)
+    "--width", str(width),
+    "--target_aspect_ratio", w.strip(), h.strip()
     ]
 
     try:
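For a textbox value of "2,3", the assembled command comes out roughly as below (flag values other than the aspect ratio are illustrative). Note that w.strip() and h.strip() are appended as two separate argv tokens, which is exactly what the nargs=2 flag added in inference.py expects:

    python /app/inference/v2v_data/inference.py --video_path /app/temp_input.mp4 \
        --seed 42 --height 576 --width 1024 --target_aspect_ratio 2 3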
@@ -133,6 +134,7 @@ def inference(
     "--vae_channels", str(vae_channels),
     "--controlnet_input_channels", str(controlnet_input_channels),
     "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
+
     ]
     # Conditionally append optional flags
     if upscale:
@@ -169,6 +171,8 @@ with demo:
     near_far_estimated = gr.Checkbox(label="Near Far Estimation", value=True)  # integrate it with
     pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
     fps_input = gr.Number(value=24, label="FPS")
+    aspect_ratio_inputs = gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
     num_frames_input = gr.Number(value=49, label="Number of Frames")
     radius_input = gr.Number(value=1.0, label="Radius Scale")
     mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,7 +234,7 @@ with demo:
     sampler_input, diff_guidance_input, diff_steps_input,
     prompt_input, neg_prompt_input, refine_prompt_input,
     depth_steps_input, depth_guidance_input,
-    window_input, overlap_input, maxres_input, sample_size, seed_input, height, width
+    window_input, overlap_input, maxres_input, sample_size, seed_input, height, width, aspect_ratio_inputs
     ],
     outputs=[step1_video, step1_logs]
     )
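Gradio hands each component's current value to the click handler positionally, and a gr.Textbox arrives as a plain str, which is why get_anchor_video does the comma-splitting itself. A self-contained mini-example of the same wiring (names are illustrative, not from the app):

    import gradio as gr

    def handler(aspect_ratio_inputs):
        # The Textbox value arrives as a str such as "2,3".
        w, h = aspect_ratio_inputs.strip().split(",")
        return f"target aspect ratio {w.strip()}:{h.strip()}"

    with gr.Blocks() as demo:
        box = gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
        out = gr.Textbox(label="Parsed")
        gr.Button("Run").click(fn=handler, inputs=[box], outputs=[out])

    demo.launch()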
inference/v2v_data/demo.py CHANGED

@@ -13,7 +13,7 @@ import torch.nn.functional as F
     from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
     from qwen_vl_utils import process_vision_info
 
-def get_center_crop_resolution(original_resoultion, height=576, width=1024, target_aspect_ratio=(3, 4)):
+def get_center_crop_resolution(original_resoultion, height=576, width=1024, target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
     aspect_ratio = target_w / target_h
@@ -117,7 +117,7 @@ class GetAnchorVideos:
     frame_shape = vr[0].shape  # (H, W, 3)
     ori_resolution = frame_shape[:2]
     print(f"==> original video shape: {frame_shape}")
-    target_resolution = get_center_crop_resolution(original_resoultion=ori_resolution, height=opts.height, width=opts.width)
+    target_resolution = get_center_crop_resolution(original_resoultion=ori_resolution, height=opts.height, width=opts.width, target_aspect_ratio=opts.target_aspect_ratio)
     print(f"==> target video shape resized: {target_resolution}")
 
     prompt = self.get_caption(opts, opts.video_path)
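Only the signature and the first two lines of get_center_crop_resolution appear in this diff, so the following is a hedged sketch of the center-crop-then-resize computation such a helper typically performs. Everything after the aspect_ratio line is an assumption, and the name is suffixed _sketch to keep it distinct from the real function:

    def get_center_crop_resolution_sketch(original_resolution, height=576, width=1024,
                                          target_aspect_ratio=(2, 3)):
        target_h, target_w = target_aspect_ratio
        aspect_ratio = target_w / target_h              # (2, 3) -> 1.5, i.e. a 3:2 frame
        orig_h, orig_w = original_resolution
        # Center-crop the source down to the target aspect ratio...
        if orig_w / orig_h > aspect_ratio:              # too wide: trim width
            crop_h, crop_w = orig_h, round(orig_h * aspect_ratio)
        else:                                           # too tall: trim height
            crop_h, crop_w = round(orig_w / aspect_ratio), orig_w
        # ...then scale the crop to fit the requested (height, width) budget.
        scale = min(height / crop_h, width / crop_w)
        return round(crop_h * scale), round(crop_w * scale)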
inference/v2v_data/inference.py CHANGED

@@ -189,6 +189,7 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+    parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
 
     return parser
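With type=int and nargs=2, argparse consumes the two tokens appended in gradio_app.py and yields a two-element list of ints. A quick standalone illustration (not from the repo):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)

    print(parser.parse_args(["--target_aspect_ratio", "2", "3"]).target_aspect_ratio)  # [2, 3]
    print(parser.parse_args([]).target_aspect_ratio)                                   # None

One caveat worth flagging: demo.py now forwards opts.target_aspect_ratio unconditionally, so a run without the flag passes None into get_center_crop_resolution and the target_h, target_w unpacking would raise; guarding with opts.target_aspect_ratio or (2, 3) at the call site would preserve the old default.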