Muhammad Taqi Raza committed
Commit 0f464ea · 1 Parent(s): 79ff636

adding camera offset values

gradio_app.py CHANGED
@@ -43,7 +43,9 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
                      sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
                      prompt, negative_prompt, refine_prompt,
                      depth_inference_steps, depth_guidance_scale,
-                     window_size, overlap, max_res, sample_size, seed_input, height, width, aspect_ratio_inputs):
+                     window_size, overlap, max_res, sample_size,
+                     seed_input, height, width, aspect_ratio_inputs,
+                     init_dx, init_dy, init_dz): # ← NEW
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -58,6 +60,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
     w, h = aspect_ratio_inputs.strip().split(",")
+    h_s, w_s = sample_size.strip().split(",")
 
     command = [
         "python", "/app/inference/v2v_data/inference.py",
@@ -84,11 +87,15 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         "--window_size", str(window_size),
         "--overlap", str(overlap),
         "--max_res", str(max_res),
-        "--sample_size", sample_size if sample_size else "384,672",
+        "--sample_size", h_s.strip(), w_s.strip(),
         "--seed", str(seed_input),
         "--height", str(height),
         "--width", str(width),
-        "--target_aspect_ratio", w.strip(), h.strip()
+        "--target_aspect_ratio", w.strip(), h.strip(),
+        "--init_dx", str(init_dx),
+        "--init_dy", str(init_dy),
+        "--init_dz", str(init_dz),
+
     ]
 
     try:
@@ -169,6 +176,11 @@ with demo:
             pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
             fps_input = gr.Number(value=24, label="FPS")
             aspect_ratio_inputs=gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
+            init_dx = gr.Number(value=0.0, label="Start Camera Offset X")
+            init_dy = gr.Number(value=0.0, label="Start Camera Offset Y")
+            init_dz = gr.Number(value=0.0, label="Start Camera Offset Z")
+
             num_frames_input = gr.Number(value=49, label="Number of Frames")
             radius_input = gr.Number(value = 1.0, label="Radius Scale")
             mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,10 +242,13 @@ with demo:
             sampler_input, diff_guidance_input, diff_steps_input,
             prompt_input, neg_prompt_input, refine_prompt_input,
             depth_steps_input, depth_guidance_input,
-            window_input, overlap_input, maxres_input, sample_size, seed_input, height, width, aspect_ratio_inputs
+            window_input, overlap_input, maxres_input, sample_size,
+            seed_input, height, width, aspect_ratio_inputs,
+            init_dx, init_dy, init_dz # ← NEW INPUTS
         ],
         outputs=[step1_video, step1_logs]
     )
+
     step2_button.click(
         inference,
         inputs=[
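
The gradio_app.py changes thread three new UI fields (Start Camera Offset X/Y/Z) through get_anchor_video into the inference command, and pass --sample_size as two values split from the comma-separated textbox. A minimal sketch of the resulting argument assembly, with illustrative values only and only the flags touched by this commit shown (the flag names and string splitting mirror the diff above; everything else is assumed):

# Sketch of the updated command assembly in get_anchor_video (illustrative values).
sample_size = "384,672"                       # "height,width" textbox value
aspect_ratio_inputs = "2,3"                   # "w,h" textbox value
init_dx, init_dy, init_dz = 0.0, 0.5, -0.2    # new camera-offset fields

h_s, w_s = sample_size.strip().split(",")
w, h = aspect_ratio_inputs.strip().split(",")

command = [
    "python", "/app/inference/v2v_data/inference.py",
    "--sample_size", h_s.strip(), w_s.strip(),
    "--target_aspect_ratio", w.strip(), h.strip(),
    "--init_dx", str(init_dx),
    "--init_dy", str(init_dy),
    "--init_dz", str(init_dz),
]
print(" ".join(command))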
inference/v2v_data/demo.py CHANGED
@@ -1,17 +1,17 @@
 import gc
 import os
 import torch
-from models.infer import DepthCrafterDemo
+import imageio
 import numpy as np
-import torch
+
 from PIL import Image
 from models.utils import *
-
-import torch
 import torch.nn.functional as F
+from models.infer import DepthCrafterDemo
+
 
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 
 def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
@@ -68,8 +68,6 @@ def center_crop_to_ratio(tensor: torch.Tensor, resolution=(480, 720)):
 
     return tensor[:, :, top:top + crop_h, left:left + crop_w]
 
-import imageio
-import numpy as np
 
 def save_video_as_mp4(video_tensor, save_path, fps=24):
     """
@@ -589,18 +587,34 @@ class GetAnchorVideos:
             .repeat(num_frames, 1, 1)
             .to(opts.device)
         )
+
+        camera_x = getattr(opts, "init_dx", 0.0)
+        camera_y = getattr(opts, "init_dy", 0.0)
+        camera_z = getattr(opts, "init_dz", 0.0)
+
         c2w_init = (
             torch.tensor(
                 [
-                    [-1.0, 0.0, 0.0, 0.0],
-                    [0.0, 1.0, 0.0, 0.0],
-                    [0.0, 0.0, -1.0, 0.0],
+                    [-1.0, 0.0, 0.0, camera_x],
+                    [0.0, 1.0, 0.0, camera_y],
+                    [0.0, 0.0, -1.0, camera_z],
                     [0.0, 0.0, 0.0, 1.0],
                 ]
-            )
-            .to(opts.device)
-            .unsqueeze(0)
+            ).to(opts.device).unsqueeze(0)
         )
+
+        # c2w_init = (
+        #     torch.tensor(
+        #         [
+        #             [-1.0, 0.0, 0.0, 0.0],
+        #             [0.0, 1.0, 0.0, 0.0],
+        #             [0.0, 0.0, -1.0, 0.0],
+        #             [0.0, 0.0, 0.0, 1.0],
+        #         ]
+        #     )
+        #     .to(opts.device)
+        #     .unsqueeze(0)
+        # )
 
         if opts.camera == 'target':
             dtheta, dphi, dr, dx, dy = opts.target_pose
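
In demo.py the new offsets amount to a world-space translation of the initial camera pose: they occupy the last column of c2w_init, so the starting camera centre moves while its orientation stays fixed. A minimal standalone sketch of that effect (offset values assumed for illustration):

import torch

# Illustrative offsets; in demo.py they come from opts.init_dx / init_dy / init_dz via getattr.
camera_x, camera_y, camera_z = 0.1, 0.0, -0.3

c2w_init = torch.tensor(
    [
        [-1.0, 0.0, 0.0, camera_x],
        [0.0, 1.0, 0.0, camera_y],
        [0.0, 0.0, -1.0, camera_z],
        [0.0, 0.0, 0.0, 1.0],
    ]
).unsqueeze(0)  # (1, 4, 4), matching the batched pose used downstream

# The camera centre is the translation column: mapping the camera-frame origin
# (0, 0, 0, 1) through c2w_init yields exactly (camera_x, camera_y, camera_z).
origin = torch.tensor([0.0, 0.0, 0.0, 1.0])
print(c2w_init[0] @ origin)   # tensor([ 0.1000,  0.0000, -0.3000,  1.0000])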
inference/v2v_data/inference.py CHANGED
@@ -189,7 +189,12 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+
     parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
+
+    parser.add_argument('--init_dx', type=float, default=0.0)
+    parser.add_argument('--init_dy', type=float, default=0.0)
+    parser.add_argument('--init_dz', type=float, default=0.0)
 
     return parser
 
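
Because the new flags default to 0.0, existing invocations of inference.py keep producing an unshifted start pose (and demo.py additionally falls back to 0.0 via getattr if the attributes are missing). A self-contained sketch of just the flags added here, using a stand-in parser rather than the project's full get_parser:

import argparse

# Stand-in parser containing only the flags added in this commit.
parser = argparse.ArgumentParser()
parser.add_argument('--init_dx', type=float, default=0.0)
parser.add_argument('--init_dy', type=float, default=0.0)
parser.add_argument('--init_dz', type=float, default=0.0)

print(parser.parse_args([]))
# Namespace(init_dx=0.0, init_dy=0.0, init_dz=0.0)
print(parser.parse_args(['--init_dx', '0.5', '--init_dz', '-0.2']))
# Namespace(init_dx=0.5, init_dy=0.0, init_dz=-0.2)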
inference/v2v_data/models/utils.py CHANGED
@@ -187,6 +187,7 @@ def generate_traj_specified(c2ws_anchor, theta, phi, d_r, d_x, d_y, frame, devic
     rs = np.linspace(0, d_r, frame)
     xs = np.linspace(0, d_x, frame)
     ys = np.linspace(0, d_y, frame)
+
     c2ws_list = []
     for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys):
         c2w_new = sphere2pose(
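
For context, this hunk sits inside generate_traj_specified, which ramps each pose delta linearly across the frame count before handing every per-frame tuple to sphere2pose. A minimal sketch of that interpolation pattern, with deltas and frame count assumed and sphere2pose (not part of this diff) replaced by a simple print of the schedule:

import numpy as np

# Assumed pose deltas (θ, φ, r, x, y) and frame count, mirroring the linspace
# schedule in generate_traj_specified.
d_theta, d_phi, d_r, d_x, d_y = 0.0, 30.0, -0.6, 0.0, 0.0
frame = 5

thetas = np.linspace(0, d_theta, frame)
phis = np.linspace(0, d_phi, frame)
rs = np.linspace(0, d_r, frame)
xs = np.linspace(0, d_x, frame)
ys = np.linspace(0, d_y, frame)

# utils.py feeds each tuple to sphere2pose to build a c2w matrix per frame;
# here we only print the per-frame values.
for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys):
    print(th, ph, r, x, y)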