Muhammad Taqi Raza committed
Commit 0f464ea · 1 Parent(s): 79ff636

adding camera offset values
Files changed:
- gradio_app.py  +19 -4
- inference/v2v_data/demo.py  +27 -13
- inference/v2v_data/inference.py  +5 -0
- inference/v2v_data/models/utils.py  +1 -0
gradio_app.py  CHANGED

@@ -43,7 +43,9 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
                      sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
                      prompt, negative_prompt, refine_prompt,
                      depth_inference_steps, depth_guidance_scale,
-                     window_size, overlap, max_res, sample_size,
+                     window_size, overlap, max_res, sample_size,
+                     seed_input, height, width, aspect_ratio_inputs,
+                     init_dx, init_dy, init_dz):  # ← NEW
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -58,6 +60,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
     w, h = aspect_ratio_inputs.strip().split(",")
+    h_s, w_s = sample_size.strip().split(",")
 
     command = [
         "python", "/app/inference/v2v_data/inference.py",
@@ -84,11 +87,15 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         "--window_size", str(window_size),
         "--overlap", str(overlap),
         "--max_res", str(max_res),
-        "--sample_size",
+        "--sample_size", h_s.strip(), w_s.strip(),
         "--seed", str(seed_input),
         "--height", str(height),
         "--width", str(width),
-        "--target_aspect_ratio", w.strip(), h.strip()
+        "--target_aspect_ratio", w.strip(), h.strip(),
+        "--init_dx", str(init_dx),
+        "--init_dy", str(init_dy),
+        "--init_dz", str(init_dz),
+
     ]
 
     try:
@@ -169,6 +176,11 @@ with demo:
             pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
             fps_input = gr.Number(value=24, label="FPS")
             aspect_ratio_inputs=gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
+            init_dx = gr.Number(value=0.0, label="Start Camera Offset X")
+            init_dy = gr.Number(value=0.0, label="Start Camera Offset Y")
+            init_dz = gr.Number(value=0.0, label="Start Camera Offset Z")
+
             num_frames_input = gr.Number(value=49, label="Number of Frames")
             radius_input = gr.Number(value = 1.0, label="Radius Scale")
             mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,10 +242,13 @@ with demo:
             sampler_input, diff_guidance_input, diff_steps_input,
             prompt_input, neg_prompt_input, refine_prompt_input,
             depth_steps_input, depth_guidance_input,
-            window_input, overlap_input, maxres_input, sample_size,
+            window_input, overlap_input, maxres_input, sample_size,
+            seed_input, height, width, aspect_ratio_inputs,
+            init_dx, init_dy, init_dz  # ← NEW INPUTS
         ],
         outputs=[step1_video, step1_logs]
     )
+
     step2_button.click(
         inference,
         inputs=[
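In short, gradio_app.py now exposes three gr.Number fields (Start Camera Offset X/Y/Z, default 0.0) and forwards them to inference.py as --init_dx/--init_dy/--init_dz. Below is a minimal sketch of that plumbing; build_command is a hypothetical helper that condenses the hunks above, and the argument values in the example call are illustrative only, not values from this commit.

# Sketch only: condenses how the new offsets travel from the Gradio inputs
# into the inference.py command line (see the gradio_app.py hunks above).
def build_command(window_size, overlap, max_res, sample_size,
                  seed_input, height, width, aspect_ratio_inputs,
                  init_dx, init_dy, init_dz):
    w, h = aspect_ratio_inputs.strip().split(",")    # e.g. "2,3" -> ("2", "3")
    h_s, w_s = sample_size.strip().split(",")        # "H,W" string from the UI
    return [
        "python", "/app/inference/v2v_data/inference.py",
        "--window_size", str(window_size),
        "--overlap", str(overlap),
        "--max_res", str(max_res),
        "--sample_size", h_s.strip(), w_s.strip(),
        "--seed", str(seed_input),
        "--height", str(height),
        "--width", str(width),
        "--target_aspect_ratio", w.strip(), h.strip(),
        "--init_dx", str(init_dx),                   # new camera offsets
        "--init_dy", str(init_dy),
        "--init_dz", str(init_dz),
    ]

# Example values only (not from the commit); the offsets default to 0.0 like the UI fields.
cmd = build_command(64, 25, 1024, "480,720", 42, 576, 1024, "2,3", 0.0, 0.0, 0.0)
print(cmd)   # gradio_app.py then executes this command as a subprocess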
inference/v2v_data/demo.py  CHANGED

@@ -1,17 +1,17 @@
 import gc
 import os
 import torch
-
+import imageio
 import numpy as np
-
+
 from PIL import Image
 from models.utils import *
-
-import torch
 import torch.nn.functional as F
+from models.infer import DepthCrafterDemo
+
 
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 
 def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
@@ -68,8 +68,6 @@ def center_crop_to_ratio(tensor: torch.Tensor, resolution=(480, 720)):
 
     return tensor[:, :, top:top + crop_h, left:left + crop_w]
 
-import imageio
-import numpy as np
 
 def save_video_as_mp4(video_tensor, save_path, fps=24):
     """
@@ -589,18 +587,34 @@ class GetAnchorVideos:
             .repeat(num_frames, 1, 1)
             .to(opts.device)
         )
+
+        camera_x = getattr(opts, "init_dx", 0.0)
+        camera_y = getattr(opts, "init_dy", 0.0)
+        camera_z = getattr(opts, "init_dz", 0.0)
+
         c2w_init = (
             torch.tensor(
                 [
-                    [-1.0, 0.0, 0.0, 0.0],
-                    [0.0, 1.0, 0.0, 0.0],
-                    [0.0, 0.0, -1.0, 0.0],
+                    [-1.0, 0.0, 0.0, camera_x],
+                    [0.0, 1.0, 0.0, camera_y],
+                    [0.0, 0.0, -1.0, camera_z],
                     [0.0, 0.0, 0.0, 1.0],
                 ]
-            )
-            .to(opts.device)
-            .unsqueeze(0)
+            ).to(opts.device).unsqueeze(0)
        )
+
+        # c2w_init = (
+        #     torch.tensor(
+        #         [
+        #             [-1.0, 0.0, 0.0, 0.0],
+        #             [0.0, 1.0, 0.0, 0.0],
+        #             [0.0, 0.0, -1.0, 0.0],
+        #             [0.0, 0.0, 0.0, 1.0],
+        #         ]
+        #     )
+        #     .to(opts.device)
+        #     .unsqueeze(0)
+        # )
 
         if opts.camera == 'target':
             dtheta, dphi, dr, dx, dy = opts.target_pose
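The substantive change in demo.py is the initial camera-to-world matrix: the fourth column of a 4×4 c2w matrix is the camera position in world coordinates, so feeding init_dx/init_dy/init_dz into that column shifts where the anchor trajectory starts, while getattr keeps call sites that do not set the new options working. A minimal, self-contained sketch (Opts is a stand-in for the parsed options; the offset values are illustrative, not from the commit):

# Sketch of the demo.py change: the translation column of the initial
# camera-to-world (c2w) matrix now comes from opts.init_dx/dy/dz instead of 0.
import torch

class Opts:                                        # stand-in for the argparse namespace
    device = "cpu"
    init_dx, init_dy, init_dz = 0.1, 0.0, -0.25    # example offsets only

opts = Opts()
camera_x = getattr(opts, "init_dx", 0.0)           # getattr keeps old callers working
camera_y = getattr(opts, "init_dy", 0.0)
camera_z = getattr(opts, "init_dz", 0.0)

c2w_init = (
    torch.tensor(
        [
            [-1.0, 0.0, 0.0, camera_x],   # rotation part is unchanged;
            [0.0, 1.0, 0.0, camera_y],    # the 4th column is the camera
            [0.0, 0.0, -1.0, camera_z],   # center in world coordinates
            [0.0, 0.0, 0.0, 1.0],
        ]
    ).to(opts.device).unsqueeze(0)        # shape (1, 4, 4)
)

# The world-space camera origin is the matrix applied to (0, 0, 0, 1):
origin = c2w_init[0] @ torch.tensor([0.0, 0.0, 0.0, 1.0])
print(origin[:3])                         # tensor([ 0.1000,  0.0000, -0.2500])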
inference/v2v_data/inference.py  CHANGED

@@ -189,7 +189,12 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+
     parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
+
+    parser.add_argument('--init_dx', type=float, default=0.0)
+    parser.add_argument('--init_dy', type=float, default=0.0)
+    parser.add_argument('--init_dz', type=float, default=0.0)
 
     return parser
 
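The new inference.py flags are plain argparse floats that default to 0.0, so existing command lines keep their previous behaviour. A trimmed sketch of how they parse (only the options touched by this commit are shown; the flag values are illustrative):

# Sketch: the parser trimmed down to the options relevant to this commit.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--max_res', type=int, default=1024)
parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
parser.add_argument('--init_dx', type=float, default=0.0)
parser.add_argument('--init_dy', type=float, default=0.0)
parser.add_argument('--init_dz', type=float, default=0.0)

opts = parser.parse_args(["--target_aspect_ratio", "2", "3",
                          "--init_dx", "0.1", "--init_dz", "-0.25"])
print(opts.init_dx, opts.init_dy, opts.init_dz)   # 0.1 0.0 -0.25
print(opts.target_aspect_ratio)                   # [2, 3]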
inference/v2v_data/models/utils.py  CHANGED

@@ -187,6 +187,7 @@ def generate_traj_specified(c2ws_anchor, theta, phi, d_r, d_x, d_y, frame, device):
     rs = np.linspace(0, d_r, frame)
     xs = np.linspace(0, d_x, frame)
     ys = np.linspace(0, d_y, frame)
+
     c2ws_list = []
     for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys):
         c2w_new = sphere2pose(
|