Muhammad Taqi Raza committed
Commit 43360f0 · Parent(s): 25b750a

aspect ratio

Files changed:
- gradio_app.py (+9, -5)
- inference/v2v_data/demo.py (+2, -2)
- inference/v2v_data/inference.py (+1, -0)
gradio_app.py CHANGED

@@ -43,7 +43,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
                      sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
                      prompt, negative_prompt, refine_prompt,
                      depth_inference_steps, depth_guidance_scale,
-                     window_size, overlap, max_res, sample_size, seed_input, height, width):
+                     window_size, overlap, max_res, sample_size, seed_input, height, width, aspect_ratio_inputs):
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -57,8 +57,8 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     except ValueError:
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
-
-
+    w, h = aspect_ratio_inputs.strip().split(",")
+
     command = [
         "python", "/app/inference/v2v_data/inference.py",
         "--video_path", temp_input_path,
@@ -87,7 +87,8 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         # "--sample_size", sample_size if sample_size else "384,672",
         "--seed", str(seed_input),
         "--height", str(height),  # Fixed height
-        "--width", str(width)
+        "--width", str(width),
+        "--target_aspect_ratio", w.strip(), h.strip()
     ]
 
     try:
@@ -133,6 +134,7 @@ def inference(
         "--vae_channels", str(vae_channels),
         "--controlnet_input_channels", str(controlnet_input_channels),
         "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
+
     ]
     # Conditionally append optional flags
     if upscale:
@@ -169,6 +171,8 @@ with demo:
         near_far_estimated = gr.Checkbox(label="Near Far Estimation", value=True)  # integrate it with
         pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
         fps_input = gr.Number(value=24, label="FPS")
+        aspect_ratio_inputs = gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
         num_frames_input = gr.Number(value=49, label="Number of Frames")
         radius_input = gr.Number(value = 1.0, label="Radius Scale")
         mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,7 +234,7 @@ with demo:
            sampler_input, diff_guidance_input, diff_steps_input,
            prompt_input, neg_prompt_input, refine_prompt_input,
            depth_steps_input, depth_guidance_input,
-           window_input, overlap_input, maxres_input, sample_size, seed_input, height, width
+           window_input, overlap_input, maxres_input, sample_size, seed_input, height, width, aspect_ratio_inputs
        ],
        outputs=[step1_video, step1_logs]
    )
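In get_anchor_video, the new textbox value is split on a comma and the two pieces are appended to the subprocess command as separate tokens, matching the nargs=2 flag added in inference/v2v_data/inference.py. Below is a minimal sketch of a slightly stricter parse; parse_aspect_ratio and its error message are illustrative helpers, not part of this commit.

def parse_aspect_ratio(text: str) -> tuple[str, str]:
    """Validate a textbox value like '2,3' before it reaches the CLI.

    Illustrative only: the committed handler simply does
    `w, h = aspect_ratio_inputs.strip().split(",")` and forwards the raw
    strings, which raises ValueError on any input without exactly one comma.
    """
    parts = [p.strip() for p in text.split(",")]
    if len(parts) != 2 or not all(p.isdigit() for p in parts):
        raise ValueError("Target aspect ratio must look like '2,3'")
    return parts[0], parts[1]


# The two tokens are then forwarded in the order typed, like the other numeric flags:
first, second = parse_aspect_ratio("2,3")
command_tail = ["--target_aspect_ratio", first, second]

Passing the values through as strings keeps the UI thin; argparse does the integer conversion on the inference side.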
inference/v2v_data/demo.py CHANGED

@@ -13,7 +13,7 @@ import torch.nn.functional as F
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 
-def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(
+def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
     aspect_ratio = target_w / target_h
 
@@ -117,7 +117,7 @@ class GetAnchorVideos:
         frame_shape = vr[0].shape  # (H, W, 3)
         ori_resolution = frame_shape[:2]
         print(f"==> original video shape: {frame_shape}")
-        target_resolution = get_center_crop_resolution(original_resoultion = ori_resolution, height = opts.height, width = opts.width)
+        target_resolution = get_center_crop_resolution(original_resoultion = ori_resolution, height = opts.height, width = opts.width, target_aspect_ratio = opts.target_aspect_ratio)
         print(f"==> target video shape resized: {target_resolution}")
 
         prompt = self.get_caption(opts, opts.video_path)
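Only the signature and the first two body lines of get_center_crop_resolution appear in this diff. For orientation, here is a minimal sketch of what a helper with this shape typically computes, assuming it returns the (height, width) to which the centered crop is resized; the crop/scale/rounding logic below is a guess, not the repository's implementation. Note that argparse delivers --target_aspect_ratio 2 3 as [2, 3] and the call site unpacks it as (target_h, target_w), so the first value is treated as the height term of the ratio.

def center_crop_resolution_sketch(original_resolution, height=576, width=1024,
                                  target_aspect_ratio=(2, 3)):
    """Guess at the helper's behavior: take the largest centered crop that
    matches the requested h:w ratio, then scale it to fit inside (height, width)."""
    orig_h, orig_w = original_resolution
    target_h, target_w = target_aspect_ratio
    aspect_ratio = target_w / target_h            # width / height, e.g. 3 / 2 = 1.5

    # Largest centered crop of the source frame with that aspect ratio.
    if orig_w / orig_h > aspect_ratio:            # source too wide -> trim width
        crop_h, crop_w = orig_h, int(round(orig_h * aspect_ratio))
    else:                                         # source too tall -> trim height
        crop_h, crop_w = int(round(orig_w / aspect_ratio)), orig_w

    # Fit the crop into the requested processing size, keeping the ratio.
    scale = min(height / crop_h, width / crop_w)
    out_h = max(2, int(round(crop_h * scale / 2)) * 2)   # even dims (assumption)
    out_w = max(2, int(round(crop_w * scale / 2)) * 2)
    return out_h, out_w


# e.g. a 1080x1920 source with --target_aspect_ratio 2 3 and the 576x1024 defaults:
# crop to 1080x1620, then scale to fit 576x1024 -> (576, 864)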
inference/v2v_data/inference.py CHANGED

@@ -189,6 +189,7 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+    parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
 
     return parser
 
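Taken together, the three files thread the aspect ratio from the Gradio textbox through to the v2v inference CLI. A hedged end-to-end sketch follows; the height, width, and seed values are placeholders, and flags not visible in this commit are omitted, so a real invocation needs the script's remaining required arguments.

import subprocess

# Value typed into the "Target Aspect Ratio (e.g., 2,3)" textbox.
aspect_ratio_inputs = "2,3"
w, h = aspect_ratio_inputs.strip().split(",")       # same split the Gradio handler performs

command = [
    "python", "/app/inference/v2v_data/inference.py",
    "--video_path", "/app/temp_input.mp4",
    "--height", "576",                               # placeholder values
    "--width", "1024",
    "--seed", "42",
    "--target_aspect_ratio", w.strip(), h.strip(),   # consumed by nargs=2, type=int
]
subprocess.run(command, check=True)

On the receiving side, opts.target_aspect_ratio comes back as the integer list [2, 3]. The flag defaults to None, and demo.py now passes opts.target_aspect_ratio straight through, so omitting the flag would make the target_h, target_w unpacking fail unless the call site guards against it; the updated Gradio app always supplies it.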