Muhammad Taqi Raza committed
Commit 43360f0 · Parent(s): 25b750a

aspect ratio

Files changed:
- gradio_app.py (+9, -5)
- inference/v2v_data/demo.py (+2, -2)
- inference/v2v_data/inference.py (+1, -0)
gradio_app.py CHANGED

@@ -43,7 +43,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
                      sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
                      prompt, negative_prompt, refine_prompt,
                      depth_inference_steps, depth_guidance_scale,
-                     window_size, overlap, max_res, sample_size, seed_input, height, width):
+                     window_size, overlap, max_res, sample_size, seed_input, height, width, aspect_ratio_inputs):
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -57,8 +57,8 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
     except ValueError:
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
-
-
+    w, h = aspect_ratio_inputs.strip().split(",")
+
     command = [
         "python", "/app/inference/v2v_data/inference.py",
         "--video_path", temp_input_path,
@@ -87,7 +87,8 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         # "--sample_size", sample_size if sample_size else "384,672",
         "--seed", str(seed_input),
         "--height", str(height),  # Fixed height
-        "--width", str(width)
+        "--width", str(width),
+        "--target_aspect_ratio", w.strip(), h.strip()
     ]
 
     try:
@@ -133,6 +134,7 @@ def inference(
         "--vae_channels", str(vae_channels),
         "--controlnet_input_channels", str(controlnet_input_channels),
         "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
+
     ]
     # Conditionally append optional flags
     if upscale:
@@ -169,6 +171,8 @@ with demo:
         near_far_estimated = gr.Checkbox(label="Near Far Estimation", value=True)  # integrate it with
         pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
         fps_input = gr.Number(value=24, label="FPS")
+        aspect_ratio_inputs = gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
         num_frames_input = gr.Number(value=49, label="Number of Frames")
         radius_input = gr.Number(value = 1.0, label="Radius Scale")
         mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,7 +234,7 @@ with demo:
            sampler_input, diff_guidance_input, diff_steps_input,
            prompt_input, neg_prompt_input, refine_prompt_input,
            depth_steps_input, depth_guidance_input,
-           window_input, overlap_input, maxres_input, sample_size, seed_input, height, width
+           window_input, overlap_input, maxres_input, sample_size, seed_input, height, width, aspect_ratio_inputs
        ],
        outputs=[step1_video, step1_logs]
    )
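In get_anchor_video, the new textbox value is split on a comma and the two pieces are appended to the subprocess command as separate tokens, matching the nargs=2 flag added in inference/v2v_data/inference.py. Below is a minimal sketch of a slightly stricter parse; parse_aspect_ratio and its error message are illustrative helpers, not part of this commit.

def parse_aspect_ratio(text: str) -> tuple[str, str]:
    """Validate a textbox value like '2,3' before it reaches the CLI.

    Illustrative only: the committed handler simply does
    `w, h = aspect_ratio_inputs.strip().split(",")` and forwards the raw
    strings, which raises ValueError on any input without exactly one comma.
    """
    parts = [p.strip() for p in text.split(",")]
    if len(parts) != 2 or not all(p.isdigit() for p in parts):
        raise ValueError("Target aspect ratio must look like '2,3'")
    return parts[0], parts[1]


# The two tokens are then forwarded in the order typed, like the other numeric flags:
first, second = parse_aspect_ratio("2,3")
command_tail = ["--target_aspect_ratio", first, second]

Passing the values through as strings keeps the UI thin; argparse does the integer conversion on the inference side.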
inference/v2v_data/demo.py CHANGED

@@ -13,7 +13,7 @@ import torch.nn.functional as F
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 
-def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(
+def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
     aspect_ratio = target_w / target_h
 
@@ -117,7 +117,7 @@ class GetAnchorVideos:
         frame_shape = vr[0].shape  # (H, W, 3)
         ori_resolution = frame_shape[:2]
         print(f"==> original video shape: {frame_shape}")
-        target_resolution = get_center_crop_resolution(original_resoultion = ori_resolution, height = opts.height, width = opts.width)
+        target_resolution = get_center_crop_resolution(original_resoultion = ori_resolution, height = opts.height, width = opts.width, target_aspect_ratio = opts.target_aspect_ratio)
         print(f"==> target video shape resized: {target_resolution}")
 
         prompt = self.get_caption(opts, opts.video_path)
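Only the signature and the first two body lines of get_center_crop_resolution appear in this diff. For orientation, here is a minimal sketch of what a helper with this shape typically computes, assuming it returns the (height, width) to which the centered crop is resized; the crop/scale/rounding logic below is a guess, not the repository's implementation. Note that argparse delivers --target_aspect_ratio 2 3 as [2, 3] and the call site unpacks it as (target_h, target_w), so the first value is treated as the height term of the ratio.

def center_crop_resolution_sketch(original_resolution, height=576, width=1024,
                                  target_aspect_ratio=(2, 3)):
    """Guess at the helper's behavior: take the largest centered crop that
    matches the requested h:w ratio, then scale it to fit inside (height, width)."""
    orig_h, orig_w = original_resolution
    target_h, target_w = target_aspect_ratio
    aspect_ratio = target_w / target_h            # width / height, e.g. 3 / 2 = 1.5

    # Largest centered crop of the source frame with that aspect ratio.
    if orig_w / orig_h > aspect_ratio:            # source too wide -> trim width
        crop_h, crop_w = orig_h, int(round(orig_h * aspect_ratio))
    else:                                         # source too tall -> trim height
        crop_h, crop_w = int(round(orig_w / aspect_ratio)), orig_w

    # Fit the crop into the requested processing size, keeping the ratio.
    scale = min(height / crop_h, width / crop_w)
    out_h = max(2, int(round(crop_h * scale / 2)) * 2)   # even dims (assumption)
    out_w = max(2, int(round(crop_w * scale / 2)) * 2)
    return out_h, out_w


# e.g. a 1080x1920 source with --target_aspect_ratio 2 3 and the 576x1024 defaults:
# crop to 1080x1620, then scale to fit 576x1024 -> (576, 864)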
inference/v2v_data/inference.py CHANGED

@@ -189,6 +189,7 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+    parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
 
     return parser
 
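Taken together, the three files thread the aspect ratio from the Gradio textbox through to the v2v inference CLI. A hedged end-to-end sketch follows; the height, width, and seed values are placeholders, and flags not visible in this commit are omitted, so a real invocation needs the script's remaining required arguments.

import subprocess

# Value typed into the "Target Aspect Ratio (e.g., 2,3)" textbox.
aspect_ratio_inputs = "2,3"
w, h = aspect_ratio_inputs.strip().split(",")       # same split the Gradio handler performs

command = [
    "python", "/app/inference/v2v_data/inference.py",
    "--video_path", "/app/temp_input.mp4",
    "--height", "576",                               # placeholder values
    "--width", "1024",
    "--seed", "42",
    "--target_aspect_ratio", w.strip(), h.strip(),   # consumed by nargs=2, type=int
]
subprocess.run(command, check=True)

On the receiving side, opts.target_aspect_ratio comes back as the integer list [2, 3]. The flag defaults to None, and demo.py now passes opts.target_aspect_ratio straight through, so omitting the flag would make the target_h, target_w unpacking fail unless the call site guards against it; the updated Gradio app always supplies it.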