Muhammad Taqi Raza committed on
Commit 8eeb1c7 · 1 Parent(s): 7f5f48b

correct infer_gradual

gradio_app.py CHANGED
@@ -166,7 +166,7 @@ with demo:
     depth_guidance_input = gr.Number(value=1.0, label="Depth Guidance")
     window_input = gr.Number(value=64, label="Window Size")
     overlap_input = gr.Number(value=25, label="Overlap")
-    maxres_input = gr.Number(value=1024, label="Max Resolution")
+    maxres_input = gr.Number(value=1920, label="Max Resolution")
     sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
     seed_input = gr.Number(value=43, label="Seed")
     height = gr.Number(value=576, label="Height")
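
The only functional change to gradio_app.py is the default "Max Resolution" moving from 1024 to 1920. In read_video_frames (patched in inference/v2v_data/models/utils.py below), max_res caps the longer side of the 64-aligned working resolution, so the larger default lets full-HD inputs keep their native size instead of being downscaled. A minimal sketch of that interaction, assuming a 1080x1920 input (working_resolution is an illustrative helper, not a repo function):

    def working_resolution(original_height, original_width, max_res):
        # Same rounding rule as the new read_video_frames: snap each side to a
        # multiple of 64, then rescale only if the longer side exceeds max_res.
        height = round(original_height / 64) * 64
        width = round(original_width / 64) * 64
        if max(height, width) > max_res:
            scale = max_res / max(original_height, original_width)
            height = round(original_height * scale / 64) * 64
            width = round(original_width * scale / 64) * 64
        return height, width

    print(working_resolution(1080, 1920, 1024))  # old default -> (576, 1024)
    print(working_resolution(1080, 1920, 1920))  # new default -> (1088, 1920)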
inference/v2v_data/demo.py CHANGED
@@ -111,14 +111,14 @@ class GetAnchorVideos:
 
     def infer_gradual(self, opts):
         frames = read_video_frames(
-            opts.video_path, opts.video_length, opts.stride, opts.max_res, height = opts.height, width = opts.width
+            opts.video_path, opts.video_length, opts.stride, opts.max_res
         )
         vr = VideoReader(opts.video_path, ctx=cpu(0))
         frame_shape = vr[0].shape  # (H, W, 3)
         ori_resolution = frame_shape[:2]
         print(f"==> original video shape: {frame_shape}")
-        # target_resolution = get_center_crop_resolution(original_resoultion = ori_resolution, height = opts.height, width = opts.width)
-        # print(f"==> target video shape resized: {target_resolution}")
+        target_resolution = get_center_crop_resolution(original_resoultion = ori_resolution, height = opts.height, width = opts.width)
+        print(f"==> target video shape resized: {target_resolution}")
 
         prompt = self.get_caption(opts, opts.video_path)
         depths = self.depth_estimater.infer(
@@ -138,8 +138,8 @@ class GetAnchorVideos:
         print(f"==> opts video length: {opts.video_length}")
         assert frames.shape[0] == opts.video_length
 
-        # depths = center_crop_to_ratio(depths, resolution=target_resolution)
-        # frames = center_crop_to_ratio(frames, resolution=target_resolution)
+        depths = center_crop_to_ratio(depths, resolution=target_resolution)
+        frames = center_crop_to_ratio(frames, resolution=target_resolution)
         pose_s, pose_t, K = self.get_poses(opts, depths, num_frames=opts.video_length)
         warped_images = []
         masks = []
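
infer_gradual now resizes purely by max_res inside read_video_frames and then center-crops both depths and frames to a target resolution derived from the source video. Neither get_center_crop_resolution nor center_crop_to_ratio appears in this diff, so the following is only a plausible reading of their behavior inferred from the call sites, not the repo's code (the misspelled original_resoultion keyword is kept to match the call above):

    def get_center_crop_resolution(original_resoultion, height=576, width=1024):
        # Assumed behavior: largest (h, w) with the target aspect ratio that
        # fits inside the original resolution.
        orig_h, orig_w = original_resoultion
        target_ratio = width / height
        if orig_w / orig_h > target_ratio:
            crop_h, crop_w = orig_h, int(orig_h * target_ratio)
        else:
            crop_h, crop_w = int(orig_w / target_ratio), orig_w
        return crop_h, crop_w

    def center_crop_to_ratio(frames, resolution):
        # Assumed behavior: center-crop an array shaped [T, H, W, ...] to `resolution`.
        crop_h, crop_w = resolution
        h, w = frames.shape[1], frames.shape[2]
        top, left = (h - crop_h) // 2, (w - crop_w) // 2
        return frames[:, top:top + crop_h, left:left + crop_w]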
inference/v2v_data/models/utils.py CHANGED
@@ -28,16 +28,57 @@ from decord import VideoReader, cpu
 
 from PIL import Image
 
-def read_video_frames(video_path, process_length, stride, max_res, dataset="open", height=576, width=1024):
+# def read_video_frames(video_path, process_length, target_fps, max_res, dataset="open"):
+#     if dataset == "open":
+#         print("==> processing video: ", video_path)
+#         vid = VideoReader(video_path, ctx=cpu(0))
+#         print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]))
+#         original_height, original_width = vid.get_batch([0]).shape[1:3]
+#         height = round(original_height / 64) * 64
+#         width = round(original_width / 64) * 64
+#         if max(height, width) > max_res:
+#             scale = max_res / max(original_height, original_width)
+#             height = round(original_height * scale / 64) * 64
+#             width = round(original_width * scale / 64) * 64
+#     else:
+#         height = dataset_res_dict[dataset][0]
+#         width = dataset_res_dict[dataset][1]
+#
+#     vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
+#
+#     fps = vid.get_avg_fps() if target_fps == -1 else target_fps
+#     stride = round(vid.get_avg_fps() / fps)
+#     stride = max(stride, 1)
+#     frames_idx = list(range(0, len(vid), stride))
+#     print(
+#         f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
+#     )
+#     if process_length != -1 and process_length < len(frames_idx):
+#         frames_idx = frames_idx[:process_length]
+#     print(
+#         f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
+#     )
+#     frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0
+#
+#     return frames, fps
+def read_video_frames(video_path, process_length, stride, max_res, dataset="open"):
     def is_image(path):
         return any(path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp'])
-
-    if is_image(video_path):
+
+    if is_image(video_path):
         print("==> Detected image. Loading as single-frame video:", video_path)
         img = Image.open(video_path).convert("RGB")
-        # FIXME: hard coded
-        width = width
-        height = height
+        original_width = img.width
+        original_height = img.height
+
+        height = round(original_height / 64) * 64
+        width = round(original_width / 64) * 64
+
+        if max(height, width) > max_res:
+            scale = max_res / max(original_height, original_width)
+            height = round(original_height * scale / 64) * 64
+            width = round(original_width * scale / 64) * 64
+
         img = img.resize((width, height), Image.BICUBIC)
         img = np.array(img).astype("float32") / 255.0  # [H, W, 3]
         frames = img[None, ...]  # [1, H, W, 3]
@@ -49,9 +90,15 @@ def read_video_frames(video_path, process_length, stride, max_res, dataset="open
         vid = VideoReader(video_path, ctx=cpu(0))
         print("==> original video shape:", (len(vid), *vid.get_batch([0]).shape[1:]))
 
-        # FIXME: hard coded
-        width = width
-        height = height
+        original_height, original_width = vid.get_batch([0]).shape[1:3]
+
+        height = round(original_height / 64) * 64
+        width = round(original_width / 64) * 64
+
+        if max(height, width) > max_res:
+            scale = max_res / max(original_height, original_width)
+            height = round(original_height * scale / 64) * 64
+            width = round(original_width * scale / 64) * 64
 
         vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
 
@@ -64,8 +111,6 @@ def read_video_frames(video_path, process_length, stride, max_res, dataset="open
 
     return frames
 
-
-
 def save_video(data, images_path, folder=None, fps=8):
     if isinstance(data, np.ndarray):
         tensor_data = (torch.from_numpy(data) * 255).to(torch.uint8)
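
With the height/width parameters gone, the rewritten read_video_frames derives the working size from the source itself in both branches: round each side to a multiple of 64, then scale down only when the longer side exceeds max_res. A hedged usage sketch matching the new call in demo.py (the path and argument values are placeholders; the returned array is presumably float32 in [0, 1] with shape [T, H, W, 3], as in the single-image branch shown above):

    # Placeholder call; relies on decord, numpy, and Pillow as imported at the top of utils.py.
    frames = read_video_frames(
        "input.mp4",  # video_path: a video, or a single .jpg/.jpeg/.png/.bmp image
        49,           # process_length: number of frames to keep
        1,            # stride: frame sampling stride
        1920,         # max_res: cap on the longer side of the 64-aligned working size
    )
    print(frames.shape, frames.dtype)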