Spaces:

mtwohey2
/

Depth_Stitcher

Running

App Files Files Community

mtwohey2 committed on Mar 3

Commit

6bfa5c9

verified ·

1 Parent(s): de898e9

Update app.py

Browse files

Lower memory usage by processing the videos one frame at a time instead of loading all frames into memory.

Files changed (1) hide show

app.py +92 -41

app.py CHANGED Viewed

@@ -33,36 +33,85 @@ def stitch_rgbd_videos(
     stitched_video_path = None
     if stitch:
-        # Ensure target_fps is valid (positive) or use original fps
-        safe_target_fps = max(1, target_fps) if target_fps > 0 else -1
-        # For stitching: read the original video in full resolution (without downscaling).
-        full_frames, original_fps = read_video_frames(processed_video, max_len, target_fps, max_res=-1)
-        depths, _ = read_video_frames(depth_vis_video, max_len, target_fps, max_res=-1)
-        # Use original_fps if target_fps wasn't specified
         if target_fps <= 0:
             target_fps = original_fps
-        print(f"Video fps: {original_fps}, target fps: {target_fps}")
-        print(f"Depth frame shape: {depths[0].shape if len(depths) > 0 else 'No frames'}, "
-              f"dtype: {depths[0].dtype if len(depths) > 0 else 'N/A'}, "
-              f"min: {depths.min() if len(depths) > 0 else 'N/A'}, "
-              f"max: {depths.max() if len(depths) > 0 else 'N/A'}")
-        if len(depths) == 0 or len(full_frames) == 0:
-            print("Error: No frames to process in either RGB or depth video")
             return None
-        # For each frame, create a visual depth image from the inferenced depths.
-        d_min, d_max = np.min(depths), np.max(depths)
-        print(f"Depth range: min={d_min}, max={d_max}, diff={d_max-d_min}")
-        stitched_frames = []
-        for i in range(min(len(full_frames), len(depths))):
-            rgb_full = full_frames[i]  # Full-resolution RGB frame.
-            depth_frame = depths[i]  # Already in uint8 format
             if grayscale:
                 if convert_from_color:
                     # Convert to grayscale if it's a color image
@@ -72,7 +121,7 @@ def stitch_rgbd_videos(
                     # Assume it's already the right format
                     depth_vis = depth_frame
             else:
-                if depth_frame.max() > 0:  # Ensure we have valid depth data
                     # Use the inferno colormap if requested
                     cmap = matplotlib.colormaps.get_cmap("inferno")
                     # Convert to single channel first
@@ -84,7 +133,6 @@ def stitch_rgbd_videos(
                 else:
                     # If zero depth, just use the original
                     depth_vis = depth_frame
             # Apply Gaussian blur if requested
             if blur > 0:
@@ -97,26 +145,27 @@ def stitch_rgbd_videos(
             depth_vis_resized = cv2.resize(depth_vis, (W_full, H_full))
             depth_vis_resized = depth_vis_resized.astype(np.uint8)  # Ensure uint8
-            if len(depth_vis_resized.shape) == 2:
-                depth_vis_resized = cv2.cvtColor(depth_vis_resized, cv2.COLOR_GRAY2BGR)
-            # Ensure both are the same type (commonly uint8):
-            if rgb_full.dtype != depth_vis_resized.dtype:
-                depth_vis_resized = depth_vis_resized.astype(rgb_full.dtype)
-            # Now safely concatenate.
-            stitched = cv2.hconcat([rgb_full, depth_vis_resized])
-            stitched_frames.append(stitched)
-            del rgb_full, depth_vis_resized, stitched
-            gc.collect()  # Force Python to free unused memory
-        stitched_frames = np.array(stitched_frames)
-        # Use only the first 20 characters of the base name for the output filename and append '_RGBD.mp4'
-        base_name = os.path.splitext(video_name)[0]
-        short_name = base_name[:20]
-        stitched_video_path = os.path.join(output_dir, short_name + '_RGBD.mp4')
-        save_video(stitched_frames, stitched_video_path, fps=target_fps)
         # Merge audio from the input video into the stitched video using ffmpeg.
         temp_audio_path = stitched_video_path.replace('_RGBD.mp4', '_RGBD_audio.mp4')
@@ -134,6 +183,8 @@ def stitch_rgbd_videos(
         ]
         subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         os.replace(temp_audio_path, stitched_video_path)
     # Return stitched video.
     return stitched_video_path

     stitched_video_path = None
     if stitch:
+        # Process videos frame by frame
+        cap_rgb = cv2.VideoCapture(processed_video)
+        cap_depth = cv2.VideoCapture(depth_vis_video)
+        if not cap_rgb.isOpened() or not cap_depth.isOpened():
+            print("Error: Could not open one or both videos")
+            return None
+        # Get video properties
+        original_fps = cap_rgb.get(cv2.CAP_PROP_FPS)
         if target_fps <= 0:
             target_fps = original_fps
+        # Calculate stride for frame skipping
+        stride = max(round(original_fps / target_fps), 1) if target_fps > 0 else 1
+        # Get frame counts for progress reporting
+        total_frames_rgb = int(cap_rgb.get(cv2.CAP_PROP_FRAME_COUNT))
+        print(f"Video fps: {original_fps}, target fps: {target_fps}, total frames: {total_frames_rgb}")
+        # Set up video writer
+        base_name = os.path.splitext(video_name)[0]
+        short_name = base_name[:20]
+        stitched_video_path = os.path.join(output_dir, short_name + '_RGBD.mp4')
+        # Get first frame to determine dimensions
+        ret_rgb, first_frame_rgb = cap_rgb.read()
+        ret_depth, first_frame_depth = cap_depth.read()
+        if not ret_rgb or not ret_depth:
+            print("Error: Could not read first frame from one or both videos")
             return None
+        # Reset video captures
+        cap_rgb.set(cv2.CAP_PROP_POS_FRAMES, 0)
+        cap_depth.set(cv2.CAP_PROP_POS_FRAMES, 0)
+        # Get output dimensions
+        H_full, W_full = first_frame_rgb.shape[:2]
+        output_width = W_full * 2  # RGB and depth side by side
+        output_height = H_full
+        # Initialize video writer
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(stitched_video_path, fourcc, target_fps, (output_width, output_height))
+        # Process frames one by one
+        frame_count = 0
+        processed_count = 0
+        while True:
+            # Read frames
+            ret_rgb, rgb_full = cap_rgb.read()
+            ret_depth, depth_frame = cap_depth.read()
+            # Break if either video ends
+            if not ret_rgb or not ret_depth:
+                break
+            # Skip frames based on stride
+            frame_count += 1
+            if frame_count % stride != 0:
+                continue
+            processed_count += 1
+            # Set max_len limit if specified
+            if max_len > 0 and processed_count > max_len:
+                break
+            # Process RGB frame - resize if max_res is specified
+            if max_res > 0:
+                h, w = rgb_full.shape[:2]
+                if max(h, w) > max_res:
+                    scale = max_res / max(h, w)
+                    new_h, new_w = int(h * scale), int(w * scale)
+                    rgb_full = cv2.resize(rgb_full, (new_w, new_h))
+            # Process depth frame based on settings (assuming always 3-channel)
             if grayscale:
                 if convert_from_color:
                     # Convert to grayscale if it's a color image
                     # Assume it's already the right format
                     depth_vis = depth_frame
             else:
+                if np.max(depth_frame) > 0:  # Ensure we have valid depth data
                     # Use the inferno colormap if requested
                     cmap = matplotlib.colormaps.get_cmap("inferno")
                     # Convert to single channel first
                 else:
                     # If zero depth, just use the original
                     depth_vis = depth_frame
             # Apply Gaussian blur if requested
             if blur > 0:
             depth_vis_resized = cv2.resize(depth_vis, (W_full, H_full))
             depth_vis_resized = depth_vis_resized.astype(np.uint8)  # Ensure uint8
+            # Concatenate frames
+            stitched = cv2.hconcat([rgb_full, depth_vis_resized])
+            # Write frame
+            out.write(stitched)
+            # Free memory
+            del rgb_full, depth_vis, depth_vis_resized, stitched
+            # Progress report
+            if processed_count % 10 == 0:
+                print(f"Processed {processed_count} frames...")
+            # Force garbage collection periodically
+            if processed_count % 50 == 0:
+                gc.collect()
+        # Release resources
+        cap_rgb.release()
+        cap_depth.release()
+        out.release()
         # Merge audio from the input video into the stitched video using ffmpeg.
         temp_audio_path = stitched_video_path.replace('_RGBD.mp4', '_RGBD_audio.mp4')
         ]
         subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         os.replace(temp_audio_path, stitched_video_path)
+        print(f"Completed processing {processed_count} frames")
     # Return stitched video.
     return stitched_video_path