File size: 9,793 Bytes
344feb9
 
 
 
 
 
 
 
 
 
 
 
10ae2d1
344feb9
 
 
a612409
 
 
344feb9
 
 
 
 
 
 
 
 
 
a612409
344feb9
 
a612409
344feb9
 
 
b878264
a612409
767c99b
b11843e
e999ea2
344feb9
767c99b
 
 
344feb9
 
 
767c99b
6278ae1
767c99b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3241598
767c99b
 
 
 
 
 
 
 
 
 
 
 
 
 
a4870c8
767c99b
a4870c8
767c99b
 
344feb9
767c99b
 
 
e185ea0
767c99b
 
 
 
 
41a5410
344feb9
 
 
2106d07
41a5410
2106d07
70e91c6
41a5410
70e91c6
 
 
41a5410
 
 
344feb9
4a8d08c
 
 
41a5410
344feb9
 
 
 
 
a612409
344feb9
 
 
 
 
 
 
3ed85f1
344feb9
 
 
 
 
 
 
 
 
 
a612409
2a41939
344feb9
 
 
 
 
 
 
 
 
 
3ed85f1
 
e999ea2
3ed85f1
 
344feb9
 
 
 
 
4a8d08c
344feb9
 
 
e999ea2
344feb9
 
 
 
 
4a8d08c
344feb9
 
 
a612409
3ed85f1
2a41939
344feb9
 
 
 
 
 
e999ea2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import os
import gc
import cv2
import gradio as gr
import numpy as np
import matplotlib.cm as cm
import matplotlib  # New import for the updated colormap API
import subprocess
import sys

from utils.dc_utils import read_video_frames, save_video

# Markdown strings rendered at the top of the Gradio UI (see construct_demo).
title = """**RGBD SBS output**"""
description = """**Video Depth Anything** + RGBD sbs output for viewing with Looking Glass Factory displays.
Please refer to our [paper](https://arxiv.org/abs/2501.12375), [project page](https://videodepthanything.github.io/), and [github](https://github.com/DepthAnything/Video-Depth-Anything) for more details."""

def stitch_rgbd_videos(
    processed_video: str,
    depth_vis_video: str,
    max_len: int = -1,
    target_fps: int = -1,
    max_res: int = 1280,
    stitch: bool = True,
    grayscale: bool = True,
    convert_from_color: bool = True,
    blur: float = 0.3,
    output_dir: str = './outputs',
    input_size: int = 518,
):
    """Stitch an RGB video and its depth video side-by-side into one RGBD video.

    The RGB video is read at full resolution; each depth frame is rendered
    (grayscale or inferno colormap), optionally blurred, resized to match the
    RGB frame, and concatenated horizontally. The source audio track is then
    muxed back in with ffmpeg on a best-effort basis.

    Args:
        processed_video: Path to the input RGB video (its audio is preserved).
        depth_vis_video: Path to the matching depth video.
        max_len: Maximum number of frames to read (-1 = all).
        target_fps: FPS to read at (-1 = native FPS of the input).
        max_res: Unused in this function; kept for interface compatibility.
        stitch: When False, nothing is produced and None is returned.
        grayscale: Render the depth half as grayscale instead of inferno color.
        convert_from_color: When grayscale, derive gray via a color rendering.
        blur: Strength in [0, 1] of a Gaussian blur applied to the depth half.
        output_dir: Directory the stitched video is written to.
        input_size: Unused in this function; kept for interface compatibility.

    Returns:
        Path of the written '<name>_RGBD.mp4', or None if stitch is False or
        either input yields no frames.
    """
    video_name = os.path.basename(processed_video)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    stitched_video_path = None
    if stitch:
        # For stitching: read the original video in full resolution (no downscaling).
        full_frames, target_fps = read_video_frames(processed_video, max_len, target_fps, max_res=-1)
        depths, _ = read_video_frames(depth_vis_video, max_len, target_fps, max_res=-1)

        # Guard against empty reads: the debug print below indexes depths[0].
        if len(full_frames) == 0 or len(depths) == 0:
            return None

        print(f"Depth frame shape: {depths[0].shape}, dtype: {depths[0].dtype}, min: {depths[0].min()}, max: {depths[0].max()}")

        # Global depth range used to normalize single-channel depth frames.
        d_min, d_max = np.min(depths), np.max(depths)
        print(f"Depth range: min={d_min}, max={d_max}, diff={d_max-d_min}")

        # Hoisted out of the per-frame loop: the colormap lookup is loop-invariant.
        cmap = matplotlib.colormaps.get_cmap("inferno")

        stitched_frames = []
        for i in range(min(len(full_frames), len(depths))):
            rgb_full = full_frames[i]  # Full-resolution RGB frame.
            depth_vis = _depth_frame_to_vis(
                depths[i], d_min, d_max, grayscale, convert_from_color, cmap
            )

            # Apply Gaussian blur if requested.
            if blur > 0:
                kernel_size = max(1, int(blur * 20) * 2 + 1)  # Ensures an odd kernel size.
                kernel_size = min(kernel_size, 31)  # Cap kernel size at 31 (OpenCV limitation).
                depth_vis = cv2.GaussianBlur(depth_vis, (kernel_size, kernel_size), 0)

            # Resize the depth visualization to match the full-resolution RGB frame.
            H_full, W_full = rgb_full.shape[:2]
            depth_vis_resized = cv2.resize(depth_vis, (W_full, H_full)).astype(np.uint8)
            if len(depth_vis_resized.shape) == 2:
                depth_vis_resized = cv2.cvtColor(depth_vis_resized, cv2.COLOR_GRAY2BGR)

            # Ensure both halves share a dtype before concatenation.
            if rgb_full.dtype != depth_vis_resized.dtype:
                depth_vis_resized = depth_vis_resized.astype(rgb_full.dtype)

            stitched_frames.append(cv2.hconcat([rgb_full, depth_vis_resized]))

            del rgb_full, depth_vis_resized
            gc.collect()  # Force Python to free unused memory between large frames.

        stitched_frames = np.array(stitched_frames)
        # Use only the first 20 characters of the base name and append '_RGBD.mp4'.
        base_name = os.path.splitext(video_name)[0]
        short_name = base_name[:20]
        stitched_video_path = os.path.join(output_dir, short_name + '_RGBD.mp4')
        save_video(stitched_frames, stitched_video_path, fps=target_fps)

        # Best-effort: copy the source audio track into the stitched video.
        _mux_audio(stitched_video_path, processed_video)

    # Return stitched video path (None when stitch=False).
    return stitched_video_path


def _depth_frame_to_vis(depth_frame, d_min, d_max, grayscale, convert_from_color, cmap):
    """Render one raw depth frame as a 3-channel uint8 visualization image."""
    if len(depth_frame.shape) == 3 and depth_frame.shape[2] == 3:
        # The depth is already a color or grayscale image with 3 channels.
        if grayscale:
            if convert_from_color:
                # Convert to grayscale if it's a color image.
                depth_gray = cv2.cvtColor(depth_frame, cv2.COLOR_RGB2GRAY)
                return np.stack([depth_gray] * 3, axis=-1)
            # Assume it's already the right format.
            return depth_frame
        if depth_frame.max() > 0:  # Ensure we have valid depth data.
            # Collapse to a single channel, normalize to [0, 1], then colormap.
            depth_gray = cv2.cvtColor(depth_frame, cv2.COLOR_RGB2GRAY)
            return (cmap(depth_gray / 255.0)[..., :3] * 255).astype(np.uint8)
        # If zero depth, just use the original.
        return depth_frame

    # Single-channel depth: normalize using the global [d_min, d_max] range.
    if d_max == d_min:
        d_max = d_min + 1  # Avoid division by zero on constant-depth input.
    depth_norm = np.clip((depth_frame - d_min) / (d_max - d_min) * 255, 0, 255).astype(np.uint8)

    # Ensure depth_norm is 2D (remove singleton dimensions if present).
    if depth_norm.ndim == 3:
        depth_norm = np.squeeze(depth_norm)

    if grayscale:
        if convert_from_color:
            # Colormap first, then convert that color image to grayscale.
            depth_color = (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)
            depth_gray = cv2.cvtColor(depth_color, cv2.COLOR_RGB2GRAY)
            return np.stack([depth_gray] * 3, axis=-1)
        # Directly generate a grayscale image from the normalized depth values.
        return np.stack([depth_norm] * 3, axis=-1)
    # Generate a color depth image using the inferno colormap.
    return (cmap(depth_norm / 255.0)[..., :3] * 255).astype(np.uint8)


def _mux_audio(stitched_video_path, source_video):
    """Merge the audio track of source_video into stitched_video_path via ffmpeg.

    Best effort: if ffmpeg is unavailable or fails (e.g. no audio stream and a
    broken mapping), the silent stitched video is left untouched instead of
    crashing the whole job with a missing-file error on os.replace.
    """
    temp_audio_path = stitched_video_path.replace('_RGBD.mp4', '_RGBD_audio.mp4')
    cmd = [
        "ffmpeg",
        "-y",
        "-i", stitched_video_path,
        "-i", source_video,
        "-c:v", "copy",
        "-c:a", "aac",
        "-map", "0:v:0",
        "-map", "1:a:0?",  # '?' makes the audio stream optional.
        "-shortest",
        temp_audio_path
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        # ffmpeg binary not installed — keep the silent video.
        return
    if result.returncode == 0 and os.path.exists(temp_audio_path):
        os.replace(temp_audio_path, stitched_video_path)
    elif os.path.exists(temp_audio_path):
        # ffmpeg wrote a partial/failed output — discard it.
        os.remove(temp_audio_path)

def construct_demo():
    """Build and return the Gradio Blocks UI for the RGBD stitching demo."""
    with gr.Blocks(analytics_enabled=False) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        gr.Markdown("### If you find this work useful, please help ⭐ the [Github Repo](https://github.com/DepthAnything/Video-Depth-Anything). Thanks for your attention!")

        # Top row: the two source videos and the stitched result, side by side.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                # Upload widget for the original RGB video (its audio is kept).
                rgb_video_in = gr.Video(label="Input Video with Audio")
            with gr.Column(scale=1):
                depth_video_in = gr.Video(label="Depth Video")
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    rgbd_video_out = gr.Video(label="Stitched RGBD Video", interactive=False, autoplay=True, loop=True, show_share_button=True, scale=5)

        # Bottom row: processing options plus the trigger button.
        with gr.Row(equal_height=True):
            with gr.Column(scale=2):
                with gr.Accordion("Advanced Settings", open=False):
                    len_slider = gr.Slider(label="Max process length", minimum=-1, maximum=1000, value=-1, step=1)
                    fps_slider = gr.Slider(label="Target FPS", minimum=-1, maximum=30, value=-1, step=1)
                    res_slider = gr.Slider(label="Max side resolution", minimum=480, maximum=1920, value=1920, step=1)
                    stitch_box = gr.Checkbox(label="Stitch RGB & Depth Videos", value=True)
                    grayscale_box = gr.Checkbox(label="Output Depth as Grayscale", value=True)
                    convert_box = gr.Checkbox(label="Convert Grayscale from Color", value=True)
                    blur_amount = gr.Slider(minimum=0, maximum=1, step=0.01, label="Depth Blur (can reduce edge artifacts on display)", value=0.3)
                run_button = gr.Button("Generate")
            with gr.Column(scale=1):
                # Spacer column keeping the layout balanced.
                pass

        # Wire the button to the stitching backend.
        run_button.click(
            fn=stitch_rgbd_videos,
            inputs=[rgb_video_in, depth_video_in, len_slider, fps_slider, res_slider, stitch_box, grayscale_box, convert_box, blur_amount],
            outputs=rgbd_video_out,
        )

    return demo

if __name__ == "__main__":
    # Build the UI and serve it; the queue caps concurrent jobs at 4.
    construct_demo().queue(max_size=4).launch()