Spaces: Running on Zero

Anton Obukhov committed a20d271 (1 parent: 9c30a60)

update to use the new template
Files changed:

- CONTRIBUTING.md +0 -15
- README.md +2 -2
- app.py +127 -578
- extrude.py +0 -354
- files/{bee_depth_fp32.npy → arc.jpeg} +2 -2
- files/bee.jpg +2 -2
- files/bee_depth_16bit.png +0 -0
- files/bee_depth_colored.png +0 -0
- files/{cat_depth_fp32.npy → berries.jpeg} +2 -2
- files/{einstein_depth_16bit.png → butterfly.jpeg} +2 -2
- files/cat.jpg +2 -2
- files/cat_depth_16bit.png +0 -0
- files/cat_depth_colored.png +0 -0
- files/{einstein_depth_fp32.npy → concert.jpeg} +2 -2
- files/dog.jpeg +3 -0
- files/doughnuts.jpeg +3 -0
- files/einstein.jpg +0 -0
- files/einstein_depth_colored.png +0 -0
- files/food.jpeg +3 -0
- files/glasses.jpeg +3 -0
- files/house.jpg +3 -0
- files/lake.jpeg +3 -0
- files/marigold.jpeg +3 -0
- files/portrait_1.jpeg +3 -0
- files/portrait_2.jpeg +3 -0
- files/pumpkins.jpg +3 -0
- files/puzzle.jpeg +3 -0
- files/road.jpg +3 -0
- files/scientists.jpg +3 -0
- files/surfboards.jpeg +3 -0
- files/surfer.jpeg +3 -0
- files/swings.jpg +0 -0
- files/swings_depth_16bit.png +0 -0
- files/swings_depth_colored.png +0 -0
- files/swings_depth_fp32.npy +0 -3
- files/switzerland.jpeg +3 -0
- files/teamwork.jpeg +3 -0
- files/wave.jpeg +3 -0
- marigold_depth_estimation.py +0 -632
- marigold_logo_square.jpg +0 -0
- requirements.txt +9 -13
CONTRIBUTING.md
DELETED
@@ -1,15 +0,0 @@
-## Contributing instructions
-
-We appreciate your interest in contributing. Please follow these guidelines:
-
-1. **Discuss Changes:** Start a GitHub issue to talk about your proposed change before proceeding.
-
-2. **Pull Requests:** Avoid unsolicited PRs. Discussion helps align with project goals.
-
-3. **License Agreement:** By submitting a PR, you accept our LICENSE terms.
-
-4. **Legal Compatibility:** Ensure your change complies with our project's objectives and licensing.
-
-5. **Attribution:** Credit third-party code in your PR if used.
-
-Please, feel free to reach out for questions or assistance. Your contributions are valued, and we're excited to work together to enhance this project!
README.md
CHANGED
@@ -4,12 +4,12 @@ emoji: 🏵️
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.44.1
 app_file: app.py
 pinned: true
 license: cc-by-sa-4.0
 models:
-  - prs-eth/marigold-v1-0
+  - prs-eth/marigold-depth-v1-0
 ---
 
 This is a demo of the monocular depth estimation pipeline, described in the CVPR 2024 paper titled ["Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation"](https://arxiv.org/abs/2312.02145)
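The updated Space card pins `sdk_version: 4.44.1` and points at the renamed checkpoint `prs-eth/marigold-depth-v1-0`. A small sketch, not part of the commit, that parses the new frontmatter values shown above (assumes PyYAML is installed; the title and emoji keys outside this hunk are omitted):

```python
import yaml

# Frontmatter values copied from the diff above.
frontmatter = """
sdk: gradio
sdk_version: 4.44.1
app_file: app.py
pinned: true
license: cc-by-sa-4.0
models:
  - prs-eth/marigold-depth-v1-0
"""

meta = yaml.safe_load(frontmatter)
assert meta["sdk_version"] == "4.44.1"            # parsed as a string (two dots)
assert meta["models"] == ["prs-eth/marigold-depth-v1-0"]
print(meta)
```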
app.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2023-2025 Marigold Team, ETH Zürich. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,613 +12,162 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # --------------------------------------------------------------------------
-#
-#
-#
+# More information about Marigold:
+# https://marigoldmonodepth.github.io
+# https://marigoldcomputervision.github.io
+# Efficient inference pipelines are now part of diffusers:
+# https://huggingface.co/docs/diffusers/using-diffusers/marigold_usage
+# https://huggingface.co/docs/diffusers/api/pipelines/marigold
+# Examples of trained models and live demos:
+# https://huggingface.co/prs-eth
+# Related projects:
+# https://marigolddepthcompletion.github.io/
+# https://rollingdepth.github.io/
+# Citation (BibTeX):
+# https://github.com/prs-eth/Marigold#-citation
+# https://github.com/prs-eth/Marigold-DC#-citation
+# https://github.com/prs-eth/rollingdepth#-citation
 # --------------------------------------------------------------------------
 
-
-import functools
 import os
-
-import spaces
 import gradio as gr
-import numpy as np
 import torch as torch
-from
-
-from gradio_imageslider import ImageSlider
+from diffusers import MarigoldDepthPipeline, DDIMScheduler
+from gradio_dualvision import DualVisionApp
 from huggingface_hub import login
+from PIL import Image
 
-
-from marigold_depth_estimation import MarigoldPipeline
-
-
-def process(
-    pipe,
-    path_input,
-    ensemble_size,
-    denoise_steps,
-    processing_res,
-    path_out_16bit=None,
-    path_out_fp32=None,
-    path_out_vis=None,
-    _input_3d_plane_near=None,
-    _input_3d_plane_far=None,
-    _input_3d_embossing=None,
-    _input_3d_filter_size=None,
-    _input_3d_frame_near=None,
-):
-    if path_out_vis is not None:
-        return (
-            [path_out_16bit, path_out_vis],
-            [path_out_16bit, path_out_fp32, path_out_vis],
-        )
-
-    input_image = Image.open(path_input)
-
-    pipe_out = pipe(
-        input_image,
-        ensemble_size=ensemble_size,
-        denoising_steps=denoise_steps,
-        processing_res=processing_res,
-        batch_size=1 if processing_res == 0 else 0,
-        show_progress_bar=True,
-    )
-
-    depth_pred = pipe_out.depth_np
-    depth_colored = pipe_out.depth_colored
-    depth_16bit = (depth_pred * 65535.0).astype(np.uint16)
-
-    path_output_dir = os.path.splitext(path_input)[0] + "_output"
-    os.makedirs(path_output_dir, exist_ok=True)
-
-    name_base = os.path.splitext(os.path.basename(path_input))[0]
-    path_out_fp32 = os.path.join(path_output_dir, f"{name_base}_depth_fp32.npy")
-    path_out_16bit = os.path.join(path_output_dir, f"{name_base}_depth_16bit.png")
-    path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.png")
-
-    np.save(path_out_fp32, depth_pred)
-    Image.fromarray(depth_16bit).save(path_out_16bit, mode="I;16")
-    depth_colored.save(path_out_vis)
-
-    return (
-        [path_out_16bit, path_out_vis],
-        [path_out_16bit, path_out_fp32, path_out_vis],
-    )
-
-
-def process_3d(
-    input_image,
-    files,
-    size_longest_px,
-    size_longest_cm,
-    filter_size,
-    plane_near,
-    plane_far,
-    embossing,
-    frame_thickness,
-    frame_near,
-    frame_far,
-):
-    if input_image is None or len(files) < 1:
-        raise gr.Error(
-            "Please upload an image (or use examples) and compute depth first"
-        )
-
-    if plane_near >= plane_far:
-        raise gr.Error("NEAR plane must have a value smaller than the FAR plane")
-
-    def _process_3d(
-        size_longest_px,
-        filter_size,
-        vertex_colors,
-        scene_lights,
-        output_model_scale=None,
-        prepare_for_3d_printing=False,
-    ):
-        image_rgb = input_image
-        image_depth = files[0]
-
-        image_rgb_basename, image_rgb_ext = os.path.splitext(image_rgb)
-        image_depth_basename, image_depth_ext = os.path.splitext(image_depth)
-
-        image_rgb_content = Image.open(image_rgb)
-        image_rgb_w, image_rgb_h = image_rgb_content.width, image_rgb_content.height
-        image_rgb_d = max(image_rgb_w, image_rgb_h)
-        image_new_w = size_longest_px * image_rgb_w // image_rgb_d
-        image_new_h = size_longest_px * image_rgb_h // image_rgb_d
-        [lines 131-133 not rendered in the diff view]
-        image_rgb_content.resize((image_new_w, image_new_h), Image.LANCZOS).save(
-            image_rgb_new
-        )
-        Image.open(image_depth).resize((image_new_w, image_new_h), Image.BILINEAR).save(
-            image_depth_new
-        )
-
-        [lines 141-142 not rendered in the diff view]
-            image_depth_new,
-            output_model_scale=(
-                size_longest_cm * 10
-                if output_model_scale is None
-                else output_model_scale
-            ),
-            filter_size=filter_size,
-            coef_near=plane_near,
-            coef_far=plane_far,
-            emboss=embossing / 100,
-            f_thic=frame_thickness / 100,
-            f_near=frame_near / 100,
-            f_back=frame_far / 100,
-            vertex_colors=vertex_colors,
-            scene_lights=scene_lights,
-            prepare_for_3d_printing=prepare_for_3d_printing,
-        )
-
-    path_viewer_glb, _ = _process_3d(
-        256, filter_size, vertex_colors=False, scene_lights=True, output_model_scale=1
-    )
-    path_files_glb, path_files_stl = _process_3d(
-        size_longest_px,
-        filter_size,
-        vertex_colors=True,
-        scene_lights=False,
-        prepare_for_3d_printing=True,
-    )
-    [lines 174-176 not rendered in the diff view]
-
-
-def run_demo_server(pipe):
-    process_pipe = spaces.GPU(functools.partial(process, pipe), duration=120)
-    os.environ["GRADIO_ALLOW_FLAGGING"] = "never"
-
-    with gr.Blocks(
-        analytics_enabled=False,
-        title="Marigold Depth Estimation",
-        css="""
-            #download { height: 118px; }
-            .slider .inner { width: 5px; background: #FFF; }
-            .viewport { aspect-ratio: 4/3; }
-            h1 { text-align: center; display: block; }
-            h2 { text-align: center; display: block; }
-            h3 { text-align: center; display: block; }
-        """,
-    ) as demo:
-        gr.Markdown(
-            """
-            <p align="center">
-            <a title="Website" href="https://marigoldmonodepth.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://
-            </a>
-            <a title="arXiv" href="https://arxiv.org/abs/2312.02145" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://
-            </a>
-            <a title="Github" href="https://github.com/prs-eth/marigold" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://img.shields.io/github/stars/prs-eth/marigold?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
-            </a>
-            <a title="Social" href="https://twitter.com/antonobukhov1" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
-                <img src="https://
-            </a>
-            </p>
-
-            Marigold is the state-of-the-art depth estimator for images in the wild.
-            Upload your image into the <b>first</b> pane, or click any of the <b>examples</b> below.
-            The result will be computed and appear in the <b>second</b> pane.
-            Scroll down to use the computed depth map for creating a 3D printable asset.
-
-            <a href="https://huggingface.co/spaces/prs-eth/marigold-lcm" style="color: crimson;">
-                <h3 style="color: crimson;">Check out Marigold-LCM — a FAST version of this demo!<h3>
-            </a>
-            """
-        )
-
-        [lines 239-255 not rendered in the diff view]
-                    maximum=20,
-                    step=1,
-                    value=10,
-                )
-                processing_res = gr.Radio(
-                    [
-                        ("Native", 0),
-                        ("Recommended", 768),
-                    ],
-                    label="Processing resolution",
-                    value=768,
-                )
-                input_output_16bit = gr.File(
-                    label="Predicted depth (16-bit)", visible=False,
-                )
-                input_output_fp32 = gr.File(
-                    label="Predicted depth (32-bit)", visible=False,
-                )
-                input_output_vis = gr.File(
-                    label="Predicted depth (red-near, blue-far)", visible=False,
-                )
-                with gr.Row():
-                    submit_btn = gr.Button(value="Compute Depth", variant="primary")
-                    clear_btn = gr.Button(value="Clear")
-            with gr.Column():
-                output_slider = ImageSlider(
-                    label="Predicted depth (red-near, blue-far)",
-                    type="filepath",
-                    show_download_button=True,
-                    show_share_button=True,
-                    interactive=False,
-                    elem_classes="slider",
-                    position=0.25,
-                )
-                files = gr.Files(
-                    label="Depth outputs", elem_id="download", interactive=False,
-                )
-
-        demo_3d_header = gr.Markdown(
-            """
-            <h3 align="center">3D Printing Depth Maps</h3>
-            <p align="justify">
-                This part of the demo uses Marigold depth maps estimated in the previous step to create a
-                3D-printable model. The models are watertight, with correct normals, and exported in the STL format.
-                We recommended creating the first model with the default parameters and iterating on it until the best
-                result (see Pro Tips below).
-            </p>
-            """,
-            render=False,
-        )
-
-        demo_3d = gr.Row(render=False)
-        with demo_3d:
-            with gr.Column():
-                with gr.Accordion("3D printing demo: Main options", open=True):
-                    plane_near = gr.Slider(
-                        label="Relative position of the near plane (between 0 and 1)",
-                        minimum=0.0, maximum=1.0, step=0.001, value=0.0,
-                    )
-                    plane_far = gr.Slider(
-                        label="Relative position of the far plane (between near and 1)",
-                        minimum=0.0, maximum=1.0, step=0.001, value=1.0,
-                    )
-                    embossing = gr.Slider(
-                        label="Embossing level", minimum=0, maximum=100, step=1, value=20,
-                    )
-                with gr.Accordion("3D printing demo: Advanced options", open=False):
-                    size_longest_px = gr.Slider(
-                        label="Size (px) of the longest side", minimum=256, maximum=1024, step=256, value=512,
-                    )
-                    size_longest_cm = gr.Slider(
-                        label="Size (cm) of the longest side", minimum=1, maximum=100, step=1, value=10,
-                    )
-                    filter_size = gr.Slider(
-                        label="Size (px) of the smoothing filter", minimum=1, maximum=5, step=2, value=3,
-                    )
-                    frame_thickness = gr.Slider(
-                        label="Frame thickness", minimum=0, maximum=100, step=1, value=5,
-                    )
-                    frame_near = gr.Slider(
-                        label="Frame's near plane offset", minimum=-100, maximum=100, step=1, value=1,
-                    )
-                    frame_far = gr.Slider(
-                        label="Frame's far plane offset", minimum=1, maximum=10, step=1, value=1,
-                    )
-                with gr.Row():
-                    submit_3d = gr.Button(value="Create 3D", variant="primary")
-                    clear_3d = gr.Button(value="Clear 3D")
-                gr.Markdown(
-                    """
-                    <h5 align="center">Pro Tips</h5>
-                    <ol>
-                    <li><b>Re-render with new parameters</b>: Click "Clear 3D" and then "Create 3D".</li>
-                    <li><b>Adjust 3D scale and cut-off focus</b>: Set the frame's near plane offset to the
-                    minimum and use 3D preview to evaluate depth scaling. Repeat until the scale is correct and
-                    everything important is in the focus. Set the optimal value for frame's near
-                    plane offset as a last step.</li>
-                    <li><b>Increase details</b>: Decrease size of the smoothing filter (also increases noise).</li>
-                    </ol>
-                    """
-                )
-
-            with gr.Column():
-                viewer_3d = gr.Model3D(
-                    camera_position=(75.0, 90.0, 1.25),
-                    elem_classes="viewport",
-                    label="3D preview (low-res, relief highlight)",
-                    interactive=False,
-                )
-                files_3d = gr.Files(
-                    label="3D model outputs (high-res)", elem_id="download", interactive=False,
-                )
-
-        blocks_settings_depth = [ensemble_size, denoise_steps, processing_res]
-        blocks_settings_3d = [
-            plane_near, plane_far, embossing, size_longest_px, size_longest_cm,
-            filter_size, frame_thickness, frame_near, frame_far,
-        ]
-        blocks_settings = blocks_settings_depth + blocks_settings_3d
-        map_id_to_default = {b._id: b.value for b in blocks_settings}
-
-        inputs = [
-            input_image, ensemble_size, denoise_steps, processing_res,
-            input_output_16bit, input_output_fp32, input_output_vis,
-            plane_near, plane_far, embossing, filter_size, frame_near,
-        ]
-        outputs = [
-            submit_btn, input_image, output_slider, files,
-        ]
-
-        def submit_depth_fn(*args):
-            out = list(process_pipe(*args))
-            out = [gr.Button(interactive=False), gr.Image(interactive=False)] + out
-            return out
-
-        submit_btn.click(
-            fn=submit_depth_fn, inputs=inputs, outputs=outputs, concurrency_limit=1,
-        )
-
-        gr.Examples(
-            fn=submit_depth_fn,
-            examples=[
-                [
-                    "files/bee.jpg", 10, 10, 768,  # ensemble_size, denoise_steps, processing_res
-                    "files/bee_depth_16bit.png", "files/bee_depth_fp32.npy", "files/bee_depth_colored.png",
-                    0.0, 0.5, 20, 3, 0,  # plane_near, plane_far, embossing, filter_size, frame_near
-                ],
-                [
-                    "files/cat.jpg", 10, 10, 768,  # ensemble_size, denoise_steps, processing_res
-                    "files/cat_depth_16bit.png", "files/cat_depth_fp32.npy", "files/cat_depth_colored.png",
-                    0.0, 0.3, 20, 3, 0,  # plane_near, plane_far, embossing, filter_size, frame_near
-                ],
-                [
-                    "
-                    [line 491 not rendered in the diff view]
-                    10,  # denoise_steps
-                    768,  # processing_res
-                    "files/swings_depth_16bit.png", "files/swings_depth_fp32.npy", "files/swings_depth_colored.png",
-                    0.05, 0.25, 10, 1, 0,  # plane_near, plane_far, embossing, filter_size, frame_near
-                ],
-        [lines 503-525 not rendered in the diff view]
-
-        def clear_fn():
-            out = []
-            for b in blocks_settings:
-                out.append(map_id_to_default[b._id])
-            out += [
-                gr.Button(interactive=True),
-                gr.Button(interactive=True),
-                gr.Image(value=None, interactive=True),
-                None, None, None, None, None, None, None,
-            ]
-            return out
-
-        clear_btn.click(
-            fn=clear_fn,
-            inputs=[],
-            outputs=blocks_settings
-            + [
-                submit_btn, submit_3d, input_image,
-                input_output_16bit, input_output_fp32, input_output_vis,
-                output_slider, files, viewer_3d, files_3d,
-            ],
-        )
-
-        def submit_3d_fn(*args):
-            out = list(process_3d(*args))
-            out = [gr.Button(interactive=False)] + out
-            return out
-
-        submit_3d.click(
-            fn=submit_3d_fn,
-            inputs=[
-                input_image, files, size_longest_px, size_longest_cm, filter_size,
-                plane_near, plane_far, embossing, frame_thickness, frame_near, frame_far,
-            ],
-            outputs=[submit_3d, viewer_3d, files_3d],
-            concurrency_limit=1,
-        )
-
-        def clear_3d_fn():
-            return [gr.Button(interactive=True), None, None]
-
-        clear_3d.click(
-            fn=clear_3d_fn, inputs=[], outputs=[submit_3d, viewer_3d, files_3d],
-        )
-
-        demo.queue(
-            api_open=False,
-        ).launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-        )
-
-[lines 602-624 not rendered in the diff view]
+
+CHECKPOINT = "prs-eth/marigold-depth-v1-0"
+
+if "HF_TOKEN_LOGIN" in os.environ:
+    login(token=os.environ["HF_TOKEN_LOGIN"])
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
+pipe = MarigoldDepthPipeline.from_pretrained(CHECKPOINT)
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
+pipe = pipe.to(device=device, dtype=dtype)
+try:
+    import xformers
+    pipe.enable_xformers_memory_efficient_attention()
+except:
+    pass
+
+
+class MarigoldDepthApp(DualVisionApp):
+    DEFAULT_SEED = 2024
+    DEFAULT_ENSEMBLE_SIZE = 1
+    DEFAULT_DENOISE_STEPS = 4
+    DEFAULT_PROCESSING_RES = 768
+
+    def make_header(self):
+        gr.Markdown(
+            """
+            ## Marigold Depth Estimation
+            <p align="center">
+            <a title="Website" href="https://marigoldmonodepth.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/%E2%99%A5%20Project%20-Website-blue">
+            </a>
+            <a title="arXiv" href="https://arxiv.org/abs/2312.02145" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/%F0%9F%93%84%20Read%20-Paper-AF3436">
+            </a>
+            <a title="Github" href="https://github.com/prs-eth/marigold" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/github/stars/prs-eth/marigold?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
+            </a>
+            <a title="Video Depth" href="https://huggingface.co/spaces/prs-eth/rollingdepth" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Video%20Depth%20-Demo-yellow" alt="videodepth">
+            </a>
+            <a title="Depth-to-3D" href="https://huggingface.co/spaces/prs-eth/depth-to-3d-print" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Depth--to--3D%20-Demo-yellow" alt="depthto3d">
+            </a>
+            <a title="Social" href="https://twitter.com/antonobukhov1" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://shields.io/twitter/follow/:?label=Subscribe%20for%20updates!" alt="social">
+            </a>
+            </p>
+            <p align="center" style="margin-top: 0px;">
+                Upload a photo or select an example below to compute depth maps in real time.
+                Use the slider to reveal areas of interest.
+                Use the radio-buttons to switch between modalities.
+                Check our other demo badges above for new or relocated functionality.
+            </p>
+            """
+        )
+
+    def build_user_components(self):
+        with gr.Column():
+            ensemble_size = gr.Slider(
+                label="Ensemble size",
+                minimum=1,
+                maximum=20,
+                step=1,
+                value=10,
+            )
+            denoise_steps = gr.Slider(
+                label="Number of denoising steps",
+                minimum=1,
+                maximum=20,
+                step=1,
+                value=10,
+            )
+            processing_res = gr.Radio(
+                [
+                    ("Native", 0),
+                    ("Recommended", 768),
+                ],
+                label="Processing resolution",
+                value=768,
+            )
+        return {
+            "ensemble_size": ensemble_size,
+            "denoise_steps": denoise_steps,
+            "processing_res": processing_res,
+        }
+
+    def process(self, image_in: Image.Image, **kwargs):
+        ensemble_size = kwargs.get("ensemble_size", self.DEFAULT_ENSEMBLE_SIZE)
+        denoise_steps = kwargs.get("denoise_steps", self.DEFAULT_DENOISE_STEPS)
+        processing_res = kwargs.get("processing_res", self.DEFAULT_PROCESSING_RES)
+        generator = torch.Generator(device=device).manual_seed(self.DEFAULT_SEED)
+
+        pipe_out = pipe(
+            image_in,
+            ensemble_size=ensemble_size,
+            num_inference_steps=denoise_steps,
+            processing_resolution=processing_res,
+            batch_size=1 if processing_res == 0 else 2,
+            output_uncertainty=ensemble_size >= 3,
+            generator=generator,
+        )
+
+        depth_vis = pipe.image_processor.visualize_depth(pipe_out.prediction)[0]
+        depth_16bit = pipe.image_processor.export_depth_to_16bit_png(pipe_out.prediction)[0]
+
+        out_modalities = {
+            "Depth Visualization": depth_vis,
+            "Depth 16-bit": depth_16bit,
+        }
+        if ensemble_size >= 3:
+            uncertainty = pipe.image_processor.visualize_uncertainty(pipe_out.uncertainty)[0]
+            out_modalities["Uncertainty"] = uncertainty
+
+        out_settings = {
+            "ensemble_size": ensemble_size,
+            "denoise_steps": denoise_steps,
+            "processing_res": processing_res,
+        }
+        return out_modalities, out_settings
+
+
+with MarigoldDepthApp(
+    title="Marigold Depth",
+    examples_path="files",
+    examples_per_page=5,
+    squeeze_canvas=True,
+) as demo:
+    demo.queue(
+        api_open=False,
+    ).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+    )
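For reference, a minimal standalone sketch of the inference path that the new `process` method wires into the UI. The checkpoint name, scheduler setup, and pipeline calls are taken from the diff above; the example image path and output filenames are illustrative only:

```python
import torch
from diffusers import MarigoldDepthPipeline, DDIMScheduler
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

# Same checkpoint and scheduler configuration as in the updated app.py above.
pipe = MarigoldDepthPipeline.from_pretrained("prs-eth/marigold-depth-v1-0")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
pipe = pipe.to(device=device, dtype=dtype)

image = Image.open("files/bee.jpg")  # example image shipped with the Space
out = pipe(
    image,
    ensemble_size=1,
    num_inference_steps=4,
    processing_resolution=768,
    generator=torch.Generator(device=device).manual_seed(2024),
)

# Colorized preview and a 16-bit PNG, matching the demo's two output modalities.
depth_vis = pipe.image_processor.visualize_depth(out.prediction)[0]
depth_16bit = pipe.image_processor.export_depth_to_16bit_png(out.prediction)[0]
depth_vis.save("bee_depth_colored.png")
depth_16bit.save("bee_depth_16bit.png")
```

Running this on CPU works but is slow; the Space itself moves the pipeline to CUDA with bfloat16 when available, as shown in the diff.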
extrude.py
DELETED
@@ -1,354 +0,0 @@
-# Copyright 2024 Anton Obukhov, ETH Zurich. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# --------------------------------------------------------------------------
-# If you find this code useful, we kindly ask you to cite our paper in your work.
-# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
-# More information about the method can be found at https://marigoldmonodepth.github.io
-# --------------------------------------------------------------------------
-
-
-import math
-import os
-
-import numpy as np
-import pygltflib
-import trimesh
-from PIL import Image, ImageFilter
-
-
-def quaternion_multiply(q1, q2):
-    x1, y1, z1, w1 = q1
-    x2, y2, z2, w2 = q2
-    return [
-        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
-        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
-        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
-        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
-    ]
-
-
-def glb_add_lights(path_input, path_output):
-    """
-    Adds directional lights in the horizontal plane to the glb file.
-    :param path_input: path to input glb
-    :param path_output: path to output glb
-    :return: None
-    """
-    glb = pygltflib.GLTF2().load(path_input)
-
-    N = 3  # default max num lights in Babylon.js is 4
-    angle_step = 2 * math.pi / N
-    elevation_angle = math.radians(75)
-
-    light_colors = [
-        [1.0, 0.0, 0.0],
-        [0.0, 1.0, 0.0],
-        [0.0, 0.0, 1.0],
-    ]
-
-    lights_extension = {
-        "lights": [
-            {"type": "directional", "color": light_colors[i], "intensity": 2.0}
-            for i in range(N)
-        ]
-    }
-
-    if "KHR_lights_punctual" not in glb.extensionsUsed:
-        glb.extensionsUsed.append("KHR_lights_punctual")
-    glb.extensions["KHR_lights_punctual"] = lights_extension
-
-    light_nodes = []
-    for i in range(N):
-        angle = i * angle_step
-
-        pos_rot = [0.0, 0.0, math.sin(angle / 2), math.cos(angle / 2)]
-        elev_rot = [
-            math.sin(elevation_angle / 2),
-            0.0,
-            0.0,
-            math.cos(elevation_angle / 2),
-        ]
-        rotation = quaternion_multiply(pos_rot, elev_rot)
-
-        node = {
-            "rotation": rotation,
-            "extensions": {"KHR_lights_punctual": {"light": i}},
-        }
-        light_nodes.append(node)
-
-    light_node_indices = list(range(len(glb.nodes), len(glb.nodes) + N))
-    glb.nodes.extend(light_nodes)
-
-    root_node_index = glb.scenes[glb.scene].nodes[0]
-    root_node = glb.nodes[root_node_index]
-    if hasattr(root_node, "children"):
-        root_node.children.extend(light_node_indices)
-    else:
-        root_node.children = light_node_indices
-
-    glb.save(path_output)
-
-
-def extrude_depth_3d(
-    path_rgb,
-    path_depth,
-    output_model_scale=100,
-    filter_size=3,
-    coef_near=0.0,
-    coef_far=1.0,
-    emboss=0.3,
-    f_thic=0.05,
-    f_near=-0.15,
-    f_back=0.01,
-    vertex_colors=True,
-    scene_lights=True,
-    prepare_for_3d_printing=False,
-):
-    f_far_inner = -emboss
-    f_far_outer = f_far_inner - f_back
-
-    f_near = max(f_near, f_far_inner)
-
-    depth_image = Image.open(path_depth)
-    assert depth_image.mode == "I", depth_image.mode
-    depth_image = depth_image.filter(ImageFilter.MedianFilter(size=filter_size))
-
-    w, h = depth_image.size
-    d_max = max(w, h)
-    depth_image = np.array(depth_image).astype(np.double)
-    z_min, z_max = np.min(depth_image), np.max(depth_image)
-    depth_image = (depth_image.astype(np.double) - z_min) / (z_max - z_min)
-    depth_image[depth_image < coef_near] = coef_near
-    depth_image[depth_image > coef_far] = coef_far
-    depth_image = emboss * (depth_image - coef_near) / (coef_far - coef_near)
-    rgb_image = np.array(
-        Image.open(path_rgb).convert("RGB").resize((w, h), Image.Resampling.LANCZOS)
-    )
-
-    w_norm = w / float(d_max - 1)
-    h_norm = h / float(d_max - 1)
-    w_half = w_norm / 2
-    h_half = h_norm / 2
-
-    x, y = np.meshgrid(np.arange(w), np.arange(h))
-    x = x / float(d_max - 1) - w_half  # [-w_half, w_half]
-    y = -y / float(d_max - 1) + h_half  # [-h_half, h_half]
-    z = -depth_image  # -depth_emboss (far) - 0 (near)
-    vertices_2d = np.stack((x, y, z), axis=-1)
-    vertices = vertices_2d.reshape(-1, 3)
-    colors = rgb_image[:, :, :3].reshape(-1, 3) / 255.0
-
-    faces = []
-    for y in range(h - 1):
-        for x in range(w - 1):
-            idx = y * w + x
-            faces.append([idx, idx + w, idx + 1])
-            faces.append([idx + 1, idx + w, idx + 1 + w])
-
-    # OUTER frame
-
-    nv = len(vertices)
-    vertices = np.append(
-        vertices,
-        [
-            [-w_half - f_thic, -h_half - f_thic, f_near],
-            [-w_half - f_thic, -h_half - f_thic, f_far_outer],
-            [w_half + f_thic, -h_half - f_thic, f_near],
-            [w_half + f_thic, -h_half - f_thic, f_far_outer],
-            [w_half + f_thic, h_half + f_thic, f_near],
-            [w_half + f_thic, h_half + f_thic, f_far_outer],
-            [-w_half - f_thic, h_half + f_thic, f_near],
-            [-w_half - f_thic, h_half + f_thic, f_far_outer],
-        ],
-        axis=0,
-    )
-    faces.extend(
-        [
-            [nv + 0, nv + 1, nv + 2],
-            [nv + 2, nv + 1, nv + 3],
-            [nv + 2, nv + 3, nv + 4],
-            [nv + 4, nv + 3, nv + 5],
-            [nv + 4, nv + 5, nv + 6],
-            [nv + 6, nv + 5, nv + 7],
-            [nv + 6, nv + 7, nv + 0],
-            [nv + 0, nv + 7, nv + 1],
-        ]
-    )
-    colors = np.append(colors, [[0.5, 0.5, 0.5]] * 8, axis=0)
-
-    # INNER frame
-
-    nv = len(vertices)
-    vertices_left_data = vertices_2d[:, 0]  # H x 3
-    vertices_left_frame = vertices_2d[:, 0].copy()  # H x 3
-    vertices_left_frame[:, 2] = f_near
-    vertices = np.append(vertices, vertices_left_data, axis=0)
-    vertices = np.append(vertices, vertices_left_frame, axis=0)
-    colors = np.append(colors, [[0.5, 0.5, 0.5]] * (2 * h), axis=0)
-    for i in range(h - 1):
-        nvi_d = nv + i
-        nvi_f = nvi_d + h
-        faces.append([nvi_d, nvi_f, nvi_d + 1])
-        faces.append([nvi_d + 1, nvi_f, nvi_f + 1])
-
-    [lines 205-243: analogous inner-frame strips for the right, top, and bottom sides,
-     built from vertices_2d[:, -1], vertices_2d[0, :], and vertices_2d[-1, :]]
-
-    # FRONT frame
-
-    [lines 246-304: four blocks append the front-frame quads on the left, right, top,
-     and bottom sides, each adding two corner vertices at depth f_near, the matching
-     frame strip, grey vertex colors, and the connecting triangles]
-
-    # BACK frame
-
-    nv = len(vertices)
-    vertices = np.append(
-        vertices,
-        [
-            [-w_half - f_thic, -h_half - f_thic, f_far_outer],
-            [w_half + f_thic, -h_half - f_thic, f_far_outer],
-            [w_half + f_thic, h_half + f_thic, f_far_outer],
-            [-w_half - f_thic, h_half + f_thic, f_far_outer],
-        ],
-        axis=0,
-    )
-    faces.extend(
-        [
-            [nv + 0, nv + 2, nv + 1],
-            [nv + 2, nv + 0, nv + 3],
-        ]
-    )
-    colors = np.append(colors, [[0.5, 0.5, 0.5]] * 4, axis=0)
-
-    trimesh_kwargs = {}
-    if vertex_colors:
-        trimesh_kwargs["vertex_colors"] = colors
-    mesh = trimesh.Trimesh(vertices=vertices, faces=faces, **trimesh_kwargs)
-
-    mesh.merge_vertices()
-
-    current_max_dimension = max(mesh.extents)
-    scaling_factor = output_model_scale / current_max_dimension
-    mesh.apply_scale(scaling_factor)
-
-    if prepare_for_3d_printing:
-        rotation_mat = trimesh.transformations.rotation_matrix(
-            np.radians(90), [-1, 0, 0]
-        )
-        mesh.apply_transform(rotation_mat)
-
-    path_out_base = os.path.splitext(path_depth)[0].replace("_16bit", "")
-    path_out_glb = path_out_base + ".glb"
-    path_out_stl = path_out_base + ".stl"
-
-    mesh.export(path_out_glb, file_type="glb")
-    if scene_lights:
-        glb_add_lights(path_out_glb, path_out_glb)
-
-    mesh.export(path_out_stl, file_type="stl")
-
-    return path_out_glb, path_out_stl
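Before this commit, the old `app.py` drove the helper above roughly as follows. A hedged sketch reconstructing that call from the removed code (the input paths are illustrative, the parameter values mirror the old UI defaults, and `extrude.py` no longer exists in the Space after this commit):

```python
# Hedged sketch: mirrors how the old app.py invoked the now-removed helper.
# Only runs against the pre-commit revision of the Space, where extrude.py exists.
from extrude import extrude_depth_3d

path_glb, path_stl = extrude_depth_3d(
    "image_rgb.png",          # resized RGB input (illustrative path)
    "image_depth_16bit.png",  # 16-bit depth map, "I"-mode PNG (illustrative path)
    output_model_scale=100,   # size_longest_cm * 10 in the old UI, i.e. millimeters
    filter_size=3,            # median filter applied to the depth map
    coef_near=0.0,            # relative near plane in [0, 1]
    coef_far=1.0,             # relative far plane in (near, 1]
    emboss=0.2,               # embossing level / 100
    f_thic=0.05,              # frame thickness / 100
    f_near=0.01,              # frame near-plane offset / 100
    f_back=0.01,              # frame far-plane offset / 100
    vertex_colors=True,
    scene_lights=False,
    prepare_for_3d_printing=True,
)
print(path_glb, path_stl)
```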
- files/{bee_depth_fp32.npy → arc.jpeg}: RENAMED (file without changes)
- files/bee.jpg: CHANGED (Git LFS)
- files/bee_depth_16bit.png: DELETED (binary file, 504 kB)
- files/bee_depth_colored.png: DELETED (binary file, 221 kB)
- files/{cat_depth_fp32.npy → berries.jpeg}: RENAMED (file without changes)
- files/{einstein_depth_16bit.png → butterfly.jpeg}: RENAMED (file without changes)
- files/cat.jpg: CHANGED (Git LFS)
- files/cat_depth_16bit.png: DELETED (binary file, 557 kB)
- files/cat_depth_colored.png: DELETED (binary file, 221 kB)
- files/{einstein_depth_fp32.npy → concert.jpeg}: RENAMED (file without changes)
- files/dog.jpeg: ADDED (Git LFS)
- files/doughnuts.jpeg: ADDED (Git LFS)
- files/einstein.jpg: CHANGED (Git LFS)
- files/einstein_depth_colored.png: DELETED (binary file, 746 kB)
- files/food.jpeg: ADDED (Git LFS)
- files/glasses.jpeg: ADDED (Git LFS)
- files/house.jpg: ADDED (Git LFS)
- files/lake.jpeg: ADDED (Git LFS)
- files/marigold.jpeg: ADDED (Git LFS)
- files/portrait_1.jpeg: ADDED (Git LFS)
- files/portrait_2.jpeg: ADDED (Git LFS)
- files/pumpkins.jpg: ADDED (Git LFS)
- files/puzzle.jpeg: ADDED (Git LFS)
- files/road.jpg: ADDED (Git LFS)
- files/scientists.jpg: ADDED (Git LFS)
- files/surfboards.jpeg: ADDED (Git LFS)
- files/surfer.jpeg: ADDED (Git LFS)
- files/swings.jpg: CHANGED (Git LFS)
- files/swings_depth_16bit.png: DELETED (binary file, 523 kB)
- files/swings_depth_colored.png: DELETED (binary file, 268 kB)
- files/swings_depth_fp32.npy: DELETED (Git LFS pointer)
  - version https://git-lfs.github.com/spec/v1
  - oid sha256:b5728d846cd554d4a5e1d0e5f71d622135bca36164026b8e49668acdfa20e070
  - size 1398912
- files/switzerland.jpeg: ADDED (Git LFS)
- files/teamwork.jpeg: ADDED (Git LFS)
- files/wave.jpeg: ADDED (Git LFS)
marigold_depth_estimation.py
DELETED
@@ -1,632 +0,0 @@
-# Copyright 2024 Bingxin Ke, ETH Zurich and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# --------------------------------------------------------------------------
-# If you find this code useful, we kindly ask you to cite our paper in your work.
-# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
-# More information about the method can be found at https://marigoldmonodepth.github.io
-# --------------------------------------------------------------------------
-
-
-import math
-from typing import Dict, Union
-
-import matplotlib
-import numpy as np
-import torch
-from PIL import Image
-from scipy.optimize import minimize
-from torch.utils.data import DataLoader, TensorDataset
-from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    DiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.utils import BaseOutput, check_min_version
-
-
-# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.27.0.dev0")
-
-
-class MarigoldDepthOutput(BaseOutput):
-    """
-    Output class for Marigold monocular depth prediction pipeline.
-
-    Args:
-        depth_np (`np.ndarray`):
-            Predicted depth map, with depth values in the range of [0, 1].
-        depth_colored (`None` or `PIL.Image.Image`):
-            Colorized depth map, with the shape of [3, H, W] and values in [0, 1].
-        uncertainty (`None` or `np.ndarray`):
-            Uncalibrated uncertainty (MAD, median absolute deviation) coming from ensembling.
-    """
-
-    depth_np: np.ndarray
-    depth_colored: Union[None, Image.Image]
-    uncertainty: Union[None, np.ndarray]
-
-
-class MarigoldPipeline(DiffusionPipeline):
-    """
-    Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        unet (`UNet2DConditionModel`):
-            Conditional U-Net to denoise the depth latent, conditioned on image latent.
-        vae (`AutoencoderKL`):
-            Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps
-            to and from latent representations.
-        scheduler (`DDIMScheduler`):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
-        text_encoder (`CLIPTextModel`):
-            Text-encoder, for empty text embedding.
-        tokenizer (`CLIPTokenizer`):
-            CLIP tokenizer.
-    """
-
-    rgb_latent_scale_factor = 0.18215
-    depth_latent_scale_factor = 0.18215
-
-    def __init__(
-        self,
-        unet: UNet2DConditionModel,
-        vae: AutoencoderKL,
-        scheduler: DDIMScheduler,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            unet=unet,
-            vae=vae,
-            scheduler=scheduler,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-        )
-
-        self.empty_text_embed = None
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        input_image: Image,
-        denoising_steps: int = 10,
-        ensemble_size: int = 10,
-        processing_res: int = 768,
-        match_input_res: bool = True,
-        batch_size: int = 0,
-        color_map: str = "Spectral",
-        show_progress_bar: bool = True,
-        ensemble_kwargs: Dict = None,
-    ) -> MarigoldDepthOutput:
-        """
-        Function invoked when calling the pipeline.
-
-        Args:
-            input_image (`Image`): Input RGB (or gray-scale) image.
-            processing_res (`int`, *optional*, defaults to `768`): Maximum resolution of processing; 0 disables resizing.
-            match_input_res (`bool`, *optional*, defaults to `True`): Resize depth prediction to match input resolution.
-            denoising_steps (`int`, *optional*, defaults to `10`): Number of diffusion denoising steps (DDIM) during inference.
-            ensemble_size (`int`, *optional*, defaults to `10`): Number of predictions to be ensembled.
-            batch_size (`int`, *optional*, defaults to `0`): Inference batch size, no bigger than `num_ensemble`; 0 selects it automatically.
-            show_progress_bar (`bool`, *optional*, defaults to `True`): Display a progress bar of diffusion denoising.
-            color_map (`str`, *optional*, defaults to `"Spectral"`): Colormap used to colorize the depth map; pass `None` to skip it.
-            ensemble_kwargs (`dict`, *optional*, defaults to `None`): Arguments for detailed ensembling settings.
-        Returns:
-            `MarigoldDepthOutput` with `depth_np`, `depth_colored`, and `uncertainty` fields.
-        """
-
-        device = self.device
-        input_size = input_image.size
-
-        if not match_input_res:
-            assert (
-                processing_res is not None
-            ), "Value error: `resize_output_back` is only valid with "
-        assert processing_res >= 0
-        assert denoising_steps >= 1
-        assert ensemble_size >= 1
-
-        # ----------------- Image Preprocess -----------------
-        # Resize image
-        if processing_res > 0:
-            input_image = self.resize_max_res(
-                input_image, max_edge_resolution=processing_res
-            )
-        # Convert the image to RGB, to 1. remove the alpha channel 2. convert B&W to 3-channel
-        input_image = input_image.convert("RGB")
-        image = np.asarray(input_image)
-
-        # Normalize rgb values
-        rgb = np.transpose(image, (2, 0, 1))  # [H, W, rgb] -> [rgb, H, W]
-        rgb_norm = rgb / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]
-        rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype)
-        rgb_norm = rgb_norm.to(device)
-        assert rgb_norm.min() >= -1.0 and rgb_norm.max() <= 1.0
-
-        # ----------------- Predicting depth -----------------
-        # Batch repeated input image
-        duplicated_rgb = torch.stack([rgb_norm] * ensemble_size)
-        single_rgb_dataset = TensorDataset(duplicated_rgb)
-        if batch_size > 0:
-            _bs = batch_size
-        else:
-            _bs = self._find_batch_size(
-                ensemble_size=ensemble_size,
-                input_res=max(rgb_norm.shape[1:]),
-                dtype=self.dtype,
-            )
-
-        single_rgb_loader = DataLoader(
-            single_rgb_dataset, batch_size=_bs, shuffle=False
-        )
-
-        # Predict depth maps (batched)
-        depth_pred_ls = []
-        if show_progress_bar:
-            iterable = tqdm(
-                single_rgb_loader, desc=" " * 2 + "Inference batches", leave=False
-            )
-        else:
-            iterable = single_rgb_loader
-        for batch in iterable:
-            (batched_img,) = batch
-            depth_pred_raw = self.single_infer(
-                rgb_in=batched_img,
-                num_inference_steps=denoising_steps,
-                show_pbar=show_progress_bar,
-            )
-            depth_pred_ls.append(depth_pred_raw.detach().clone())
-        depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze()
-        torch.cuda.empty_cache()  # clear vram cache for ensembling
-
-        # ----------------- Test-time ensembling -----------------
-        if ensemble_size > 1:
-            depth_pred, pred_uncert = self.ensemble_depths(
-                depth_preds, **(ensemble_kwargs or {})
-            )
-        else:
-            depth_pred = depth_preds
-            pred_uncert = None
-
-        # ----------------- Post processing -----------------
-        # Scale prediction to [0, 1]
-        min_d = torch.min(depth_pred)
-        max_d = torch.max(depth_pred)
-        depth_pred = (depth_pred - min_d) / (max_d - min_d)
-
-        # Convert to numpy
-        depth_pred = depth_pred.cpu().numpy().astype(np.float32)
-
-        # Resize back to original resolution
-        if match_input_res:
-            pred_img = Image.fromarray(depth_pred)
-            pred_img = pred_img.resize(input_size)
-            depth_pred = np.asarray(pred_img)
-
-        # Clip output range
-        depth_pred = depth_pred.clip(0, 1)
-
-        # Colorize
-        if color_map is not None:
-            depth_colored = self.colorize_depth_maps(
-                depth_pred, 0, 1, cmap=color_map
-            ).squeeze()  # [3, H, W], value in (0, 1)
-            depth_colored = (depth_colored * 255).astype(np.uint8)
-            depth_colored_hwc = self.chw2hwc(depth_colored)
-            depth_colored_img = Image.fromarray(depth_colored_hwc)
-        else:
-            depth_colored_img = None
-        return MarigoldDepthOutput(
-            depth_np=depth_pred,
-            depth_colored=depth_colored_img,
-            uncertainty=pred_uncert,
-        )
-
-    def _encode_empty_text(self):
-        """
-        Encode text embedding for empty prompt.
-        """
-        prompt = ""
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="do_not_pad",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids.to(self.text_encoder.device)
-        self.empty_text_embed = self.text_encoder(text_input_ids)[0].to(self.dtype)
-
-    @torch.no_grad()
-    def single_infer(
-        self, rgb_in: torch.Tensor, num_inference_steps: int, show_pbar: bool
-    ) -> torch.Tensor:
-        """
-        Perform an individual depth prediction without ensembling.
-
-        Args:
-            rgb_in (`torch.Tensor`): Input RGB image.
-            num_inference_steps (`int`): Number of diffusion denoising steps (DDIM) during inference.
-            show_pbar (`bool`): Display a progress bar of diffusion denoising.
-        Returns:
-            `torch.Tensor`: Predicted depth map.
-        """
-        device = rgb_in.device
-
-        # Set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps  # [T]
-
-        # Encode image
-        rgb_latent = self._encode_rgb(rgb_in)
-
-        # Initial depth map (noise)
-        depth_latent = torch.randn(
-            rgb_latent.shape, device=device, dtype=self.dtype
-        )  # [B, 4, h, w]
-
-        # Batched empty text embedding
-        if self.empty_text_embed is None:
-            self._encode_empty_text()
-        batch_empty_text_embed = self.empty_text_embed.repeat(
-            (rgb_latent.shape[0], 1, 1)
-        )  # [B, 2, 1024]
-
-        # Denoising loop
-        if show_pbar:
-            iterable = tqdm(
-                enumerate(timesteps),
-                total=len(timesteps),
-                leave=False,
-                desc=" " * 4 + "Diffusion denoising",
-            )
-        else:
-            iterable = enumerate(timesteps)
-
-        for i, t in iterable:
-            unet_input = torch.cat(
-                [rgb_latent, depth_latent], dim=1
-            )  # this order is important
-
-            # predict the noise residual
|
332 |
-
noise_pred = self.unet(
|
333 |
-
unet_input, t, encoder_hidden_states=batch_empty_text_embed
|
334 |
-
).sample # [B, 4, h, w]
|
335 |
-
|
336 |
-
# compute the previous noisy sample x_t -> x_t-1
|
337 |
-
depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
|
338 |
-
torch.cuda.empty_cache()
|
339 |
-
depth = self._decode_depth(depth_latent)
|
340 |
-
|
341 |
-
# clip prediction
|
342 |
-
depth = torch.clip(depth, -1.0, 1.0)
|
343 |
-
# shift to [0, 1]
|
344 |
-
depth = (depth + 1.0) / 2.0
|
345 |
-
|
346 |
-
return depth
|
347 |
-
|
348 |
-
def _encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor:
|
349 |
-
"""
|
350 |
-
Encode RGB image into latent.
|
351 |
-
|
352 |
-
Args:
|
353 |
-
rgb_in (`torch.Tensor`):
|
354 |
-
Input RGB image to be encoded.
|
355 |
-
|
356 |
-
Returns:
|
357 |
-
`torch.Tensor`: Image latent.
|
358 |
-
"""
|
359 |
-
# encode
|
360 |
-
h = self.vae.encoder(rgb_in)
|
361 |
-
moments = self.vae.quant_conv(h)
|
362 |
-
mean, logvar = torch.chunk(moments, 2, dim=1)
|
363 |
-
# scale latent
|
364 |
-
rgb_latent = mean * self.rgb_latent_scale_factor
|
365 |
-
return rgb_latent
|
366 |
-
|
367 |
-
def _decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor:
|
368 |
-
"""
|
369 |
-
Decode depth latent into depth map.
|
370 |
-
|
371 |
-
Args:
|
372 |
-
depth_latent (`torch.Tensor`):
|
373 |
-
Depth latent to be decoded.
|
374 |
-
|
375 |
-
Returns:
|
376 |
-
`torch.Tensor`: Decoded depth map.
|
377 |
-
"""
|
378 |
-
# scale latent
|
379 |
-
depth_latent = depth_latent / self.depth_latent_scale_factor
|
380 |
-
# decode
|
381 |
-
z = self.vae.post_quant_conv(depth_latent)
|
382 |
-
stacked = self.vae.decoder(z)
|
383 |
-
# mean of output channels
|
384 |
-
depth_mean = stacked.mean(dim=1, keepdim=True)
|
385 |
-
return depth_mean
|
386 |
-
|
387 |
-
@staticmethod
|
388 |
-
def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image:
|
389 |
-
"""
|
390 |
-
Resize image to limit maximum edge length while keeping aspect ratio.
|
391 |
-
|
392 |
-
Args:
|
393 |
-
img (`Image.Image`):
|
394 |
-
Image to be resized.
|
395 |
-
max_edge_resolution (`int`):
|
396 |
-
Maximum edge length (pixel).
|
397 |
-
|
398 |
-
Returns:
|
399 |
-
`Image.Image`: Resized image.
|
400 |
-
"""
|
401 |
-
original_width, original_height = img.size
|
402 |
-
downscale_factor = min(
|
403 |
-
max_edge_resolution / original_width, max_edge_resolution / original_height
|
404 |
-
)
|
405 |
-
|
406 |
-
new_width = int(original_width * downscale_factor)
|
407 |
-
new_height = int(original_height * downscale_factor)
|
408 |
-
|
409 |
-
resized_img = img.resize((new_width, new_height))
|
410 |
-
return resized_img
|
411 |
-
|
412 |
-
@staticmethod
|
413 |
-
def colorize_depth_maps(
|
414 |
-
depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
|
415 |
-
):
|
416 |
-
"""
|
417 |
-
Colorize depth maps.
|
418 |
-
"""
|
419 |
-
assert len(depth_map.shape) >= 2, "Invalid dimension"
|
420 |
-
|
421 |
-
if isinstance(depth_map, torch.Tensor):
|
422 |
-
depth = depth_map.detach().clone().squeeze().numpy()
|
423 |
-
elif isinstance(depth_map, np.ndarray):
|
424 |
-
depth = depth_map.copy().squeeze()
|
425 |
-
# reshape to [ (B,) H, W ]
|
426 |
-
if depth.ndim < 3:
|
427 |
-
depth = depth[np.newaxis, :, :]
|
428 |
-
|
429 |
-
# colorize
|
430 |
-
cm = matplotlib.colormaps[cmap]
|
431 |
-
depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
|
432 |
-
img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # value from 0 to 1
|
433 |
-
img_colored_np = np.rollaxis(img_colored_np, 3, 1)
|
434 |
-
|
435 |
-
if valid_mask is not None:
|
436 |
-
if isinstance(depth_map, torch.Tensor):
|
437 |
-
valid_mask = valid_mask.detach().numpy()
|
438 |
-
valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W]
|
439 |
-
if valid_mask.ndim < 3:
|
440 |
-
valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
|
441 |
-
else:
|
442 |
-
valid_mask = valid_mask[:, np.newaxis, :, :]
|
443 |
-
valid_mask = np.repeat(valid_mask, 3, axis=1)
|
444 |
-
img_colored_np[~valid_mask] = 0
|
445 |
-
|
446 |
-
if isinstance(depth_map, torch.Tensor):
|
447 |
-
img_colored = torch.from_numpy(img_colored_np).float()
|
448 |
-
elif isinstance(depth_map, np.ndarray):
|
449 |
-
img_colored = img_colored_np
|
450 |
-
|
451 |
-
return img_colored
|
452 |
-
|
453 |
-
@staticmethod
|
454 |
-
def chw2hwc(chw):
|
455 |
-
assert 3 == len(chw.shape)
|
456 |
-
if isinstance(chw, torch.Tensor):
|
457 |
-
hwc = torch.permute(chw, (1, 2, 0))
|
458 |
-
elif isinstance(chw, np.ndarray):
|
459 |
-
hwc = np.moveaxis(chw, 0, -1)
|
460 |
-
return hwc
|
461 |
-
|
462 |
-
@staticmethod
|
463 |
-
def _find_batch_size(ensemble_size: int, input_res: int, dtype: torch.dtype) -> int:
|
464 |
-
"""
|
465 |
-
Automatically search for suitable operating batch size.
|
466 |
-
|
467 |
-
Args:
|
468 |
-
ensemble_size (`int`):
|
469 |
-
Number of predictions to be ensembled.
|
470 |
-
input_res (`int`):
|
471 |
-
Operating resolution of the input image.
|
472 |
-
|
473 |
-
Returns:
|
474 |
-
`int`: Operating batch size.
|
475 |
-
"""
|
476 |
-
# Search table for suggested max. inference batch size
|
477 |
-
bs_search_table = [
|
478 |
-
# tested on A100-PCIE-80GB
|
479 |
-
{"res": 768, "total_vram": 79, "bs": 35, "dtype": torch.float32},
|
480 |
-
{"res": 1024, "total_vram": 79, "bs": 20, "dtype": torch.float32},
|
481 |
-
# tested on A100-PCIE-40GB
|
482 |
-
{"res": 768, "total_vram": 39, "bs": 15, "dtype": torch.float32},
|
483 |
-
{"res": 1024, "total_vram": 39, "bs": 8, "dtype": torch.float32},
|
484 |
-
{"res": 768, "total_vram": 39, "bs": 30, "dtype": torch.float16},
|
485 |
-
{"res": 1024, "total_vram": 39, "bs": 15, "dtype": torch.float16},
|
486 |
-
# tested on RTX3090, RTX4090
|
487 |
-
{"res": 512, "total_vram": 23, "bs": 20, "dtype": torch.float32},
|
488 |
-
{"res": 768, "total_vram": 23, "bs": 7, "dtype": torch.float32},
|
489 |
-
{"res": 1024, "total_vram": 23, "bs": 3, "dtype": torch.float32},
|
490 |
-
{"res": 512, "total_vram": 23, "bs": 40, "dtype": torch.float16},
|
491 |
-
{"res": 768, "total_vram": 23, "bs": 18, "dtype": torch.float16},
|
492 |
-
{"res": 1024, "total_vram": 23, "bs": 10, "dtype": torch.float16},
|
493 |
-
# tested on GTX1080Ti
|
494 |
-
{"res": 512, "total_vram": 10, "bs": 5, "dtype": torch.float32},
|
495 |
-
{"res": 768, "total_vram": 10, "bs": 2, "dtype": torch.float32},
|
496 |
-
{"res": 512, "total_vram": 10, "bs": 10, "dtype": torch.float16},
|
497 |
-
{"res": 768, "total_vram": 10, "bs": 5, "dtype": torch.float16},
|
498 |
-
{"res": 1024, "total_vram": 10, "bs": 3, "dtype": torch.float16},
|
499 |
-
]
|
500 |
-
|
501 |
-
if not torch.cuda.is_available():
|
502 |
-
return 1
|
503 |
-
|
504 |
-
total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3
|
505 |
-
filtered_bs_search_table = [s for s in bs_search_table if s["dtype"] == dtype]
|
506 |
-
for settings in sorted(
|
507 |
-
filtered_bs_search_table,
|
508 |
-
key=lambda k: (k["res"], -k["total_vram"]),
|
509 |
-
):
|
510 |
-
if input_res <= settings["res"] and total_vram >= settings["total_vram"]:
|
511 |
-
bs = settings["bs"]
|
512 |
-
if bs > ensemble_size:
|
513 |
-
bs = ensemble_size
|
514 |
-
elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size:
|
515 |
-
bs = math.ceil(ensemble_size / 2)
|
516 |
-
return bs
|
517 |
-
|
518 |
-
return 1
|
519 |
-
|
520 |
-
@staticmethod
|
521 |
-
def ensemble_depths(
|
522 |
-
input_images: torch.Tensor,
|
523 |
-
regularizer_strength: float = 0.02,
|
524 |
-
max_iter: int = 2,
|
525 |
-
tol: float = 1e-3,
|
526 |
-
reduction: str = "median",
|
527 |
-
max_res: int = None,
|
528 |
-
):
|
529 |
-
"""
|
530 |
-
To ensemble multiple affine-invariant depth images (up to scale and shift),
|
531 |
-
by aligning estimating the scale and shift
|
532 |
-
"""
|
533 |
-
|
534 |
-
def inter_distances(tensors: torch.Tensor):
|
535 |
-
"""
|
536 |
-
To calculate the distance between each two depth maps.
|
537 |
-
"""
|
538 |
-
distances = []
|
539 |
-
for i, j in torch.combinations(torch.arange(tensors.shape[0])):
|
540 |
-
arr1 = tensors[i : i + 1]
|
541 |
-
arr2 = tensors[j : j + 1]
|
542 |
-
distances.append(arr1 - arr2)
|
543 |
-
dist = torch.concatenate(distances, dim=0)
|
544 |
-
return dist
|
545 |
-
|
546 |
-
device = input_images.device
|
547 |
-
dtype = input_images.dtype
|
548 |
-
np_dtype = np.float32
|
549 |
-
|
550 |
-
original_input = input_images.clone()
|
551 |
-
n_img = input_images.shape[0]
|
552 |
-
ori_shape = input_images.shape
|
553 |
-
|
554 |
-
if max_res is not None:
|
555 |
-
scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:]))
|
556 |
-
if scale_factor < 1:
|
557 |
-
downscaler = torch.nn.Upsample(
|
558 |
-
scale_factor=scale_factor, mode="nearest"
|
559 |
-
)
|
560 |
-
input_images = downscaler(torch.from_numpy(input_images)).numpy()
|
561 |
-
|
562 |
-
# init guess
|
563 |
-
_min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
|
564 |
-
_max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1)
|
565 |
-
s_init = 1.0 / (_max - _min).reshape((-1, 1, 1))
|
566 |
-
t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1))
|
567 |
-
x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype)
|
568 |
-
|
569 |
-
input_images = input_images.to(device)
|
570 |
-
|
571 |
-
# objective function
|
572 |
-
def closure(x):
|
573 |
-
l = len(x)
|
574 |
-
s = x[: int(l / 2)]
|
575 |
-
t = x[int(l / 2) :]
|
576 |
-
s = torch.from_numpy(s).to(dtype=dtype).to(device)
|
577 |
-
t = torch.from_numpy(t).to(dtype=dtype).to(device)
|
578 |
-
|
579 |
-
transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view((-1, 1, 1))
|
580 |
-
dists = inter_distances(transformed_arrays)
|
581 |
-
sqrt_dist = torch.sqrt(torch.mean(dists**2))
|
582 |
-
|
583 |
-
if "mean" == reduction:
|
584 |
-
pred = torch.mean(transformed_arrays, dim=0)
|
585 |
-
elif "median" == reduction:
|
586 |
-
pred = torch.median(transformed_arrays, dim=0).values
|
587 |
-
else:
|
588 |
-
raise ValueError
|
589 |
-
|
590 |
-
near_err = torch.sqrt((0 - torch.min(pred)) ** 2)
|
591 |
-
far_err = torch.sqrt((1 - torch.max(pred)) ** 2)
|
592 |
-
|
593 |
-
err = sqrt_dist + (near_err + far_err) * regularizer_strength
|
594 |
-
err = err.detach().cpu().numpy().astype(np_dtype)
|
595 |
-
return err
|
596 |
-
|
597 |
-
res = minimize(
|
598 |
-
closure,
|
599 |
-
x,
|
600 |
-
method="BFGS",
|
601 |
-
tol=tol,
|
602 |
-
options={"maxiter": max_iter, "disp": False},
|
603 |
-
)
|
604 |
-
x = res.x
|
605 |
-
l = len(x)
|
606 |
-
s = x[: int(l / 2)]
|
607 |
-
t = x[int(l / 2) :]
|
608 |
-
|
609 |
-
# Prediction
|
610 |
-
s = torch.from_numpy(s).to(dtype=dtype).to(device)
|
611 |
-
t = torch.from_numpy(t).to(dtype=dtype).to(device)
|
612 |
-
transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1)
|
613 |
-
if "mean" == reduction:
|
614 |
-
aligned_images = torch.mean(transformed_arrays, dim=0)
|
615 |
-
std = torch.std(transformed_arrays, dim=0)
|
616 |
-
uncertainty = std
|
617 |
-
elif "median" == reduction:
|
618 |
-
aligned_images = torch.median(transformed_arrays, dim=0).values
|
619 |
-
# MAD (median absolute deviation) as uncertainty indicator
|
620 |
-
abs_dev = torch.abs(transformed_arrays - aligned_images)
|
621 |
-
mad = torch.median(abs_dev, dim=0).values
|
622 |
-
uncertainty = mad
|
623 |
-
else:
|
624 |
-
raise ValueError(f"Unknown reduction method: {reduction}")
|
625 |
-
|
626 |
-
# Scale and shift to [0, 1]
|
627 |
-
_min = torch.min(aligned_images)
|
628 |
-
_max = torch.max(aligned_images)
|
629 |
-
aligned_images = (aligned_images - _min) / (_max - _min)
|
630 |
-
uncertainty /= _max - _min
|
631 |
-
|
632 |
-
return aligned_images, uncertainty
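For reference, here is a minimal sketch of how this now-removed community pipeline was typically driven before the switch to the new template. The checkpoint id, device handling, and argument values are illustrative assumptions rather than code from this repository; only the keyword names and output fields come from the deleted file above.

import torch
from PIL import Image
from diffusers import DiffusionPipeline

# Load a checkpoint together with the community pipeline file of the same name
# (ids assumed for illustration; this Space previously shipped its own copy of the file).
pipe = DiffusionPipeline.from_pretrained(
    "prs-eth/marigold-v1-0",
    custom_pipeline="marigold_depth_estimation",
    torch_dtype=torch.float32,
)
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

image = Image.open("files/bee.jpg")  # any RGB image

out = pipe(
    image,                    # `input_image` of __call__
    denoising_steps=10,       # DDIM steps per prediction (single_infer)
    ensemble_size=10,         # predictions merged by ensemble_depths()
    processing_res=768,       # max edge length used by resize_max_res()
    match_input_res=True,     # resize the prediction back to the input size
    color_map="Spectral",     # colormap passed to colorize_depth_maps()
)

depth_np = out.depth_np            # np.ndarray, values in [0, 1]
depth_colored = out.depth_colored  # PIL.Image.Image, or None if color_map is None
uncertainty = out.uncertainty      # MAD map, or None when ensemble_size == 1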
marigold_logo_square.jpg
DELETED
Binary file (76 kB)
requirements.txt
CHANGED
@@ -1,13 +1,9 @@
-gradio-
-scipy==1.11.4
-torch==2.0.1
-transformers>=4.32.1
-xformers>=0.0.21
+diffusers>=0.32.2
+git+https://github.com/toshas/gradio-dualvision.git
+accelerate
+huggingface_hub
+scipy
+torch
+tqdm
+transformers
+xformers
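Taken together, the new requirement set points to the depth pipeline that ships inside diffusers itself (hence diffusers>=0.32.2), with gradio-dualvision presumably providing the UI template. Below is a hedged sketch of the diffusers-native usage these dependencies enable; the pipeline class and helper methods belong to the public diffusers Marigold API (available since diffusers 0.28), while the checkpoint id, step count, and file names are assumptions, not lines from the updated app.py.

import torch
import diffusers

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the depth checkpoint with the pipeline class bundled in diffusers
# (checkpoint id assumed for illustration).
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
    "prs-eth/marigold-depth-v1-0",
    torch_dtype=dtype,
).to(device)

image = diffusers.utils.load_image("files/bee.jpg")

# Run inference; `prediction` holds affine-invariant depth in [0, 1].
out = pipe(image, num_inference_steps=10, ensemble_size=1)

# visualize_depth() returns a list of colorized PIL images, one per prediction.
vis = pipe.image_processor.visualize_depth(out.prediction)
vis[0].save("bee_depth_colored.png")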