Upload folder using huggingface_hub
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +232 -0
- .gitignore +3 -0
- CHANGELOG.md +131 -0
- LICENSE +21 -0
- README.md +244 -12
- __init__.py +0 -0
- bundled_sources.txt +25 -0
- ddepth_anything_v2/DA-2K.md +51 -0
- ddepth_anything_v2/LICENSE +201 -0
- ddepth_anything_v2/README.md +201 -0
- ddepth_anything_v2/__init__.py +1 -0
- ddepth_anything_v2/__pycache__/__init__.cpython-310.pyc +0 -0
- ddepth_anything_v2/__pycache__/__init__.cpython-311.pyc +0 -0
- ddepth_anything_v2/__pycache__/__init__.cpython-312.pyc +0 -0
- ddepth_anything_v2/app.py +88 -0
- ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2.py +415 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py +11 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-311.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-312.pyc +0 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/attention.py +83 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/block.py +252 -0
- ddepth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
.gitattributes
CHANGED
@@ -33,3 +33,235 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300005-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300006-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300007-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300008-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300009-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300010-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300011-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300012-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300013-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300014-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300015-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300016-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300017-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300018-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300019-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300020-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300021-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300022-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300023-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300024-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300025-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300026-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300027-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300028-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300029-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300030-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300031-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300033-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300034.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300035-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300036.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300037-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300039-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300040.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300041-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300043-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300045-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300053-simple.obj filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300056-simple.obj filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17278951300061-simple.obj filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390003-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390008.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390009-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390010.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390011-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390012.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390013-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390015-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390017-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390023-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390025-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390027-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390029-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390031-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390033-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390035-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390037-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390039-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390041-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390043-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390045-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390047-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390049-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390051-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390052.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390053-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390054.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390055-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390056.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390057-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390058.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390059-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390061-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390063-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390065-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390067-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390069-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390071-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390073-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390075-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390077-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390079-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390081-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390085-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390087-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17280589390089-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285060200001.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285060200002-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285060200003-top-bottom.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285371260002-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285859980001.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285859980002.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285859980003-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285861380002-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285861380003-left-right_video.avi filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17285861380004-depth_video.avi filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930002-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930003.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930004-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930005.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930006-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930010-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930012-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930016-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930018-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930020-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930026-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930028-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930036-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930046-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930050-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930052-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930053.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930054-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930055.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930056-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930057.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930058-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930059.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930060-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930061.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930062-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930063.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930064-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930066-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930070-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930072-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930080-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930082-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930084-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930120-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930126-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930132-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930142-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930147.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930152-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930154-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930156-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930158-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930160-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930162-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930164-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930166-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930168-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930170-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930172-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930174-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930176-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930178-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930180-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930182-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930184-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930186-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930188-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930190-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930194-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930196-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930198-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930199.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930200-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930202-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930204-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930206-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930208-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930210-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930212-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930213.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930214-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930216-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930218-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930220-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930222-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930224-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930226-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930228-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930230-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930232-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930234-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930236-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930238-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930240-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930242-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930244-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930246-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930248-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930250-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930252-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930253.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930254-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930256-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930258-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930260-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930261.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930262-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930263.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930264-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930265.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930266-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930268-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930270-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930272-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930274-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930276-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930278-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930280-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930282-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930284-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930286-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930288-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930290-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930292-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930294-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930296-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930298-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930300-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930302-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930304-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930306-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930308-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930310-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930312-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930316-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930318-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930322-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930324-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930326-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930328-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930330-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930332-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930334-left-right.png filter=lfs diff=lfs merge=lfs -text
+outputs/depthmap-17286927930336-left-right.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
__pycache__/
venv/
.idea/
CHANGELOG.md
ADDED
@@ -0,0 +1,131 @@
## Changelog
### 0.4.8
* Depth Anything V2 support, thanks [@graemeniedermayer](https://github.com/graemeniedermayer)!
### 0.4.7
* Tiling mode
* Reduced VRAM consumption for Depth Anything, as well as for ZoeDepth k and nk
* Some bugfixes
### 0.4.6
* Support for [Depth Anything](https://github.com/LiheYoung/Depth-Anything).
### 0.4.5
* Preliminary support for [Marigold](https://marigoldmonodepth.github.io). [PR #385](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/385).
### 0.4.4
* Compatibility with stable-diffusion-webui 1.6.0
### 0.4.3 video processing tab
* Added an option to process videos directly from a video file. This leads to better results than batch-processing individual frames of a video. Allows generating depthmap videos, which can be used in further generations as custom depthmap videos.
* UI improvements.
* Extra stereoimage generation modes - enable in extension settings if you want to use them.
* New stereoimage generation parameter - offset exponent. Setting it to 1 may produce more realistic outputs.
### 0.4.2
* Added UI options for 2 additional rembg models.
* Heatmap generation UI option is hidden - if you want to use it, please activate it in the extension settings.
* Bugfixes.
### 0.4.1 standalone mode
* Added ability to run DepthMap without WebUI. (Use main.py. Make sure all the dependencies are installed. The support is not feature-complete.)
* NormalMap generation
### 0.4.0 large code refactor
* UI improvements
* Improved Batch from Directory, Clip and renormalize DepthMap
* Slightly changed the behaviour of various options
* Extension may partially work even if some of the dependencies are unmet

### 0.3.12
* Fixed stereo image generation
* Other bugfixes
### 0.3.11
* 3D model viewer (Experimental!)
* simple and fast (occluded) 3D mesh generation, support for equirectangular projection
  (accurate results with ZoeDepth models only, no boost, no custom maps)
* default output format is now obj for inpainted mesh and simple mesh
### 0.3.10
* ZoeDepth support (with boost), 3 new models, best results so far
* better heatmap
### 0.3.9
* use existing/custom depthmaps in output dir for batch mode
* custom depthmap support for single file
* wavefront obj output support for inpainted mesh (enabled in settings)
* option to generate all stereo formats at once
* bugfix: convert single channel input image to rgb
* renamed midas imports to fix conflict with deforum
* ui cleanup
### 0.3.8 bugfix
* bugfix in remove background path
### 0.3.7 new features
* [rembg](https://github.com/danielgatis/rembg) Remove Background [PR](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/78) by [@graemeniedermayer](https://github.com/graemeniedermayer) merged
* setting to flip Left/Right SBS images
* added missing parameter for 3d inpainting (repeat_inpaint_edge)
* option to generate demo videos with mesh
### 0.3.6 new feature
* implemented binary ply file format for the inpainted 3D mesh, big reduction in filesize and save/load times.
* added progress indicators to the inpainting process
### 0.3.5 bugfix
* create path to 3dphoto models before download (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/76))
### 0.3.4 new features
* depth clipping option (original idea by [@Extraltodeus](https://github.com/Extraltodeus))
* by popular demand, 3D-Photo-Inpainting is now implemented
* generate inpainted 3D mesh (PLY) and videos of said mesh
### 0.3.3 bugfix and new midas models
* updated to midas 3.1, bringing 2 new depth models (the 512 one eats VRAM for breakfast!)
* fix Next-ViT dependency issue for new installs
* extension no longer clones repositories, all dependencies are now contained in the extension
### 0.3.2 new feature and bugfixes
* several bug fixes for apple silicon and other machines without cuda
* NEW Stereo Image Generation techniques for gap filling by [@semjon00](https://github.com/semjon00) using polylines. (See [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56)) Significant improvement in quality.
### 0.3.1 bugfix
* small speed increase for anaglyph creation
* clone midas repo before midas 3.1 to fix issue (see [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/55#issue-1510266008))
### 0.3.0 improved stereo image generation
* New improved technique for generating stereo images and balancing distortion between eyes by [@semjon00](https://github.com/semjon00) (See [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/51))
* Substantial speedup of stereo image generation code using numba JIT
### 0.2.9 new feature
* 3D Stereo (side-by-side) and red/cyan anaglyph image generation.
  (Thanks to [@sina-masoud-ansari](https://github.com/sina-masoud-ansari) for the tip! Discussion [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/discussions/45))
### 0.2.8 bugfix
* boost (pix2pix) now also able to compute on cpu
* res101 able to compute on cpu
### 0.2.7 separate tab
* Depth Tab now available for easier stand-alone (batch) processing
### 0.2.6 ui layout and settings
* added link to repo so more people find their way to the instructions.
* boost rmax setting
### 0.2.5 bugfix
* error checking on model download (now with progressbar)
### 0.2.4 high resolution depthmaps
* multi-resolution merging is now implemented, significantly improving results!
* res101 can now also compute on CPU
### 0.2.3 bugfix
* path error on linux fixed
### 0.2.2 new features
* added (experimental) support for AdelaiDepth/LeReS (GPU Only!)
* new option to view depthmap as heatmap
* optimised ui layout
### 0.2.1 bugfix
* Correct seed is now used in filename and pnginfo when running batches. (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/35))
### 0.2.0 upgrade
* the script is now an extension, enabling auto installation.
### 0.1.9 bugfixes
* sd model moved to system memory while computing depthmap
* memory leak/fragmentation issue fixed
* recover from out of memory error
### 0.1.8 new options
* net size can now be set as width and height, option to match input size, sliders now have the same range as generation parameters. (see usage below)
* better error handling
### 0.1.7 bugfixes
* batch img2img now works (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/21#issuecomment-1306445056))
* generation parameters now only saved when enabled in settings
* model memory freed explicitly at end of script
### 0.1.6 new option
* option to invert depthmap (black=near, white=far), as required by some viewers.
### 0.1.5 bugfix
* saving as any format other than PNG now always produces an 8 bit, 3 channel RGB image. A single channel 16 bit image is only supported when saving as PNG. (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/15#issuecomment-1304909019))
### 0.1.4 update
* added support for `--no-half`. Now also works with cards that don't support half precision like GTX 16xx. ([verified](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/12#issuecomment-1304656398))
### 0.1.3 bugfix
* bugfix where some controls were not visible (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/11#issuecomment-1304400537))
### 0.1.2 new option
* network size slider. higher resolution depth maps (see usage below)
### 0.1.1 bugfixes
* overflow issue (see [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/10) for details and examples of artifacts)
* when not combining, depthmap is now saved as single channel 16 bit
### 0.1.0
* initial version: script mode, supports generating depthmaps with 4 different midas models
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Bob Thiry

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,244 @@
----
-title:
-
-
-
-
-
-
-
-
-
-
+---
+title: stable-diffusion-webui-depthmap-script
+app_file: main.py
+sdk: gradio
+sdk_version: 3.50.2
+---
+# High Resolution Depth Maps for Stable Diffusion WebUI
+This program is an addon for [AUTOMATIC1111's Stable Diffusion WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) that creates depth maps. Using either generated or custom depth maps, it can also create 3D stereo image pairs (side-by-side or anaglyph), normalmaps and 3D meshes. The outputs of the script can be viewed directly or used as an asset for a 3D engine. Please see the [wiki](https://github.com/thygate/stable-diffusion-webui-depthmap-script/wiki/Viewing-Results) to learn more. The program integrates with [Rembg](https://github.com/danielgatis/rembg). It also supports batch processing, processing of videos, and can be run in standalone mode, without Stable Diffusion WebUI.
+
+To generate realistic depth maps from individual images, this script uses code and models from the [Marigold](https://github.com/prs-eth/Marigold/) repository, from the [MiDaS](https://github.com/isl-org/MiDaS) and [ZoeDepth](https://github.com/isl-org/ZoeDepth) repositories by Intel ISL, or LeReS from the [AdelaiDepth](https://github.com/aim-uofa/AdelaiDepth) repository by Advanced Intelligent Machines. Multi-resolution merging as implemented by [BoostingMonocularDepth](https://github.com/compphoto/BoostingMonocularDepth) is used to generate high resolution depth maps.
+
+Stereoscopic images are created using a custom-written algorithm.
+
+3D Photography using Context-aware Layered Depth Inpainting by Virginia Tech Vision and Learning Lab, or [3D-Photo-Inpainting](https://github.com/vt-vl-lab/3d-photo-inpainting), is used to generate a `3D inpainted mesh` and render `videos` from said mesh.
+
+Rembg uses [U-2-Net](https://github.com/xuebinqin/U-2-Net) and [IS-Net](https://github.com/xuebinqin/DIS).
+
+## Depthmap Examples
+[](https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/main/examples.png)
+
+## 3D Photo Inpainting Examples
+[](https://www.youtube.com/watch?v=jRmVkIMS-SY)
+video by [@graemeniedermayer](https://github.com/graemeniedermayer), more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/discussions/50)
+
+## Stereo Image SBS and Anaglyph Examples
+
+images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56#issuecomment-1367596463).
+
+## Install instructions
+### As extension
+The script can be installed directly from WebUI. Navigate to the `Extensions` tab, click `Available`, then `Load from`, and install the `Depth Maps` extension. Alternatively, the extension can be installed from the URL: `https://github.com/thygate/stable-diffusion-webui-depthmap-script`.
+
+### Updating
+In the WebUI, in the `Extensions` tab, in the `Installed` subtab, click `Check for Updates` and then `Apply and restart UI`.
+
+### Standalone
+Clone the repository, install the requirements from `requirements.txt`, and launch using `main.py`.
+
+>Model weights will be downloaded automatically on their first use and saved to /models/midas, /models/leres and /models/pix2pix. ZoeDepth models are stored in the torch cache folder.
+
+
+## Usage
+Select the "DepthMap" script from the script selection box in either txt2img or img2img, or go to the Depth tab when using existing images.
+
+
+The models can `Compute on` GPU or CPU; use CPU if low on VRAM.
+
+There are ten models available from the `Model` dropdown. For the first model, res101, see [AdelaiDepth/LeReS](https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS) for more info. The others are the MiDaS models: dpt_beit_large_512, dpt_beit_large_384, dpt_large_384, dpt_hybrid_384, midas_v21, and midas_v21_small. See the [MiDaS](https://github.com/isl-org/MiDaS) repository for more info. The newest dpt_beit_large_512 model was trained on a 512x512 dataset but is VERY VRAM hungry. The last three models are [ZoeDepth](https://github.com/isl-org/ZoeDepth) models.
+
+Net size can be set with `net width` and `net height`, or will be the same as the input image when `Match input size` is enabled. There is a trade-off between structural consistency and high-frequency details with respect to net size (see [observations](https://github.com/compphoto/BoostingMonocularDepth#observations)).
+
+`Boost` will enable multi-resolution merging as implemented by [BoostingMonocularDepth](https://github.com/compphoto/BoostingMonocularDepth) and will significantly improve the results, mitigating the observations mentioned above, at the cost of much longer compute time. Best results with res101.
+
+`Clip and renormalize` allows for clipping the depthmap on the `near` and `far` side; the values in between are renormalized to fit the available range. Set both values equal to get a b&w mask of a single depth plane at that value. This option works on the 16-bit depthmap and allows for 1000 steps to select the clip values.
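For intuition, here is a minimal sketch of what clipping and renormalizing a 16-bit depthmap amounts to. It is not the extension's implementation; the function name, NumPy usage, and the `near`/`far` fractions are assumptions made for the example.

```python
# Illustrative sketch only -- not the extension's code.
# Clip a 16-bit depthmap to [near, far] (given as fractions of the range),
# then stretch the surviving values back over the full 16-bit range.
import numpy as np

def clip_and_renormalize(depth_u16: np.ndarray, near: float, far: float) -> np.ndarray:
    lo, hi = near * 65535.0, far * 65535.0
    if hi <= lo:
        # Equal bounds roughly correspond to the b&w single-plane mask described above.
        return np.where(depth_u16 >= lo, 65535, 0).astype(np.uint16)
    clipped = np.clip(depth_u16.astype(np.float64), lo, hi)
    return ((clipped - lo) / (hi - lo) * 65535.0).astype(np.uint16)

# Example: keep the middle half of the depth range of a random 16-bit map.
depth = np.random.randint(0, 65536, size=(480, 640), dtype=np.uint16)
result = clip_and_renormalize(depth, near=0.25, far=0.75)
```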
+
+When enabled, `Invert DepthMap` will result in a depthmap with black near and white far.
+
+Regardless of global settings, `Save DepthMap` will always save the depthmap in the default txt2img or img2img directory with the filename suffix '_depth'. Generation parameters are saved with the image if enabled in settings. Files generated from the Depth tab are saved in the default extras-images directory.
+
+To see the generated output in the webui, `Show DepthMap` should be enabled. When using Batch img2img this option should also be enabled.
+
+When `Combine into one image` is enabled, the depthmap will be combined with the original image; the orientation can be selected with `Combine axis`. When disabled, the depthmap will be saved as a 16 bit single channel PNG, as opposed to a three channel (RGB), 8 bit per channel image when the option is enabled.
+
+When either `Generate Stereo` or `Generate anaglyph` is enabled, a stereo image pair will be generated. `Divergence` sets the amount of 3D effect that is desired. `Balance between eyes` determines where the (inevitable) distortion from filling up gaps will end up: -1 Left, +1 Right, and 0 balanced.
+The different `Gap fill technique` options are: none (no gaps are filled),
+naive (the original method), naive_interpolating (the original method with interpolation), and polylines_soft and polylines_sharp, the latest techniques, the last one being best quality and slowest. Note: all stereo image generation is done on CPU.
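The extension's stereo generation is its own custom algorithm with the gap-fill techniques listed above; purely as intuition, the generic idea of divergence-based pixel shifting can be sketched as below. This naive forward shift and all names are illustrative, and no gap filling is performed here.

```python
# Generic illustration of depth-based pixel shifting -- NOT the extension's
# algorithm. Gaps are left black; filling them is what the gap-fill options do.
import numpy as np

def naive_stereo_pair(rgb: np.ndarray, depth: np.ndarray, divergence_px: int = 12):
    """rgb: HxWx3 uint8 image; depth: HxW floats in [0, 1], 1 = nearest."""
    h, w, _ = rgb.shape
    left = np.zeros_like(rgb)
    right = np.zeros_like(rgb)
    cols = np.arange(w)
    for y in range(h):
        shift = (depth[y] * divergence_px).astype(int)   # nearer pixels move further
        left[y, np.clip(cols + shift, 0, w - 1)] = rgb[y]
        right[y, np.clip(cols - shift, 0, w - 1)] = rgb[y]
    return left, right
```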
+
+To generate the mesh required to generate videos, enable `Generate 3D inpainted mesh`. This can be a lengthy process, from a few minutes for small images to an hour for very large images. This option is only available on the Depth tab. When enabled, the mesh in ply format and four demo videos are generated. All files are saved to the extras directory.
+
+Videos can be generated from the PLY mesh on the Depth tab.
+It requires the mesh created by this extension; files created elsewhere might not work correctly, as some extra info is stored in the file (required value for dolly). Most options are self-explanatory, like `Number of frames` and `Framerate`. Two output `formats` are supported: mp4 and webm. Supersampling Anti-Aliasing (SSAA) can be used to get rid of jagged edges and flickering. The render size is scaled by this factor and then downsampled.
+There are three `trajectories` to choose from: circle, straight-line, and double-straight-line, to `translate` in three dimensions. The border can be `cropped` on four sides, and the `Dolly` option adjusts the FOV so the center subject will stay approximately the same size, like the dolly-zoom.
+
+Settings on the WebUI Settings tab:
+`Maximum wholesize for boost` sets the r_max value from the BoostingMonocularDepth paper. It relates to the max size that is chosen to render at internally, and directly influences the max amount of VRAM that could be used. The default value for this from the paper is 3000; I have lowered the value to 1600 so it will work more often with 8GB VRAM GPUs.
+If you often get out of memory errors when computing a depthmap on GPU while using Boost, you can try lowering this value. Note the 'wholeImage being processed in : xxxx' output when using boost; this number will never be greater than r_max, but can be larger with a larger r_max. See the paper for more details.
+
+> 💡 Saving as any format other than PNG always produces an 8 bit, 3 channel RGB image. A single channel 16 bit image is only supported when saving as PNG.
+
+## FAQ
+
+* `Can I use this on existing images?`
+  - Yes, you can use the Depth tab to easily process existing images.
+  - Another way of doing this would be to use img2img with denoising strength set to 0. This will effectively skip stable diffusion and use the input image. You will still have to set the correct size, and need to select `Crop and resize` instead of `Just resize` when the input image resolution does not match the set size perfectly.
+* `Can I run this on Google Colab?`
+  - You can run the MiDaS network on their colab linked here: https://pytorch.org/hub/intelisl_midas_v2/
+  - You can run BoostingMonocularDepth on their colab linked here: https://colab.research.google.com/github/compphoto/BoostingMonocularDepth/blob/main/Boostmonoculardepth.ipynb
+  - Running this program on Colab is not officially supported, but it may work. Please look for more suitable ways of running this. If you still decide to try, standalone installation may be easier to manage.
+* `What other depth-related projects could I check out?`
+  - [DepthFlow](https://github.com/BrokenSource/DepthFlow) by [@Tremeschin](https://github.com/Tremeschin) for very fast generation of 2.5D videos from images (no need to create a mesh beforehand!)
+  - Several [scripts](https://github.com/Extraltodeus?tab=repositories) by [@Extraltodeus](https://github.com/Extraltodeus) using depth maps.
+  - geo-11, [Depth3D](https://github.com/BlueSkyDefender/Depth3D) and [Geo3D](https://github.com/Flugan/Geo3D-Installer) for playing existing games in 3D.
+  - (Feel free to suggest more projects in the discussions!)
+* `How can I know what changed in the new version of the script?`
+  - You can see the git history log or refer to the `CHANGELOG.md` file.
+
+## Help wanted!
+Developers wanted! Please help us fix the bugs and add new features by creating MRs.
+All help is heavily appreciated.
+Feel free to comment and share in the discussions and submit issues.
+
+## Acknowledgements
+
+This project relies on code and information from the following papers:
+
+MiDaS:
+
+```
+@article{Ranftl2022,
+  author  = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun",
+  title   = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer",
+  journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
+  year    = "2022",
+  volume  = "44",
+  number  = "3"
+}
+```
+
+Dense Prediction Transformers, DPT-based model:
+
+```
+@article{Ranftl2021,
+  author  = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},
+  title   = {Vision Transformers for Dense Prediction},
+  journal = {ICCV},
+  year    = {2021},
+}
+```
+
+AdelaiDepth/LeReS:
+
+```
+@article{yin2022towards,
+  title   = {Towards Accurate Reconstruction of 3D Scene Shape from A Single Monocular Image},
+  author  = {Yin, Wei and Zhang, Jianming and Wang, Oliver and Niklaus, Simon and Chen, Simon and Liu, Yifan and Shen, Chunhua},
+  journal = {TPAMI},
+  year    = {2022}
+}
+@inproceedings{Wei2021CVPR,
+  title     = {Learning to Recover 3D Scene Shape from a Single Image},
+  author    = {Wei Yin and Jianming Zhang and Oliver Wang and Simon Niklaus and Long Mai and Simon Chen and Chunhua Shen},
+  booktitle = {Proc. IEEE Conf. Comp. Vis. Patt. Recogn. (CVPR)},
+  year      = {2021}
+}
+```
+
+Boosting Monocular Depth Estimation Models to High-Resolution via Content-Adaptive Multi-Resolution Merging:
+
+```
+@inproceedings{Miangoleh2021Boosting,
+  title   = {Boosting Monocular Depth Estimation Models to High-Resolution via Content-Adaptive Multi-Resolution Merging},
+  author  = {S. Mahdi H. Miangoleh and Sebastian Dille and Long Mai and Sylvain Paris and Ya\u{g}{\i}z Aksoy},
+  journal = {Proc. CVPR},
+  year    = {2021},
+}
+```
+
+3D Photography using Context-aware Layered Depth Inpainting:
+
+```
+@inproceedings{Shih3DP20,
+  author    = {Shih, Meng-Li and Su, Shih-Yang and Kopf, Johannes and Huang, Jia-Bin},
+  title     = {3D Photography using Context-aware Layered Depth Inpainting},
+  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year      = {2020}
+}
+```
+
+U2-Net:
+
+```
+@InProceedings{Qin_2020_PR,
+  title   = {U2-Net: Going Deeper with Nested U-Structure for Salient Object Detection},
+  author  = {Qin, Xuebin and Zhang, Zichen and Huang, Chenyang and Dehghan, Masood and Zaiane, Osmar and Jagersand, Martin},
+  journal = {Pattern Recognition},
+  volume  = {106},
+  pages   = {107404},
+  year    = {2020}
+}
+```
+
+IS-Net:
+
+```
+@InProceedings{qin2022,
+  author    = {Xuebin Qin and Hang Dai and Xiaobin Hu and Deng-Ping Fan and Ling Shao and Luc Van Gool},
+  title     = {Highly Accurate Dichotomous Image Segmentation},
+  booktitle = {ECCV},
+  year      = {2022}
+}
+```
+
+
+ZoeDepth:
+
+```
+@misc{https://doi.org/10.48550/arxiv.2302.12288,
+  doi       = {10.48550/ARXIV.2302.12288},
+  url       = {https://arxiv.org/abs/2302.12288},
+  author    = {Bhat, Shariq Farooq and Birkl, Reiner and Wofk, Diana and Wonka, Peter and Müller, Matthias},
+  keywords  = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
+  title     = {ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth},
+  publisher = {arXiv},
+  year      = {2023},
+  copyright = {arXiv.org perpetual, non-exclusive license}
+}
+```
+
+Marigold - Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation:
+
+```
+@misc{ke2023repurposing,
+  title         = {Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation},
+  author        = {Bingxin Ke and Anton Obukhov and Shengyu Huang and Nando Metzger and Rodrigo Caye Daudt and Konrad Schindler},
+  year          = {2023},
+  eprint        = {2312.02145},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CV}
+}
+```
+
+Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data:
+
+```
+@misc{yang2024depth,
+  title         = {Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
+  author        = {Lihe Yang and Bingyi Kang and Zilong Huang and Xiaogang Xu and Jiashi Feng and Hengshuang Zhao},
+  year          = {2024},
+  eprint        = {2401.10891},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CV}
+}
+```
+
+Depth Anything V2:
+
+```bibtex
+@article{depth_anything_v2,
+  title   = {Depth Anything V2},
+  author  = {Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
+  journal = {arXiv:2406.09414},
+  year    = {2024}
+}
+```
__init__.py
ADDED
File without changes
bundled_sources.txt
ADDED
@@ -0,0 +1,25 @@
Since commit 110549b2 this extension bundles some code from other repositories.
This was done to prevent possible upstream breakage and allow fixing breakage quicker.
This file provides information about the original location of the code.
*** Some of the bundled code was already modified. ***

dmidas
https://github.com/isl-org/MiDaS/tree/master/midas/

dzoedepth
https://github.com/isl-org/ZoeDepth/tree/main/zoedepth/

inpaint
https://github.com/vt-vl-lab/3d-photo-inpainting/

lib
https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS/Minist_Test/lib/

pix2pix
https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/

Marigold
https://github.com/prs-eth/Marigold/tree/22437a

depth_anything_v2
https://github.com/DepthAnything/Depth-Anything-V2/tree/bc0283
ddepth_anything_v2/DA-2K.md
ADDED
@@ -0,0 +1,51 @@
# DA-2K Evaluation Benchmark

## Introduction

![DA-2K](assets/DA-2K.png)

DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate relative depth estimation capability. It encompasses eight representative scenarios: `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations.

Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details on how this benchmark was constructed.


## Usage

Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main).

All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below:

```
{
    "image_path": [
        {
            "point1": [h1, w1],         # (vertical position, horizontal position)
            "point2": [h2, w2],         # (vertical position, horizontal position)
            "closer_point": "point1"    # we always set "point1" as the closer one
        },
        ...
    ],
    ...
}
```
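
For reference, the file can be consumed with just the standard library and NumPy. The sketch below is hypothetical (it is not shipped with the benchmark): it loads `annotations.json` and measures how often a depth prediction orders each pair correctly, assuming `predict_depth` is any function returning an H×W array in which larger values mean closer to the camera.

```python
import json

import numpy as np


def predict_depth(image_path: str) -> np.ndarray:
    """Hypothetical stand-in for your model; must return an HxW array (larger = closer)."""
    raise NotImplementedError


with open("annotations.json") as f:
    annotations = json.load(f)

correct, total = 0, 0
for image_path, pairs in annotations.items():
    depth = predict_depth(image_path)
    for pair in pairs:
        h1, w1 = pair["point1"]
        h2, w2 = pair["point2"]
        # "closer_point" is always "point1", so the pair is ordered correctly
        # when the prediction at point1 is larger than at point2.
        correct += int(depth[h1, w1] > depth[h2, w2])
        total += 1

print(f"pairwise accuracy: {correct / total:.3f}")
```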

To visualize the annotations:
```bash
python visualize.py [--scene-type <type>]
```

**Options**
- `--scene-type <type>` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set `<type>` to `""` to include all scene types.
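
For example, to view only the indoor annotations:

```bash
python visualize.py --scene-type indoor
```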

## Citation

If you find this benchmark useful, please consider citing:

```bibtex
@article{depth_anything_v2,
  title={Depth Anything V2},
  author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
  journal={arXiv:2406.09414},
  year={2024}
}
```
ddepth_anything_v2/LICENSE
ADDED
@@ -0,0 +1,201 @@
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
ddepth_anything_v2/README.md
ADDED
@@ -0,0 +1,201 @@
<div align="center">
<h1>Depth Anything V2</h1>

[**Lihe Yang**](https://liheyoung.github.io/)<sup>1</sup> · [**Bingyi Kang**](https://bingykang.github.io/)<sup>2†</sup> · [**Zilong Huang**](http://speedinghzl.github.io/)<sup>2</sup>
<br>
[**Zhen Zhao**](http://zhaozhen.me/) · [**Xiaogang Xu**](https://xiaogang00.github.io/) · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)<sup>2</sup> · [**Hengshuang Zhao**](https://hszhao.github.io/)<sup>1*</sup>

<sup>1</sup>HKU   <sup>2</sup>TikTok
<br>
†project lead *corresponding author
†[Bingyi Kang](https://bingykang.github.io/) proposed this project and advised in every aspect.

<a href="https://arxiv.org/abs/2406.09414"><img src='https://img.shields.io/badge/arXiv-Depth Anything V2-red' alt='Paper PDF'></a>
<a href='https://depth-anything-v2.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything V2-green' alt='Project Page'></a>
<a href='https://huggingface.co/spaces/depth-anything/Depth-Anything-V2'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue'></a>
<a href='https://huggingface.co/datasets/depth-anything/DA-2K'><img src='https://img.shields.io/badge/Benchmark-DA--2K-yellow' alt='Benchmark'></a>
</div>

This work presents Depth Anything V2. It significantly outperforms [V1](https://github.com/LiheYoung/Depth-Anything) in fine-grained details and robustness. Compared with SD-based models, it enjoys faster inference speed, fewer parameters, and higher depth accuracy.

![teaser](assets/teaser.png)


## News

- **2024-07-06:** Depth Anything V2 is supported in [Transformers](https://github.com/huggingface/transformers/). See the [instructions](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2) for convenient usage.
- **2024-06-25:** Depth Anything is integrated into [Apple Core ML Models](https://developer.apple.com/machine-learning/models/). See the instructions ([V1](https://huggingface.co/apple/coreml-depth-anything-small), [V2](https://huggingface.co/apple/coreml-depth-anything-v2-small)) for usage.
- **2024-06-22:** We release [smaller metric depth models](https://github.com/DepthAnything/Depth-Anything-V2/tree/main/metric_depth#pre-trained-models) based on Depth-Anything-V2-Small and Base.
- **2024-06-20:** Our repository and project page were flagged by GitHub and removed from public view for 6 days. Sorry for the inconvenience.
- **2024-06-14:** Paper, project page, code, models, demo, and benchmark are all released.


## Pre-trained Models

We provide **four models** of varying scales for robust relative depth estimation:

| Model | Params | Checkpoint |
|:-|-:|:-:|
| Depth-Anything-V2-Small | 24.8M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=true) |
| Depth-Anything-V2-Base | 97.5M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth?download=true) |
| Depth-Anything-V2-Large | 335.3M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true) |
| Depth-Anything-V2-Giant | 1.3B | Coming soon |


## Usage

### Preparation

```bash
git clone https://github.com/DepthAnything/Depth-Anything-V2
cd Depth-Anything-V2
pip install -r requirements.txt
```

Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory.
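
For example, the Large checkpoint from the table above can be fetched directly; the other models follow the same URL pattern:

```bash
mkdir -p checkpoints
wget -O checkpoints/depth_anything_v2_vitl.pth \
  "https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true"
```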

### Use our models

```python
import cv2
import torch

from depth_anything_v2.dpt import DepthAnythingV2

DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

encoder = 'vitl'  # or 'vits', 'vitb', 'vitg'

model = DepthAnythingV2(**model_configs[encoder])
model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location='cpu'))
model = model.to(DEVICE).eval()

raw_img = cv2.imread('your/image/path')
depth = model.infer_image(raw_img)  # HxW raw depth map in numpy
```
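
The raw map is relative depth (the bundled `app.py` exposes the same values as a 16-bit "disparity" download), so it is usually rescaled before being viewed or saved. A minimal sketch continuing from the snippet above; the output filenames are arbitrary:

```python
import numpy as np

# Rescale to [0, 255] for a quick grayscale preview.
vis = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
cv2.imwrite('depth_gray.png', vis.astype(np.uint8))

# Keep the unnormalized values in a 16-bit PNG, as the demo app does.
cv2.imwrite('depth_raw.png', depth.astype(np.uint16))
```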

If you do not want to clone this repository, you can also load our models through [Transformers](https://github.com/huggingface/transformers/). Below is a simple code snippet. Please refer to the [official page](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2) for more details.

- Note 1: Make sure you can connect to Hugging Face and have installed the latest Transformers.
- Note 2: Due to the [upsampling difference](https://github.com/huggingface/transformers/pull/31522#issuecomment-2184123463) between OpenCV (which we use) and Pillow (which HF uses), predictions may differ slightly. We therefore recommend loading our models in the way introduced above.
```python
from transformers import pipeline
from PIL import Image

pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")
image = Image.open('your/image/path')
depth = pipe(image)["depth"]
```
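
In current Transformers versions the pipeline result also carries the unnormalized prediction, which is handy if you want to compare this path with the native one above (a quick sketch, not part of the official example):

```python
import numpy as np

out = pipe(image)
depth_tensor = out["predicted_depth"]  # raw torch.Tensor prediction
depth_image = np.array(out["depth"])   # the rendered depth image as a NumPy array
```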

### Running script on *images*

```bash
python run.py \
  --encoder <vits | vitb | vitl | vitg> \
  --img-path <path> --outdir <outdir> \
  [--input-size <size>] [--pred-only] [--grayscale]
```
Options:
- `--img-path`: You can either 1) point it to an image directory storing all images of interest, 2) point it to a single image, or 3) point it to a text file storing all image paths.
- `--input-size` (optional): By default, we use input size `518` for model inference. ***You can increase the size for even more fine-grained results.***
- `--pred-only` (optional): Only save the predicted depth map, without the raw image.
- `--grayscale` (optional): Save the grayscale depth map, without applying the color palette.

For example:
```bash
python run.py --encoder vitl --img-path assets/examples --outdir depth_vis
```

### Running script on *videos*

```bash
python run_video.py \
  --encoder <vits | vitb | vitl | vitg> \
  --video-path assets/examples_video --outdir video_depth_vis \
  [--input-size <size>] [--pred-only] [--grayscale]
```

***Our larger model has better temporal consistency on videos.***

### Gradio demo

To use our gradio demo locally:

```bash
python app.py
```

You can also try our [online demo](https://huggingface.co/spaces/Depth-Anything/Depth-Anything-V2).

***Note: Compared to V1, we have made a minor modification to the DINOv2-DPT architecture (originating from this [issue](https://github.com/LiheYoung/Depth-Anything/issues/81)).*** In V1, we *unintentionally* used features from the last four layers of DINOv2 for decoding. In V2, we use [intermediate features](https://github.com/DepthAnything/Depth-Anything-V2/blob/2cbc36a8ce2cec41d38ee51153f112e87c8e42d8/depth_anything_v2/dpt.py#L164-L169) instead. Although this modification did not improve details or accuracy, we decided to follow this common practice.
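
Concretely, the backbone exposes this through `get_intermediate_layers` (defined in `dinov2.py` in this folder). Below is a rough, illustrative sketch of such a call; the layer indices are placeholders chosen for the 24-layer ViT-L, while the ones actually used by the DPT head live in `depth_anything_v2/dpt.py`:

```python
import torch

from depth_anything_v2.dinov2 import DINOv2

backbone = DINOv2('vitl')        # randomly initialized ViT-L backbone
x = torch.randn(1, 3, 518, 518)  # dummy input at the default 518x518 resolution
features = backbone.get_intermediate_layers(
    x, n=[4, 11, 17, 23], return_class_token=True
)
# -> tuple of (patch_tokens, class_token) pairs, one per selected layer
```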


## Fine-tuned to Metric Depth Estimation

Please refer to [metric depth estimation](./metric_depth).


## DA-2K Evaluation Benchmark

Please refer to [DA-2K benchmark](./DA-2K.md).


## Community Support

**We sincerely appreciate all the community support for our Depth Anything series. Thank you a lot!**

- Apple Core ML:
    - https://developer.apple.com/machine-learning/models
    - https://huggingface.co/apple/coreml-depth-anything-v2-small
    - https://huggingface.co/apple/coreml-depth-anything-small
- Transformers:
    - https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2
    - https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything
- TensorRT:
    - https://github.com/spacewalk01/depth-anything-tensorrt
    - https://github.com/zhujiajian98/Depth-Anythingv2-TensorRT-python
- ONNX: https://github.com/fabio-sim/Depth-Anything-ONNX
- ComfyUI: https://github.com/kijai/ComfyUI-DepthAnythingV2
- Transformers.js (real-time depth in web): https://huggingface.co/spaces/Xenova/webgpu-realtime-depth-estimation
- Android:
    - https://github.com/shubham0204/Depth-Anything-Android
    - https://github.com/FeiGeChuanShu/ncnn-android-depth_anything


## Acknowledgement

We are sincerely grateful to the awesome Hugging Face team ([@Pedro Cuenca](https://huggingface.co/pcuenq), [@Niels Rogge](https://huggingface.co/nielsr), [@Merve Noyan](https://huggingface.co/merve), [@Amy Roberts](https://huggingface.co/amyeroberts), et al.) for their huge efforts in supporting our models in Transformers and Apple Core ML.

We also thank the [DINOv2](https://github.com/facebookresearch/dinov2) team for contributing such impressive models to our community.


## LICENSE

Depth-Anything-V2-Small model is under the Apache-2.0 license. Depth-Anything-V2-Base/Large/Giant models are under the CC-BY-NC-4.0 license.


## Citation

If you find this project useful, please consider citing:

```bibtex
@article{depth_anything_v2,
  title={Depth Anything V2},
  author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
  journal={arXiv:2406.09414},
  year={2024}
}

@inproceedings{depth_anything_v1,
  title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
  author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
  booktitle={CVPR},
  year={2024}
}
```
ddepth_anything_v2/__init__.py
ADDED
@@ -0,0 +1 @@
from .depth_anything_v2.dpt import DepthAnythingV2
ddepth_anything_v2/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (237 Bytes). View file
|
|
ddepth_anything_v2/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (267 Bytes). View file
|
|
ddepth_anything_v2/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (244 Bytes). View file
|
|
ddepth_anything_v2/app.py
ADDED
@@ -0,0 +1,88 @@
import glob
import gradio as gr
import matplotlib
import numpy as np
from PIL import Image
import torch
import tempfile
from gradio_imageslider import ImageSlider

from depth_anything_v2.dpt import DepthAnythingV2

css = """
#img-display-container {
    max-height: 100vh;
}
#img-display-input {
    max-height: 80vh;
}
#img-display-output {
    max-height: 80vh;
}
#download {
    height: 62px;
}
"""
DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}
encoder = 'vitl'
model = DepthAnythingV2(**model_configs[encoder])
state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu")
model.load_state_dict(state_dict)
model = model.to(DEVICE).eval()

title = "# Depth Anything V2"
description = """Official demo for **Depth Anything V2**.
Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""

def predict_depth(image):
    return model.infer_image(image)

with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Depth Prediction demo")

    with gr.Row():
        input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
        depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5)
    submit = gr.Button(value="Compute Depth")
    gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",)
    raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",)

    cmap = matplotlib.colormaps.get_cmap('Spectral_r')

    def on_submit(image):
        original_image = image.copy()

        h, w = image.shape[:2]

        # Gradio delivers RGB arrays; the model expects BGR (OpenCV convention).
        depth = predict_depth(image[:, :, ::-1])

        # Save the unnormalized prediction as a 16-bit PNG.
        raw_depth = Image.fromarray(depth.astype('uint16'))
        tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        raw_depth.save(tmp_raw_depth.name)

        # Normalize to [0, 255] for the grayscale and colorized previews.
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.astype(np.uint8)
        colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)

        gray_depth = Image.fromarray(depth)
        tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
        gray_depth.save(tmp_gray_depth.name)

        return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name]

    submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file])

    example_files = glob.glob('assets/examples/*')
    examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit)


if __name__ == '__main__':
    demo.queue().launch()
ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc
ADDED
Binary file (12.2 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-311.pyc
ADDED
Binary file (21.8 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-312.pyc
ADDED
Binary file (18.7 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc
ADDED
Binary file (5.99 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-311.pyc
ADDED
Binary file (11.8 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-312.pyc
ADDED
Binary file (10.7 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2.py
ADDED
@@ -0,0 +1,415 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
# References:
|
7 |
+
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
|
8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
9 |
+
|
10 |
+
from functools import partial
|
11 |
+
import math
|
12 |
+
import logging
|
13 |
+
from typing import Sequence, Tuple, Union, Callable
|
14 |
+
|
15 |
+
import torch
|
16 |
+
import torch.nn as nn
|
17 |
+
import torch.utils.checkpoint
|
18 |
+
from torch.nn.init import trunc_normal_
|
19 |
+
|
20 |
+
from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
|
21 |
+
|
22 |
+
|
23 |
+
logger = logging.getLogger("dinov2")
|
24 |
+
|
25 |
+
|
26 |
+
def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
|
27 |
+
if not depth_first and include_root:
|
28 |
+
fn(module=module, name=name)
|
29 |
+
for child_name, child_module in module.named_children():
|
30 |
+
child_name = ".".join((name, child_name)) if name else child_name
|
31 |
+
named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
|
32 |
+
if depth_first and include_root:
|
33 |
+
fn(module=module, name=name)
|
34 |
+
return module
|
35 |
+
|
36 |
+
|
37 |
+
class BlockChunk(nn.ModuleList):
|
38 |
+
def forward(self, x):
|
39 |
+
for b in self:
|
40 |
+
x = b(x)
|
41 |
+
return x
|
42 |
+
|
43 |
+
|
44 |
+
class DinoVisionTransformer(nn.Module):
|
45 |
+
def __init__(
|
46 |
+
self,
|
47 |
+
img_size=224,
|
48 |
+
patch_size=16,
|
49 |
+
in_chans=3,
|
50 |
+
embed_dim=768,
|
51 |
+
depth=12,
|
52 |
+
num_heads=12,
|
53 |
+
mlp_ratio=4.0,
|
54 |
+
qkv_bias=True,
|
55 |
+
ffn_bias=True,
|
56 |
+
proj_bias=True,
|
57 |
+
drop_path_rate=0.0,
|
58 |
+
drop_path_uniform=False,
|
59 |
+
init_values=None, # for layerscale: None or 0 => no layerscale
|
60 |
+
embed_layer=PatchEmbed,
|
61 |
+
act_layer=nn.GELU,
|
62 |
+
block_fn=Block,
|
63 |
+
ffn_layer="mlp",
|
64 |
+
block_chunks=1,
|
65 |
+
num_register_tokens=0,
|
66 |
+
interpolate_antialias=False,
|
67 |
+
interpolate_offset=0.1,
|
68 |
+
):
|
69 |
+
"""
|
70 |
+
Args:
|
71 |
+
img_size (int, tuple): input image size
|
72 |
+
patch_size (int, tuple): patch size
|
73 |
+
in_chans (int): number of input channels
|
74 |
+
embed_dim (int): embedding dimension
|
75 |
+
depth (int): depth of transformer
|
76 |
+
num_heads (int): number of attention heads
|
77 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
78 |
+
qkv_bias (bool): enable bias for qkv if True
|
79 |
+
proj_bias (bool): enable bias for proj in attn if True
|
80 |
+
ffn_bias (bool): enable bias for ffn if True
|
81 |
+
drop_path_rate (float): stochastic depth rate
|
82 |
+
drop_path_uniform (bool): apply uniform drop rate across blocks
|
83 |
+
weight_init (str): weight init scheme
|
84 |
+
init_values (float): layer-scale init values
|
85 |
+
embed_layer (nn.Module): patch embedding layer
|
86 |
+
act_layer (nn.Module): MLP activation layer
|
87 |
+
block_fn (nn.Module): transformer block class
|
88 |
+
ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
|
89 |
+
block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
|
90 |
+
num_register_tokens: (int) number of extra cls tokens (so-called "registers")
|
91 |
+
interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
|
92 |
+
interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
|
93 |
+
"""
|
94 |
+
super().__init__()
|
95 |
+
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
96 |
+
|
97 |
+
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
|
98 |
+
self.num_tokens = 1
|
99 |
+
self.n_blocks = depth
|
100 |
+
self.num_heads = num_heads
|
101 |
+
self.patch_size = patch_size
|
102 |
+
self.num_register_tokens = num_register_tokens
|
103 |
+
self.interpolate_antialias = interpolate_antialias
|
104 |
+
self.interpolate_offset = interpolate_offset
|
105 |
+
|
106 |
+
self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
|
107 |
+
num_patches = self.patch_embed.num_patches
|
108 |
+
|
109 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
110 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
|
111 |
+
assert num_register_tokens >= 0
|
112 |
+
self.register_tokens = (
|
113 |
+
nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
|
114 |
+
)
|
115 |
+
|
116 |
+
if drop_path_uniform is True:
|
117 |
+
dpr = [drop_path_rate] * depth
|
118 |
+
else:
|
119 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
|
120 |
+
|
121 |
+
if ffn_layer == "mlp":
|
122 |
+
logger.info("using MLP layer as FFN")
|
123 |
+
ffn_layer = Mlp
|
124 |
+
elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
|
125 |
+
logger.info("using SwiGLU layer as FFN")
|
126 |
+
ffn_layer = SwiGLUFFNFused
|
127 |
+
elif ffn_layer == "identity":
|
128 |
+
logger.info("using Identity layer as FFN")
|
129 |
+
|
130 |
+
def f(*args, **kwargs):
|
131 |
+
return nn.Identity()
|
132 |
+
|
133 |
+
ffn_layer = f
|
134 |
+
else:
|
135 |
+
raise NotImplementedError
|
136 |
+
|
137 |
+
blocks_list = [
|
138 |
+
block_fn(
|
139 |
+
dim=embed_dim,
|
140 |
+
num_heads=num_heads,
|
141 |
+
mlp_ratio=mlp_ratio,
|
142 |
+
qkv_bias=qkv_bias,
|
143 |
+
proj_bias=proj_bias,
|
144 |
+
ffn_bias=ffn_bias,
|
145 |
+
drop_path=dpr[i],
|
146 |
+
norm_layer=norm_layer,
|
147 |
+
act_layer=act_layer,
|
148 |
+
ffn_layer=ffn_layer,
|
149 |
+
init_values=init_values,
|
150 |
+
)
|
151 |
+
for i in range(depth)
|
152 |
+
]
|
153 |
+
if block_chunks > 0:
|
154 |
+
self.chunked_blocks = True
|
155 |
+
chunked_blocks = []
|
156 |
+
chunksize = depth // block_chunks
|
157 |
+
for i in range(0, depth, chunksize):
|
158 |
+
# this is to keep the block index consistent if we chunk the block list
|
159 |
+
chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
|
160 |
+
self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
|
161 |
+
else:
|
162 |
+
self.chunked_blocks = False
|
163 |
+
self.blocks = nn.ModuleList(blocks_list)
|
164 |
+
|
165 |
+
self.norm = norm_layer(embed_dim)
|
166 |
+
self.head = nn.Identity()
|
167 |
+
|
168 |
+
self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
|
169 |
+
|
170 |
+
self.init_weights()
|
171 |
+
|
172 |
+
def init_weights(self):
|
173 |
+
trunc_normal_(self.pos_embed, std=0.02)
|
174 |
+
nn.init.normal_(self.cls_token, std=1e-6)
|
175 |
+
if self.register_tokens is not None:
|
176 |
+
nn.init.normal_(self.register_tokens, std=1e-6)
|
177 |
+
named_apply(init_weights_vit_timm, self)
|
178 |
+
|
179 |
+
def interpolate_pos_encoding(self, x, w, h):
|
180 |
+
previous_dtype = x.dtype
|
181 |
+
npatch = x.shape[1] - 1
|
182 |
+
N = self.pos_embed.shape[1] - 1
|
183 |
+
if npatch == N and w == h:
|
184 |
+
return self.pos_embed
|
185 |
+
pos_embed = self.pos_embed.float()
|
186 |
+
class_pos_embed = pos_embed[:, 0]
|
187 |
+
patch_pos_embed = pos_embed[:, 1:]
|
188 |
+
dim = x.shape[-1]
|
189 |
+
w0 = w // self.patch_size
|
190 |
+
h0 = h // self.patch_size
|
191 |
+
# we add a small number to avoid floating point error in the interpolation
|
192 |
+
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
193 |
+
# DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
|
194 |
+
w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
|
195 |
+
# w0, h0 = w0 + 0.1, h0 + 0.1
|
196 |
+
|
197 |
+
sqrt_N = math.sqrt(N)
|
198 |
+
sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
|
199 |
+
patch_pos_embed = nn.functional.interpolate(
|
200 |
+
patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
|
201 |
+
scale_factor=(sx, sy),
|
202 |
+
# (int(w0), int(h0)), # to solve the upsampling shape issue
|
203 |
+
mode="bicubic",
|
204 |
+
antialias=self.interpolate_antialias
|
205 |
+
)
|
206 |
+
|
207 |
+
assert int(w0) == patch_pos_embed.shape[-2]
|
208 |
+
assert int(h0) == patch_pos_embed.shape[-1]
|
209 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
210 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
|
211 |
+
|
212 |
+
def prepare_tokens_with_masks(self, x, masks=None):
|
213 |
+
B, nc, w, h = x.shape
|
214 |
+
x = self.patch_embed(x)
|
215 |
+
if masks is not None:
|
216 |
+
x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
|
217 |
+
|
218 |
+
x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
|
219 |
+
x = x + self.interpolate_pos_encoding(x, w, h)
|
220 |
+
|
221 |
+
if self.register_tokens is not None:
|
222 |
+
x = torch.cat(
|
223 |
+
(
|
224 |
+
x[:, :1],
|
225 |
+
self.register_tokens.expand(x.shape[0], -1, -1),
|
226 |
+
x[:, 1:],
|
227 |
+
),
|
228 |
+
dim=1,
|
229 |
+
)
|
230 |
+
|
231 |
+
return x
|
232 |
+
|
233 |
+
def forward_features_list(self, x_list, masks_list):
|
234 |
+
x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
|
235 |
+
for blk in self.blocks:
|
236 |
+
x = blk(x)
|
237 |
+
|
238 |
+
all_x = x
|
239 |
+
output = []
|
240 |
+
for x, masks in zip(all_x, masks_list):
|
241 |
+
x_norm = self.norm(x)
|
242 |
+
output.append(
|
243 |
+
{
|
244 |
+
"x_norm_clstoken": x_norm[:, 0],
|
245 |
+
"x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
|
246 |
+
"x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
|
247 |
+
"x_prenorm": x,
|
248 |
+
"masks": masks,
|
249 |
+
}
|
250 |
+
)
|
251 |
+
return output
|
252 |
+
|
253 |
+
def forward_features(self, x, masks=None):
|
254 |
+
if isinstance(x, list):
|
255 |
+
return self.forward_features_list(x, masks)
|
256 |
+
|
257 |
+
x = self.prepare_tokens_with_masks(x, masks)
|
258 |
+
|
259 |
+
for blk in self.blocks:
|
260 |
+
x = blk(x)
|
261 |
+
|
262 |
+
x_norm = self.norm(x)
|
263 |
+
return {
|
264 |
+
"x_norm_clstoken": x_norm[:, 0],
|
265 |
+
"x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
|
266 |
+
"x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
|
267 |
+
"x_prenorm": x,
|
268 |
+
"masks": masks,
|
269 |
+
}
|
270 |
+
|
271 |
+
def _get_intermediate_layers_not_chunked(self, x, n=1):
|
272 |
+
x = self.prepare_tokens_with_masks(x)
|
273 |
+
# If n is an int, take the n last blocks. If it's a list, take them
|
274 |
+
output, total_block_len = [], len(self.blocks)
|
275 |
+
blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
|
276 |
+
for i, blk in enumerate(self.blocks):
|
277 |
+
x = blk(x)
|
278 |
+
if i in blocks_to_take:
|
279 |
+
output.append(x)
|
280 |
+
assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
281 |
+
return output
|
282 |
+
|
283 |
+
def _get_intermediate_layers_chunked(self, x, n=1):
|
284 |
+
x = self.prepare_tokens_with_masks(x)
|
285 |
+
output, i, total_block_len = [], 0, len(self.blocks[-1])
|
286 |
+
# If n is an int, take the n last blocks. If it's a list, take them
|
287 |
+
blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
|
288 |
+
for block_chunk in self.blocks:
|
289 |
+
for blk in block_chunk[i:]: # Passing the nn.Identity()
|
290 |
+
x = blk(x)
|
291 |
+
if i in blocks_to_take:
|
292 |
+
output.append(x)
|
293 |
+
i += 1
|
294 |
+
assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
295 |
+
return output
|
296 |
+
|
297 |
+
def get_intermediate_layers(
|
298 |
+
self,
|
299 |
+
x: torch.Tensor,
|
300 |
+
n: Union[int, Sequence] = 1, # Layers or n last layers to take
|
301 |
+
reshape: bool = False,
|
302 |
+
return_class_token: bool = False,
|
303 |
+
norm=True
|
304 |
+
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
|
305 |
+
if self.chunked_blocks:
|
306 |
+
outputs = self._get_intermediate_layers_chunked(x, n)
|
307 |
+
else:
|
308 |
+
outputs = self._get_intermediate_layers_not_chunked(x, n)
|
309 |
+
if norm:
|
310 |
+
outputs = [self.norm(out) for out in outputs]
|
311 |
+
class_tokens = [out[:, 0] for out in outputs]
|
312 |
+
outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
|
313 |
+
if reshape:
|
314 |
+
B, _, w, h = x.shape
|
315 |
+
outputs = [
|
316 |
+
out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
|
317 |
+
for out in outputs
|
318 |
+
]
|
319 |
+
if return_class_token:
|
320 |
+
return tuple(zip(outputs, class_tokens))
|
321 |
+
return tuple(outputs)
|
322 |
+
|
323 |
+
def forward(self, *args, is_training=False, **kwargs):
|
324 |
+
ret = self.forward_features(*args, **kwargs)
|
325 |
+
if is_training:
|
326 |
+
return ret
|
327 |
+
else:
|
328 |
+
return self.head(ret["x_norm_clstoken"])
|
329 |
+
|
330 |
+
|
331 |
+
def init_weights_vit_timm(module: nn.Module, name: str = ""):
|
332 |
+
"""ViT weight initialization, original timm impl (for reproducibility)"""
|
333 |
+
if isinstance(module, nn.Linear):
|
334 |
+
trunc_normal_(module.weight, std=0.02)
|
335 |
+
if module.bias is not None:
|
336 |
+
nn.init.zeros_(module.bias)
|
337 |
+
|
338 |
+
|
339 |
+
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
|
340 |
+
model = DinoVisionTransformer(
|
341 |
+
patch_size=patch_size,
|
342 |
+
embed_dim=384,
|
343 |
+
depth=12,
|
344 |
+
num_heads=6,
|
345 |
+
mlp_ratio=4,
|
346 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
347 |
+
num_register_tokens=num_register_tokens,
|
348 |
+
**kwargs,
|
349 |
+
)
|
350 |
+
return model
|
351 |
+
|
352 |
+
|
353 |
+
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
|
354 |
+
model = DinoVisionTransformer(
|
355 |
+
patch_size=patch_size,
|
356 |
+
embed_dim=768,
|
357 |
+
depth=12,
|
358 |
+
num_heads=12,
|
359 |
+
mlp_ratio=4,
|
360 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
361 |
+
num_register_tokens=num_register_tokens,
|
362 |
+
**kwargs,
|
363 |
+
)
|
364 |
+
return model
|
365 |
+
|
366 |
+
|
367 |
+
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
|
368 |
+
model = DinoVisionTransformer(
|
369 |
+
patch_size=patch_size,
|
370 |
+
embed_dim=1024,
|
371 |
+
depth=24,
|
372 |
+
num_heads=16,
|
373 |
+
mlp_ratio=4,
|
374 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
375 |
+
num_register_tokens=num_register_tokens,
|
376 |
+
**kwargs,
|
377 |
+
)
|
378 |
+
return model
|
379 |
+
|
380 |
+
|
381 |
+
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
|
382 |
+
"""
|
383 |
+
Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
|
384 |
+
"""
|
385 |
+
model = DinoVisionTransformer(
|
386 |
+
patch_size=patch_size,
|
387 |
+
embed_dim=1536,
|
388 |
+
depth=40,
|
389 |
+
num_heads=24,
|
390 |
+
mlp_ratio=4,
|
391 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
392 |
+
num_register_tokens=num_register_tokens,
|
393 |
+
**kwargs,
|
394 |
+
)
|
395 |
+
return model
|
396 |
+
|
397 |
+
|
398 |
+
def DINOv2(model_name):
|
399 |
+
model_zoo = {
|
400 |
+
"vits": vit_small,
|
401 |
+
"vitb": vit_base,
|
402 |
+
"vitl": vit_large,
|
403 |
+
"vitg": vit_giant2
|
404 |
+
}
|
405 |
+
|
406 |
+
return model_zoo[model_name](
|
407 |
+
img_size=518,
|
408 |
+
patch_size=14,
|
409 |
+
init_values=1.0,
|
410 |
+
ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
|
411 |
+
block_chunks=0,
|
412 |
+
num_register_tokens=0,
|
413 |
+
interpolate_antialias=False,
|
414 |
+
interpolate_offset=0.1
|
415 |
+
)
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py
ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from .mlp import Mlp
from .patch_embed import PatchEmbed
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
from .block import NestedTensorBlock
from .attention import MemEffAttention
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (450 Bytes). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (552 Bytes). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (463 Bytes). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (2.42 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-311.pyc
ADDED
Binary file (4.51 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-312.pyc
ADDED
Binary file (3.97 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc
ADDED
Binary file (8.02 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-311.pyc
ADDED
Binary file (15.5 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-312.pyc
ADDED
Binary file (13.1 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc
ADDED
Binary file (1.25 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-311.pyc
ADDED
Binary file (1.9 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-312.pyc
ADDED
Binary file (1.68 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc
ADDED
Binary file (1.05 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-311.pyc
ADDED
Binary file (1.66 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-312.pyc
ADDED
Binary file (1.44 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc
ADDED
Binary file (1.24 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-311.pyc
ADDED
Binary file (2.12 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-312.pyc
ADDED
Binary file (1.87 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc
ADDED
Binary file (2.69 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-311.pyc
ADDED
Binary file (4.49 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-312.pyc
ADDED
Binary file (4.08 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc
ADDED
Binary file (2.04 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-311.pyc
ADDED
Binary file (3.33 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-312.pyc
ADDED
Binary file (2.86 kB). View file
|
|
ddepth_anything_v2/depth_anything_v2/dinov2_layers/attention.py
ADDED
@@ -0,0 +1,83 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging

from torch import Tensor
from torch import nn


logger = logging.getLogger("dinov2")


try:
    from xformers.ops import memory_efficient_attention, unbind, fmha

    XFORMERS_AVAILABLE = True
except ImportError:
    logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
        attn = q @ k.transpose(-2, -1)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            assert attn_bias is None, "xFormers is required for nested tensors usage"
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
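Usage note (not part of the uploaded file): a minimal sketch, assuming the dinov2_layers package from this commit is importable from the repository root, showing the expected (batch, tokens, dim) input/output shape of Attention; MemEffAttention shares the interface, adds an optional attn_bias, and delegates to xformers.ops.memory_efficient_attention when xFormers is installed.

# Illustrative sketch only; the import path below assumes the repo root is on sys.path.
import torch

from ddepth_anything_v2.depth_anything_v2.dinov2_layers.attention import Attention, MemEffAttention

x = torch.randn(2, 197, 384)  # 2 images x 197 patch tokens x 384 channels

attn = Attention(dim=384, num_heads=6, qkv_bias=True)
y = attn(x)
assert y.shape == x.shape  # attention preserves the token layout

# MemEffAttention is a drop-in replacement: without xFormers it falls back to the
# dense implementation above (attn_bias must then stay None); with xFormers it
# runs the memory-efficient kernel, typically on GPU.
mem_attn = MemEffAttention(dim=384, num_heads=6, qkv_bias=True)
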
ddepth_anything_v2/depth_anything_v2/dinov2_layers/block.py
ADDED
@@ -0,0 +1,252 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

import logging
from typing import Callable, List, Any, Tuple, Dict

import torch
from torch import nn, Tensor

from .attention import Attention, MemEffAttention
from .drop_path import DropPath
from .layer_scale import LayerScale
from .mlp import Mlp


logger = logging.getLogger("dinov2")


try:
    from xformers.ops import fmha
    from xformers.ops import scaled_index_add, index_select_cat

    XFORMERS_AVAILABLE = True
except ImportError:
    logger.warning("xFormers not available")
    XFORMERS_AVAILABLE = False


class Block(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x


def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    # 1) extract subset using permutation
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    x_subset = x[brange]

    # 2) apply residual_func to get residual
    residual = residual_func(x_subset)

    x_flat = x.flatten(1)
    residual = residual.flatten(1)

    residual_scale_factor = b / sample_subset_size

    # 3) add the residual
    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    return x_plus_residual.view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlock(Block):
    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
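Usage note (not part of the uploaded file): a minimal sketch, assuming the sibling modules from this commit (attention.py, drop_path.py, layer_scale.py, mlp.py) are importable, of how Block is driven and what NestedTensorBlock adds on top of it.

# Illustrative sketch only; import paths assume the repo root is on sys.path.
import torch

from ddepth_anything_v2.depth_anything_v2.dinov2_layers.attention import MemEffAttention
from ddepth_anything_v2.depth_anything_v2.dinov2_layers.block import Block, NestedTensorBlock

# One pre-norm ViT block: attention and MLP branches added residually
# (LayerScale and DropPath collapse to identities with these defaults).
block = Block(dim=384, num_heads=6, mlp_ratio=4.0, qkv_bias=True)
x = torch.randn(2, 197, 384)  # (batch, tokens, dim)
y = block(x)
assert y.shape == x.shape

# NestedTensorBlock behaves like Block for a single Tensor; in addition it accepts a
# list of token tensors with different lengths when xFormers is installed:
# get_attn_bias_and_cat builds a BlockDiagonalMask and concatenates the sequences so
# one memory-efficient attention call covers all of them.
nested = NestedTensorBlock(dim=384, num_heads=6, attn_class=MemEffAttention)
# nested([torch.randn(1, 197, 384), torch.randn(1, 256, 384)])  # requires xFormers (typically GPU)
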
ddepth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py
ADDED
@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# References:
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py


from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
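Usage note (not part of the uploaded file): a small sketch of the stochastic-depth behaviour implemented here; DropPath is an identity in eval mode and, in training mode, zeroes whole samples while rescaling the survivors by 1 / (1 - drop_prob), keeping the expected output equal to the input.

# Illustrative sketch only; the import path assumes the repo root is on sys.path.
import torch

from ddepth_anything_v2.depth_anything_v2.dinov2_layers.drop_path import DropPath

dp = DropPath(drop_prob=0.2)
x = torch.ones(8, 4, 16)  # (batch, tokens, dim)

dp.eval()
assert torch.equal(dp(x), x)  # identity at inference time

dp.train()
y = dp(x)
# each sample is either zeroed entirely or rescaled by 1 / (1 - 0.2) = 1.25
per_sample = y[:, 0, 0].tolist()
assert all(v == 0.0 or abs(v - 1.25) < 1e-6 for v in per_sample)
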